commit f91881ffdc051ab49314b1bd12c4a07a862dc9c6
Author: Megvii Engine Team
Date:   Fri Feb 14 16:53:50 2020 +0800

    MegEngine: Initial commit of MegEngine.

    GitOrigin-RevId: f0c8338beb9cac953bd2d8b76710790940dc9300

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..da7d1ec4
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,5 @@
+# Mark generated files as binary, ignore them in git diff.
+# dnn
+dnn/src/cuda/conv_bias/int8/kimpl/* binary
+dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary
+dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary
diff --git a/.github/ISSUE_TEMPLATE/bug-issue.md b/.github/ISSUE_TEMPLATE/bug-issue.md
new file mode 100644
index 00000000..d2e10dd4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug-issue.md
@@ -0,0 +1,25 @@
+---
+name: Bug Issue
+about: Please use this template to report a problem you have encountered
+title: BUG Issue
+labels: ''
+assignees: ''
+
+---
+
+
+## Environment
+1. System environment:
+2. MegEngine version:
+3. Python version:
+
+## Steps to reproduce
+1.
+2.
+3.
+
+## Please provide the key code snippets needed to trace the problem
+
+
+
+## Please provide the complete logs and error messages
diff --git a/.github/ISSUE_TEMPLATE/documentation-issue.md b/.github/ISSUE_TEMPLATE/documentation-issue.md
new file mode 100644
index 00000000..d8306e53
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/documentation-issue.md
@@ -0,0 +1,16 @@
+---
+name: Documentation Issue
+about: Please use this template to report a problem found in the documentation
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+## Documentation link
+
+
+
+## Problem description
+
diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md
new file mode 100644
index 00000000..d7b2aa51
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -0,0 +1,16 @@
+---
+name: Feature Request
+about: Please use this template to submit your suggestion
+title: Feature Request
+labels: ''
+assignees: ''
+
+---
+
+
+## Background
+
+
+## Feature description
+
diff --git a/.github/ISSUE_TEMPLATE/others-issue.md b/.github/ISSUE_TEMPLATE/others-issue.md
new file mode 100644
index 00000000..ce556c20
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/others-issue.md
@@ -0,0 +1,10 @@
+---
+name: Others Issue
+about: If none of the above categories fits, please use this template to ask your question
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+## Please briefly describe your request
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..4ea31d47
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/build/
+__pycache__/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..68edae23
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,27 @@
+[submodule "third_party/Halide"]
+	path = third_party/Halide
+	url = https://github.com/halide/Halide.git
+[submodule "third_party/OpenBLAS"]
+	path = third_party/OpenBLAS
+	url = https://github.com/xianyi/OpenBLAS.git
+[submodule "third_party/cppzmq"]
+	path = third_party/cppzmq
+	url = https://github.com/zeromq/cppzmq.git
+[submodule "third_party/gtest"]
+	path = third_party/gtest
+	url = https://github.com/google/googletest.git
+[submodule "third_party/mkl-dnn"]
+	path = third_party/intel-mkl-dnn
+	url = https://github.com/intel/mkl-dnn.git
+[submodule "third_party/libzmq"]
+	path = third_party/libzmq
+	url = https://github.com/zeromq/libzmq.git
+[submodule "third_party/protobuf"]
+	path = third_party/protobuf
+	url = https://github.com/protocolbuffers/protobuf
+[submodule "third_party/MegRay"]
+	path = third_party/MegRay
+	url = https://github.com/MegEngine/MegRay.git
+[submodule "third_party/flatbuffers"]
+	path = third_party/flatbuffers
+	url = https://github.com/google/flatbuffers.git
diff --git a/ACKNOWLEDGMENTS b/ACKNOWLEDGMENTS
new file mode 100644
index 00000000..80cb440b
--- /dev/null
+++ b/ACKNOWLEDGMENTS
@@ -0,0 +1,2194 @@
+MegEngine is licensed under the Apache License Version 2.0, except
+for the third-party components listed below.
+
+*********************************************************************************************************************************
+Software Licensed under the MIT License:
+--------------------------------------------------------------------
+1. xxhashct
+Copyright (c) 2015 Daniel Kirchner
+
+2. cppzmq
+Copyright (c) 2016-2017 ZeroMQ community
+Copyright (c) 2009-2011 250bpm s.r.o.
+Copyright (c) 2011 Botond Ballo
+Copyright (c) 2007-2009 iMatix Corporation
+Copyright (c) 2016 VOCA AS / Harald Nøkland
+
+3. gdrcopy
+Copyright (c) 2014, NVIDIA CORPORATION
+
+4. stackoverflow-q2059482
+Copyright (c) 2018 Laurent LAPORTE
+
+5. ComputeLibrary
+Copyright (c) 2017-2020 ARM Software
+
+6. maskrcnn-benchmark
+Copyright (c) 2018 Facebook
+
+Terms of the MIT License:
+--------------------------------------------------------------------
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*********************************************************************************************************************************
+
+
+
+
+
+
+*********************************************************************************************************************************
+Software Licensed under the MIT License and Other Licenses of the Third-party Components therein:
+--------------------------------------------------------------------
+Halide
+Copyright (c) 2012-2018 MIT CSAIL, Google Inc., and other contributors
+
+Developed by:
+The Halide team
+http://halide-lang.org
+
+Terms of the MIT License and Other Licenses of the Third-party Components therein:
+--------------------------------------------------------------------
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-----
+apps/bgu is Copyright 2016 Google Inc. and is Licensed under the Apache License Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
+
+-----
+apps/support/cmdline.h is Copyright (c) 2009, Hideyuki Tanaka and is licensed under the BSD 3-Clause license.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*********************************************************************************************************************************
+
+
+
+
+
+
+*********************************************************************************************************************************
+Software Licensed under the Boost Software License, Version 1.0:
+--------------------------------------------------------------------
+Boost
+(C) Copyright John Maddock 2006.
+ + +Terms of Boost Software License, Version 1.0: +--------------------------------------------------- +Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the BSD 2-Clause License: +-------------------------------------------------------------------- +xxhash +Copyright (c) 2012-2016, Yann Collet + + +Terms of the BSD 2-Clause License: +-------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the BSD 3-Clause License: +-------------------------------------------------------------------- +1. cub +Copyright (c) 2010-2011, Duane Merrill. All rights reserved. +Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + +2. OpenCV +Copyright (C) 2000-2019, Intel Corporation, all rights reserved. +Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. +Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. +Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. +Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + +3. cutlass +Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + +4. NCCL +Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +The U.S. Department of Energy funded the development of this software + under subcontract 7078610 with Lawrence Berkeley National Laboratory + +5. gtest +Copyright 2008, Google Inc. All rights reserved. + +6. ucx +Copyright (c) 2014-2015 UT-Battelle, LLC. All rights reserved. +Copyright (c) 2014-2015 Mellanox Technologies Ltd. All rights reserved. +Copyright (c) 2014-2015 The University of Houston System. All rights reserved. +Copyright (c) 2015 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. +Copyright (c) 2016 ARM Ltd. All rights reserved. +Copyright (c) 2016 Los Alamos National Security, LLC. All rights reserved. +Copyright (c) 2016-2017 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2019 UChicago Argonne, LLC. All rights reserved. +Copyright (c) 2018-2019 NVIDIA CORPORATION. All rights reserved. + +7. torchvision +Copyright (c) Soumith Chintala 2016, + +Terms of the BSD 3-Clause License: +-------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-party Components therein: +-------------------------------------------------------------------- +protobuf +Copyright 2008 Google Inc. + + +Terms of the BSD 3-Clause License: +-------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------- +for third-party component benchmark licensed under the Apache License Version 2.0: + +Copyright 2015 Google Inc. All rights reserved. +Official list of benchmark authors for copyright purposes: +Albert Pretorius +Arne Beer +Christopher Seymour +David Coeurjolly +Dominic Hamon +Eric Fiselier +Eugene Zhuk +Evgeny Safronov +Felix Homann +Google Inc. +International Business Machines Corporation +Ismael Jimenez Martinez +Jern-Kuan Leong +Joao Paulo Magalhaes +JianXiong Zhou +Jussi Knuuttila +Kaito Udagawa +Lei Xu +Matt Clarkson +Maxim Vafin +Nick Hutchinson +Oleksandr Sochka +Paul Redmond +Radoslav Yovchev +Shuo Chen +Yixuan Qiu +Yusuke Suzuki +Dirac Research +Zbigniew Skowron +Dominik Czarnota + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. 
+ +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
+ +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and +You must cause any modified files to carry prominent notices stating that You changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. 
+ +This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +---------------- +for third-party component googletest licensed under BSD 3-Clause License: +Copyright 2008, Google Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-party Components therein: +-------------------------------------------------------------------- +OpenBLAS +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + + +Terms of the BSD 3-Clause License: +-------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------- +for third-party components in the folder OpenBLAS/reference licensed under the following license: + +This directory contains the reference implementation of BLAS +which is obtainable at: http://netlib.org/blas/ +The license, obtained from http://netlib.org/blas/faq.html#2 on November 3, +2010, is as follows: +2) Are there legal restrictions on the use of BLAS reference implementation software? +The reference BLAS is a freely-available software package. It is available from netlib via anonymous ftp and the World Wide Web. Thus, it can be included in commercial software packages (and has been). We only ask that proper credit be given to the authors. Like all software, it is copyrighted. It is not trademarked, but we do ask the following: If you modify the source for these routines we ask that you change the name of the routine and comment the changes made to the original. 
We will gladly answer any questions regarding the software. If a modification is done, however, it is the responsibility of the person who modified the routine to provide support. + +---------------- +for third-party components in the folder OpenBLAS/lapack-netlib/ licensed under the BSD 3-Clause License: + +Copyright (c) 1992-2016 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. +Copyright (c) 2000-2016 The University of California Berkeley. All rights reserved. +Copyright (c) 2006-2016 The University of Colorado Denver. All rights reserved. + +---------------- +for third-party components in the folder OpenBLAS/lapack-netlib/LAPACKE/ licensed under the BSD 3-Clause License: + +Copyright (c) 2012, Intel Corp. All rights reserved + +---------------- +for third-party components in the folder OpenBLAS/relapack licensed under the MIT License: +Copyright (c) 2016 Elmar Peise + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +---------------- +for third-party components based on GotoBLAS2 1.13 BSD version: +Copyright 2009, 2010 The University of Texas at Austin. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +The views and conclusions contained in the software and documentation are those of the authors and should not be interpreted as representing official policies, either expressed or implied, of The University of Texas at Austin. +********************************************************************************************************************************* + + + + + +********************************************************************************************************************************* +Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-party Components therein: +-------------------------------------------------------------------- +PyTorch + +From PyTorch: + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + + +From Caffe2: +Copyright (c) 2016-present, Facebook Inc. All rights reserved. + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. + +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All contributions from Caffe: +Copyright(c) 2013, 2014, 2015, the respective contributors +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. + +Caffe2 uses a copyright model similar to Caffe: each contributor holds copyright over their contributions to Caffe2. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. + +All rights reserved. + +Terms of the BSD 3-Clause License: +-------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------- +Early development of Caffe2 in 2015 and early 2016 is licensed under the BSD license. The license is attached below: + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. + +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------- +Some parts of the caffe2 code is derived from the original Caffe code, which is +created by Yangqing Jia and is now a BSD-licensed open-source project. The Caffe +license is as follows: + +COPYRIGHT + +All contributions by the University of California: +Copyright (c) 2014, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, the respective contributors +All rights reserved. + +Caffe uses a shared copyright model: each contributor holds copyright over their contributions to Caffe. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------- +This repo contains Caffe2 code, which was previously licensed under Apache License Version 2.0: + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and +You must cause any modified files to carry prominent notices stating that You changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. +********************************************************************************************************************************* + + + + + + + + +********************************************************************************************************************************* +Software Licensed under the GNU LESSER GENERAL PUBLIC LICENSE Version 3 with Special Exception: +-------------------------------------------------------------------- +Libzmq +Copyright (c) 2007-2020 Contributors as noted in the AUTHORS file at https://github.com/zeromq/libzmq/blob/master/AUTHORS +The source code of this software can be obtained from: https://github.com/zeromq/libzmq/archive/master.zip + + +SPECIAL EXCEPTION GRANTED BY COPYRIGHT HOLDERS + +As a special exception, copyright holders give you permission to link this library with independent modules to produce an executable, +regardless of the license terms of these independent modules, and to copy and distribute the resulting executable under terms of your +choice, provided that you also meet, for each linked independent module, the terms and conditions of the license of that module. An +independent module is a module which is not derived from or based on this library. If you modify this library, you must extend this +exception to your version of the library. + +Note: this exception relieves you of any obligations under sections 4 and 5 of this license, and section 6 of the GNU General Public License. + + +Terms of the GNU LESSER GENERAL PUBLIC LICENSE Version 3 with Special Exception: +-------------------------------------------------------------------- +GNU LESSER GENERAL PUBLIC LICENSE +Version 3, 29 June 2007 + +Copyright (C) 2007 Free Software Foundation, Inc. + +Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. + +This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. + +0. Additional Definitions. + +As used herein, “this License” refers to version 3 of the GNU Lesser General Public License, and the “GNU GPL” refers to version 3 of the GNU General Public License. + +“The Library” refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. 
+ +An “Application” is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. + +A “Combined Work” is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the “Linked Version”. + +The “Minimal Corresponding Source” for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. + +The “Corresponding Application Code” for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. + +1. Exception to Section 3 of the GNU GPL. + +You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. + +2. Conveying Modified Versions. + +If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: + +a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or +b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. + +3. Object Code Incorporating Material from Library Header Files. + +The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: + +a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. +b) Accompany the object code with a copy of the GNU GPL and this license document. + +4. Combined Works. + +You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: + +a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. +b) Accompany the Combined Work with a copy of the GNU GPL and this license document. +c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. 
+d) Do one of the following: +0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. +1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. +e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) + +5. Combined Libraries. + +You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: + +a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. +b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. + +6. Revised Versions of the GNU Lesser General Public License. + +The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. + +If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. 
+********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the Apache License Version 2.0: +-------------------------------------------------------------------- +1. MNN +Copyright (c) 2018, Alibaba Group Holding Limited + +This software has been modified by Megvii Inc. + +2. cuda-convnet2 +Copyright 2014 Google Inc. All rights reserved. + +This software has been modified by Megvii Inc. + +3. cython +Copyright The Cython compiler, http://cython.org +Author Robert Bradshaw, Stefan Behnel, Dag Seljebotn, Greg Ewing, et al. + +Cython, which derives from Pyrex, is licensed under the Apache 2.0 +Software License. More precisely, all modifications and new code +made to go from Pyrex to Cython are so licensed. +The original Pyrex code as of 2006-04 is licensed under the following +license: "Copyright stuff: Pyrex is free of restrictions. You may use, +redistribute, modify and distribute modified versions." +Greg Ewing, Computer Science Dept, University of Canterbury, Christchurch, New Zealand +A citizen of NewZealandCorp, a wholly-owned subsidiary of USA Inc. + +This software has been modified by Megvii Inc. + +4. FlatBuffers +Copyright 2014 Google Inc. All rights reserved. + + + +Terms of Apache License Version 2.0 +--------------------------------------------------- +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. 
+ +You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and +You must cause any modified files to carry prominent notices stating that You changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
+ +In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS +********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the Apache License Version 2.0 and Other Licenses of the Third-party Components therein: +-------------------------------------------------------------------- +Deep Neural Network Library (DNNL) +Copyright 2019 Intel Corporation. All rights reserved. + + +Terms of Apache License Version 2.0 +--------------------------------------------------- +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 
+ +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. 
+ +You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and +You must cause any modified files to carry prominent notices stating that You changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
+ +In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +---------------- +for third-party components in the folder /src/cpu/xbyak licensed under the following license: + +Copyright (c) 2007 MITSUNARI Shigeo +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +ソースコード形式かバイナリ形式か、変更するかしないかを問わず、以下の条件を満たす場合に限り、再頒布および使用が許可されます。 + +ソースコードを再頒布する場合、上記の著作権表示、本条件一覧、および下記免責条項を含めること。 +バイナリ形式で再頒布する場合、頒布物に付属のドキュメント等の資料に、上記の著作権表示、本条件一覧、および下記免責条項を含めること。 +書面による特別の許可なしに、本ソフトウェアから派生した製品の宣伝または販売促進に、著作権者の名前またはコントリビューターの名前を使用してはならない。 + +本ソフトウェアは、著作権者およびコントリビューターによって「現状のまま」提供されており、明示黙示を問わず、商業的な使用可能性、および特定の目的に対する適合性に関する暗黙の保証も含め、またそれに限定されない、いかなる保証もありません。 +著作権者もコントリビューターも、事由のいかんを問わず、 損害発生の原因いかんを問わず、かつ責任の根拠が契約であるか厳格責任であるか(過失その他の)不法行為であるかを問わず、仮にそのような損害が発生する可能性を知らされていたとしても、本ソフトウェアの使用によって発生した(代替品または代用サービスの調達、使用の喪失、データの喪失、利益の喪失、業務の中断も含め、またそれに限定されない)直接損害、間接損害、偶発的な損害、特別損害、懲罰的損害、または結果損害について、一切責任を負わないものとします。 + +---------------- +for third-party components in the folder /src/cpu/jit_utils/jitprofiling licensed under the following license: + +Copyright (c) 2011, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------- +for third-party components in the folder /cmake licensed under the following license: + +CMake - Cross Platform Makefile Generator +Copyright 2000-2019 Kitware, Inc. and Contributors +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The following individuals and institutions are among the Contributors: + +* Aaron C. Meadows +* Adriaan de Groot +* Aleksey Avdeev +* Alexander Neundorf +* Alexander Smorkalov +* Alexey Sokolov +* Alex Merry +* Alex Turbov +* Andreas Pakulat +* Andreas Schneider +* André Rigland Brodtkorb +* Axel Huebl, Helmholtz-Zentrum Dresden - Rossendorf +* Benjamin Eikel +* Bjoern Ricks +* Brad Hards +* Christopher Harvey +* Christoph Grüninger +* Clement Creusot +* Daniel Blezek +* Daniel Pfeifer +* Enrico Scholz +* Eran Ifrah +* Esben Mose Hansen, Ange Optimization ApS +* Geoffrey Viola +* Google Inc +* Gregor Jasny +* Helio Chissini de Castro +* Ilya Lavrenov +* Insight Software Consortium +* Jan Woetzel +* Julien Schueller +* Kelly Thompson +* Laurent Montel +* Konstantin Podsvirov +* Mario Bensi +* Martin Gräßlin +* Mathieu Malaterre +* Matthaeus G. Chajdas +* Matthias Kretz +* Matthias Maennich +* Michael Hirsch, Ph.D. +* Michael Stürmer +* Miguel A. Figueroa-Villanueva +* Mike Jackson +* Mike McQuaid +* Nicolas Bock +* Nicolas Despres +* Nikita Krupen'ko +* NVIDIA Corporation +* OpenGamma Ltd. +* Patrick Stotko +* Per Øyvind Karlsen +* Peter Collingbourne +* Petr Gotthard +* Philip Lowman +* Philippe Proulx +* Raffi Enficiaud, Max Planck Society +* Raumfeld +* Roger Leigh +* Rolf Eike Beer +* Roman Donchenko +* Roman Kharitonov +* Ruslan Baratov +* Sebastian Holtermann +* Stephen Kelly +* Sylvain Joubert +* Thomas Sondergaard +* Tobias Hunger +* Todd Gamblin +* Tristan Carel +* University of Dundee +* Vadim Zhukov +* Will Dicharry + +See version control history for details of individual contributions. + +The above copyright and license notice applies to distributions of CMake in source and binary form. Third-party software packages supplied with CMake under compatible licenses provide their own copyright notices documented in corresponding subdirectories or source files. + +CMake was initially developed by Kitware with the following sponsorship: + + * National Library of Medicine at the National Institutes of Health as part of the Insight Segmentation and Registration Toolkit (ITK). + * US National Labs (Los Alamos, Livermore, Sandia) ASC Parallel Visualization Initiative. + * National Alliance for Medical Image Computing (NAMIC) is funded by the National Institutes of Health through the NIH Roadmap for Medical Research, Grant U54 EB005149. + * Kitware, Inc. + +---------------- +for third-party components in the folder /doc/assets/mathjax licensed under the following license: + +MathJax.js +Copyright (c) 2009-2018 The MathJax Consortium + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. +********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the NVIDIA Software License Agreement and CUDA Supplement to Software License Agreement and Other Licenses of the Third-party Components therein: +-------------------------------------------------------------------- +CUDA +Copyright NVIDIA Corporation. All rights reserved. + + +Terms of NVIDIA Software License Agreement and CUDA Supplement to Software License Agreement +--------------------------------------------------- +Terms of License Agreement for NVIDIA Software Development Kits + +Release Date: May 21, 2019 +Important Notice—Read before downloading, installing, copying or using the licensed software: +This license agreement, including exhibits attached ("Agreement”) is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of a NVIDIA software development kit (“SDK”). + +Each SDK has its own set of software and materials, but here is a description of the types of items that may be included in a SDK: source code, header files, APIs, data sets and assets (examples include images, textures, models, scenes, videos, native API input/output files), binary software, sample code, libraries, utility programs, programming code and documentation. + +This Agreement can be accepted only by an adult of legal age of majority in the country in which the SDK is used. + +If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind the entity to this Agreement, in which case “you” will mean the entity you represent. + +If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not download, install or use the SDK. + +You agree to use the SDK only for purposes that are permitted by (a) this Agreement, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + +1.1. License +1.1.1. License Grant +Subject to the terms of this Agreement, NVIDIA hereby grants you a non-exclusive, non-transferable license, without the right to sublicense (except as expressly provided in this Agreement) to: + +Install and use the SDK, +Modify and create derivative works of sample source code delivered in the SDK, and +Distribute those portions of the SDK that are identified in this Agreement as distributable, as incorporated in object code format into a software application that meets the distribution requirements indicated in this Agreement. +1.1.2. Distribution Requirements +These are the distribution requirements for you to exercise the distribution grant: +Your application must have material additional functionality, beyond the included portions of the SDK. +The distributable portions of the SDK shall only be accessed by your application. +The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.” +Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only. 
+The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. Additionally, you agree that you will protect the privacy, security and legal rights of your application users. +You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SDK not in compliance with the requirements of this Agreement, and to enforce the terms of your agreements with respect to distributed SDK. +1.1.3. Authorized Users +You may allow employees and contractors of your entity or of your subsidiary(ies) to access and use the SDK from your secure network to perform work on your behalf. + +If you are an academic institution you may allow users enrolled or employed by the academic institution to access and use the SDK from your secure network. + +You are responsible for the compliance with the terms of this Agreement by your authorized users. If you become aware that your authorized users didn’t follow the terms of this Agreement, you agree to take reasonable steps to resolve the non-compliance and prevent new occurrences. + +1.1.4. Pre-Release SDK +The SDK versions identified as alpha, beta, preview or otherwise as pre-release, may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, accessibility, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. Use of a pre-release SDK may result in unexpected results, loss of data, project delays or other unpredictable damage or loss. + +You may use a pre-release SDK at your own risk, understanding that pre-release SDKs are not intended for use in production or business-critical systems. + +NVIDIA may choose not to make available a commercial version of any pre-release SDK. NVIDIA may also choose to abandon development and terminate the availability of a pre-release SDK at any time without liability. + +1.1.5. Updates +NVIDIA may, at its option, make available patches, workarounds or other updates to this SDK. Unless the updates are provided with their separate governing terms, they are deemed part of the SDK licensed to you as provided in this Agreement. You agree that the form and content of the SDK that NVIDIA provides may change without prior notice to you. While NVIDIA generally maintains compatibility between versions, NVIDIA may in some cases make changes that introduce incompatibilities in future versions of the SDK. + +1.1.6. Third Party Licenses +The SDK may come bundled with, or otherwise include or be distributed with, third party software licensed by a NVIDIA supplier and/or open source software provided under an open source license. Use of third party software is subject to the third-party license terms, or in the absence of third party terms, the terms of this Agreement. Copyright to third party software is held by the copyright holders indicated in the third-party software or license. + +1.1.7. Reservation of Rights +NVIDIA reserves all rights, title, and interest in and to the SDK, not expressly granted to you under this Agreement. + +1.2. Limitations +The following license limitations apply to your use of the SDK: +You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SDK or copies of the SDK. 
+Except as expressly provided in this Agreement, you may not copy, sell, rent, sublicense, transfer, distribute, modify, or create derivative works of any portion of the SDK. For clarity, you may not distribute or sublicense the SDK as a stand-alone product. +Unless you have an agreement with NVIDIA for this purpose, you may not indicate that an application created with the SDK is sponsored or endorsed by NVIDIA. +You may not bypass, disable, or circumvent any encryption, security, digital rights management or authentication mechanism in the SDK. +You may not use the SDK in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SDK be: +Disclosed or distributed in source code form; +Licensed for the purpose of making derivative works; or +Redistributable at no charge. +Unless you have an agreement with NVIDIA for this purpose, you may not use the SDK with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in nuclear, avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SDK for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. +You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to your use of the SDK outside of the scope of this Agreement, or not in compliance with its terms. + +1.3. Ownership +NVIDIA or its licensors hold all rights, title and interest in and to the SDK and its modifications and derivative works, including their respective intellectual property rights, subject to your rights described in this section. This SDK may include software and materials from NVIDIA’s licensors, and these licensors are intended third party beneficiaries that may enforce this Agreement with respect to their intellectual property rights. +You hold all rights, title and interest in and to your applications and your derivative works of the sample source code delivered in the SDK, including their respective intellectual property rights, subject to NVIDIA’s rights described in this section. +You may, but don’t have to, provide to NVIDIA suggestions, feature requests or other feedback regarding the SDK, including possible enhancements or modifications to the SDK. For any feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) it without the payment of any royalties or fees to you. NVIDIA will use feedback at its choice. NVIDIA is constantly looking for ways to improve its products, so you may send feedback to NVIDIA through the developer portal at https://developer.nvidia.com. + +1.4. 
No Warranties +THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF DEALING OR COURSE OF TRADE. + +1.5. Limitation of Liability +TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. + +These exclusions and limitations of liability shall apply regardless if NVIDIA or its affiliates have been advised of the possibility of such damages, and regardless of whether a remedy fails its essential purpose. These exclusions and limitations of liability form an essential basis of the bargain between the parties, and, absent any of these exclusions or limitations of liability, the provisions of this Agreement, including, without limitation, the economic terms, would be substantially different. + +1.6. Termination +This Agreement will continue to apply until terminated by either you or NVIDIA as described below. +If you want to terminate this Agreement, you may do so by stopping to use the SDK. +NVIDIA may, at any time, terminate this Agreement if: +(i) you fail to comply with any term of this Agreement and the non-compliance is not fixed within thirty (30) days following notice from NVIDIA (or immediately if you violate NVIDIA’s intellectual property rights); +(ii) you commence or participate in any legal proceeding against NVIDIA with respect to the SDK; or +(iii) NVIDIA decides to no longer provide the SDK in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. +Upon any termination of this Agreement, you agree to promptly discontinue use of the SDK and destroy all copies in your possession or control. Your prior distributions in accordance with this Agreement are not affected by the termination of this Agreement. Upon written request, you will certify in writing that you have complied with your commitments under this section. Upon any termination of this Agreement all provisions survive except for the license grant provisions. + +1.7. General +If you wish to assign this Agreement or your rights and obligations, including by merger, consolidation, dissolution or operation of law, contact NVIDIA to ask for permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. NVIDIA may assign, delegate or transfer this Agreement and its rights and obligations, and if to a non-affiliate you will be notified. 
+ +You agree to cooperate with NVIDIA and provide reasonably requested information to verify your compliance with this Agreement. + +This Agreement will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. + +The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this Agreement. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + +If any court of competent jurisdiction determines that any provision of this Agreement is illegal, invalid or unenforceable, such provision will be construed as limited to the extent necessary to be consistent with and fully enforceable under the law and the remaining provisions will remain in full force and effect. Unless otherwise specified, remedies are cumulative. + +Each party acknowledges and agrees that the other is an independent contractor in the performance of this Agreement. + +The SDK has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this Agreement pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (c)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + +The SDK is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SDK into any country, or use the SDK in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this Agreement, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SDK. + +Any notice delivered by NVIDIA to you under this Agreement will be delivered via mail, email or fax. You agree that any notices that NVIDIA sends you electronically will satisfy any legal communication requirements. Please direct your legal notices or other correspondence to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department. + +This Agreement and any exhibits incorporated into this Agreement constitute the entire agreement of the parties with respect to the subject matter of this Agreement and supersede all prior negotiations or documentation exchanged between the parties relating to this SDK license. Any additional and/or conflicting terms on documents issued by you are null, void, and invalid. 
Any amendment or waiver under this Agreement shall be in writing and signed by representatives of both parties. + +2. CUDA Toolkit Supplement to Software License Agreement for NVIDIA Software Development Kits +Release date: August 16, 2018 +The terms in this supplement govern your use of the NVIDIA CUDA Toolkit SDK under the terms of your license agreement (“Agreement”) as modified by this supplement. Capitalized terms used but not defined below have the meaning assigned to them in the Agreement. + +This supplement is an exhibit to the Agreement and is incorporated as an integral part of the Agreement. In the event of conflict between the terms in this supplement and the terms in the Agreement, the terms in this supplement govern. + +2.1. License Scope +The SDK is licensed for you to develop applications only for use in systems with NVIDIA GPUs. + +2.2. Distribution +The portions of the SDK that are distributable under the Agreement are listed in Attachment A. + +2.3. Operating Systems +Those portions of the SDK designed exclusively for use on the Linux or FreeBSD operating systems, or other operating systems derived from the source code to these operating systems, may be copied and redistributed for use in accordance with this Agreement, provided that the object code files are not modified in any way (except for unzipping of compressed files). + +2.4. Audio and Video Encoders and Decoders +You acknowledge and agree that it is your sole responsibility to obtain any additional third-party licenses required to make, have made, use, have used, sell, import, and offer for sale your products or services that include or incorporate any third-party software and content relating to audio and/or video encoders and decoders from, including but not limited to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A., MPEG-LA, and Coding Technologies. NVIDIA does not grant to you under this Agreement any necessary patent or other rights with respect to any audio and/or video encoders and decoders. + +2.5. Licensing +If the distribution terms in this Agreement are not suitable for your organization, or for any questions regarding this Agreement, please contact NVIDIA at nvidia-compute-license-questions@nvidia.com. + +2.6. Attachment A +The following CUDA Toolkit files may be distributed with Licensee Applications developed by you, including certain variations of these files that have version number or architecture specific information embedded in the file name - as an example only, for release version 6.0 of the 64-bit Windows software, the file cudart64_60.dll is redistributable. + +See attachment A at https://docs.nvidia.com/cuda/eula/index.html#attachment-a + +The NVIDIA CUDA Driver Libraries are only distributable in applications that meet this criteria: + +1. The application was developed starting from a NVIDIA CUDA container obtained from Docker Hub or the NVIDIA GPU Cloud, and +2. The resulting application is packaged as a Docker container and distributed to users on Docker Hub or the NVIDIA GPU Cloud only. +In addition to the rights above, for parties that are developing software intended solely for use on Jetson development kits or Jetson modules, and running Linux for Tegra software, the following shall apply: +The SDK may be distributed in its entirety, as provided by NVIDIA, and without separation of its components, for you and/or your licensees to create software development kits for use only on the Jetson platform and running Linux for Tegra software. 
+ +---------------- +Some of the cuBLAS library routines were written by or derived from code written by Vasily Volkov and are subject to the Modified Berkeley Software Distribution License as follows: +Copyright (c) 2007-2009, Regents of the University of California + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------- +Some of the cuBLAS library routines were written by or derived from code written by Davide Barbieri and are subject to the Modified Berkeley Software Distribution License as follows: +Copyright (c) 2008-2009 Davide Barbieri @ University of Rome Tor Vergata. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +---------------- +Some of the cuBLAS library routines were derived from code developed by the University of Tennessee and are subject to the Modified Berkeley Software Distribution License as follows: +Copyright (c) 2010 The University of Tennessee. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------- +Some of the cuBLAS library routines were written by or derived from code written by Jonathan Hogg and are subject to the Modified Berkeley Software Distribution License as follows: +Copyright (c) 2012, The Science and Technology Facilities Council (STFC). + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +---------------- +Some of the cuBLAS library routines were written by or derived from code written by Ahmad M. Abdelfattah, David Keyes, and Hatem Ltaief, and are subject to the license as follows: + +(C) Copyright 2013 King Abdullah University of Science and Technology +Authors: +Ahmad Abdelfattah (ahmad.ahmad@kaust.edu.sa) +David Keyes (david.keyes@kaust.edu.sa) +Hatem Ltaief (hatem.ltaief@kaust.edu.sa) + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the Software License Agreement (SLA) for NVIDIA cuDNN: +-------------------------------------------------------------------- +cuDNN +Copyright NVIDIA Corporation All rights reserved. + +Terms of Software License Agreement (SLA) for NVIDIA cuDNN +--------------------------------------------------- +LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS +This license agreement, including exhibits attached ("Agreement”) is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of a NVIDIA software development kit (“SDK”). + +Each SDK has its own set of software and materials, but here is a description of the types of items that may be included in a SDK: source code, header files, APIs, data sets and assets (examples include images, textures, models, scenes, videos, native API input/output files), binary software, sample code, libraries, utility programs, programming code and documentation. + +This Agreement can be accepted only by an adult of legal age of majority in the country in which the SDK is used. + +If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind the entity to this Agreement, in which case “you” will mean the entity you represent. 
+ +If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not download, install or use the SDK. + +You agree to use the SDK only for purposes that are permitted by (a) this Agreement, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + +1. License. +1.1. Grant +Subject to the terms of this Agreement, NVIDIA hereby grants you a non-exclusive, non-transferable license, without the right to sublicense (except as expressly provided in this Agreement) to: + +Install and use the SDK, +Modify and create derivative works of sample source code delivered in the SDK, and +Distribute those portions of the SDK that are identified in this Agreement as distributable, as incorporated in object code format into a software application that meets the distribution requirements indicated in this Agreement. + +1.2. Distribution Requirements +These are the distribution requirements for you to exercise the distribution grant: + +Your application must have material additional functionality, beyond the included portions of the SDK. +The distributable portions of the SDK shall only be accessed by your application. +The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.” +Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only. +The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. Additionally, you agree that you will protect the privacy, security and legal rights of your application users. +You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SDK not in compliance with the requirements of this Agreement, and to enforce the terms of your agreements with respect to distributed SDK. + +1.3. Authorized Users +You may allow employees and contractors of your entity or of your subsidiary(ies) to access and use the SDK from your secure network to perform work on your behalf. + +If you are an academic institution you may allow users enrolled or employed by the academic institution to access and use the SDK from your secure network. + +You are responsible for the compliance with the terms of this Agreement by your authorized users. If you become aware that your authorized users didn’t follow the terms of this Agreement, you agree to take reasonable steps to resolve the non-compliance and prevent new occurrences. + +1.4. Pre-Release SDK +The SDK versions identified as alpha, beta, preview or otherwise as pre-release, may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, accessibility, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. Use of a pre-release SDK may result in unexpected results, loss of data, project delays or other unpredictable damage or loss. + +You may use a pre-release SDK at your own risk, understanding that pre-release SDKs are not intended for use in production or business-critical systems. + +NVIDIA may choose not to make available a commercial version of any pre-release SDK. 
NVIDIA may also choose to abandon development and terminate the availability of a pre-release SDK at any time without liability. + +1.5. Updates +NVIDIA may, at its option, make available patches, workarounds or other updates to this SDK. Unless the updates are provided with their separate governing terms, they are deemed part of the SDK licensed to you as provided in this Agreement. + +You agree that the form and content of the SDK that NVIDIA provides may change without prior notice to you. While NVIDIA generally maintains compatibility between versions, NVIDIA may in some cases make changes that introduce incompatibilities in future versions of the SDK. + +1.6. Third Party Licenses +The SDK may come bundled with, or otherwise include or be distributed with, third party software licensed by a NVIDIA supplier and/or open source software provided under an open source license. Use of third party software is subject to the third-party license terms, or in the absence of third party terms, the terms of this Agreement. Copyright to third party software is held by the copyright holders indicated in the third-party software or license. + +1.7. Reservation of Rights +NVIDIA reserves all rights, title and interest in and to the SDK not expressly granted to you under this Agreement. + +2. Limitations. +The following license limitations apply to your use of the SDK: + +2.1 You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SDK or copies of the SDK. + +2.2 Except as expressly provided in this Agreement, you may not copy, sell, rent, sublicense, transfer, distribute, modify, or create derivative works of any portion of the SDK. + +2.3 Unless you have an agreement with NVIDIA for this purpose, you may not indicate that an application created with the SDK is sponsored or endorsed by NVIDIA. + +2.4 You may not bypass, disable, or circumvent any encryption, security, digital rights management or authentication mechanism in the SDK. + +2.5 You may not use the SDK in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SDK be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. + +2.6 Unless you have an agreement with NVIDIA for this purpose, you may not use the SDK with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SDK for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. + +2.7 You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to your use of the SDK outside of the scope of this Agreement, or not in compliance with its terms. + +3. Ownership. 
+
+3.1 NVIDIA or its licensors hold all rights, title and interest in and to the SDK and its modifications and derivative works, including their respective intellectual property rights, subject to your rights under Section 3.2. This SDK may include software and materials from NVIDIA’s licensors, and these licensors are intended third party beneficiaries that may enforce this Agreement with respect to their intellectual property rights.
+
+3.2 You hold all rights, title and interest in and to your applications and your derivative works of the sample source code delivered in the SDK, including their respective intellectual property rights, subject to NVIDIA’s rights under section 3.1.
+
+3.3 You may, but don’t have to, provide to NVIDIA suggestions, feature requests or other feedback regarding the SDK, including possible enhancements or modifications to the SDK. For any feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) it without the payment of any royalties or fees to you. NVIDIA will use feedback at its choice. NVIDIA is constantly looking for ways to improve its products, so you may send feedback to NVIDIA through the developer portal at https://developer.nvidia.com.
+
+4. No Warranties.
+THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF DEALING OR COURSE OF TRADE.
+
+5. Limitations of Liability.
+TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT.
+
+These exclusions and limitations of liability shall apply regardless if NVIDIA or its affiliates have been advised of the possibility of such damages, and regardless of whether a remedy fails its essential purpose. These exclusions and limitations of liability form an essential basis of the bargain between the parties, and, absent any of these exclusions or limitations of liability, the provisions of this Agreement, including, without limitation, the economic terms, would be substantially different.
+
+6. Termination.
+6.1 This Agreement will continue to apply until terminated by either you or NVIDIA as described below.
+
+6.2 If you want to terminate this Agreement, you may do so by stopping to use the SDK.
+ +6.3 NVIDIA may, at any time, terminate this Agreement if: (i) you fail to comply with any term of this Agreement and the non-compliance is not fixed within thirty (30) days following notice from NVIDIA (or immediately if you violate NVIDIA’s intellectual property rights); (ii) you commence or participate in any legal proceeding against NVIDIA with respect to the SDK; or (iii) NVIDIA decides to no longer provide the SDK in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. + +6.4 Upon any termination of this Agreement, you agree to promptly discontinue use of the SDK and destroy all copies in your possession or control. Your prior distributions in accordance with this Agreement are not affected by the termination of this Agreement. Upon written request, you will certify in writing that you have complied with your commitments under this section. Upon any termination of this Agreement all provisions survive except for the licenses granted to you. + +7. General. +If you wish to assign this Agreement or your rights and obligations, including by merger, consolidation, dissolution or operation of law, contact NVIDIA to ask for permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. NVIDIA may assign, delegate or transfer this Agreement and its rights and obligations, and if to a non-affiliate you will be notified. + +You agree to cooperate with NVIDIA and provide reasonably requested information to verify your compliance with this Agreement. + +This Agreement will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. + +The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this Agreement. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + +If any court of competent jurisdiction determines that any provision of this Agreement is illegal, invalid or unenforceable, such provision will be construed as limited to the extent necessary to be consistent with and fully enforceable under the law and the remaining provisions will remain in full force and effect. Unless otherwise specified, remedies are cumulative. + +The SDK has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this Agreement pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + +The SDK is subject to United States export laws and regulations. 
You agree that you will not ship, transfer or export the SDK into any country, or use the SDK in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this Agreement, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SDK.
+
+Any notice delivered by NVIDIA to you under this Agreement will be delivered via mail, email or fax. You agree that any notices that NVIDIA sends you electronically will satisfy any legal communication requirements. Please direct your legal notices or other correspondence to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department.
+
+This Agreement and any exhibits incorporated into this Agreement constitute the entire agreement of the parties with respect to the subject matter of this Agreement and supersede all prior negotiations or documentation exchanged between the parties relating to this SDK license. Any additional and/or conflicting terms on documents issued by you are null, void, and invalid. Any amendment or waiver under this Agreement shall be in writing and signed by representatives of both parties.
+
+(v. January 28, 2020)
+
+cuDNN SUPPLEMENT TO SOFTWARE LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS
+The terms in this supplement govern your use of the NVIDIA cuDNN SDK under the terms of your license agreement (“Agreement”) as modified by this supplement. Capitalized terms used but not defined below have the meaning assigned to them in the Agreement.
+
+This supplement is an exhibit to the Agreement and is incorporated as an integral part of the Agreement. In the event of conflict between the terms in this supplement and the terms in the Agreement, the terms in this supplement govern.
+
+1. License Scope. The SDK is licensed for you to develop applications only for use in systems with NVIDIA GPUs.
+
+2. Distribution. The following portions of the SDK are distributable under the Agreement: the runtime files .so and .h, cudnn64_7.dll, and cudnn.lib.
+
+In addition to the rights above, for parties that are developing software intended solely for use on Jetson development kits or Jetson modules and running Linux for Tegra software the following shall apply: the SDK may be distributed in its entirety, as provided by NVIDIA and without separation of its components, for you and/or your licensees to create software development kits for use only on the Jetson platform and running Linux for Tegra software.
+
+3. Licensing. If the distribution terms in this Agreement are not suitable for your organization, or for any questions regarding this Agreement, please contact NVIDIA at nvidia-compute-license-questions@nvidia.com.
+
+(v. January 28, 2020)
+
+Notices
+Notice
+THE INFORMATION IN THIS GUIDE AND ALL OTHER INFORMATION CONTAINED IN NVIDIA DOCUMENTATION REFERENCED IN THIS GUIDE IS PROVIDED “AS IS.” NVIDIA MAKES NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO THE INFORMATION FOR THE PRODUCT, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
Notwithstanding any damages that customer might incur for any reason whatsoever, NVIDIA’s aggregate and cumulative liability towards customer for the product described in this guide shall be limited in accordance with the NVIDIA terms and conditions of sale for the product. + +THE NVIDIA PRODUCT DESCRIBED IN THIS GUIDE IS NOT FAULT TOLERANT AND IS NOT DESIGNED, MANUFACTURED OR INTENDED FOR USE IN CONNECTION WITH THE DESIGN, CONSTRUCTION, MAINTENANCE, AND/OR OPERATION OF ANY SYSTEM WHERE THE USE OR A FAILURE OF SUCH SYSTEM COULD RESULT IN A SITUATION THAT THREATENS THE SAFETY OF HUMAN LIFE OR SEVERE PHYSICAL HARM OR PROPERTY DAMAGE (INCLUDING, FOR EXAMPLE, USE IN CONNECTION WITH ANY NUCLEAR, AVIONICS, LIFE SUPPORT OR OTHER LIFE CRITICAL APPLICATION). NVIDIA EXPRESSLY DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY OF FITNESS FOR SUCH HIGH RISK USES. NVIDIA SHALL NOT BE LIABLE TO CUSTOMER OR ANY THIRD PARTY, IN WHOLE OR IN PART, FOR ANY CLAIMS OR DAMAGES ARISING FROM SUCH HIGH RISK USES. + +NVIDIA makes no representation or warranty that the product described in this guide will be suitable for any specified use without further testing or modification. Testing of all parameters of each product is not necessarily performed by NVIDIA. It is customer’s sole responsibility to ensure the product is suitable and fit for the application planned by customer and to do the necessary testing for the application in order to avoid a default of the application or the product. Weaknesses in customer’s product designs may affect the quality and reliability of the NVIDIA product and may result in additional or different conditions and/or requirements beyond those contained in this guide. NVIDIA does not accept any liability related to any default, damage, costs or problem which may be based on or attributable to: (i) the use of the NVIDIA product in any manner that is contrary to this guide, or (ii) customer product designs. + +Other than the right for customer to use the information in this guide with the product, no other license, either expressed or implied, is hereby granted by NVIDIA under this guide. Reproduction of information in this guide is permissible only if reproduction is approved by NVIDIA in writing, is reproduced without alteration, and is accompanied by all associated conditions, limitations, and notices. + +Trademarks +NVIDIA, the NVIDIA logo, and cuBLAS, CUDA, cuDNN, DALI, DIGITS, DGX, DGX-1, DGX-2, DGX Station, DLProf, Jetson, Kepler, Maxwell, NCCL, Nsight Compute, Nsight Systems, NvCaffe, PerfWorks, Pascal, SDK Manager, Tegra, TensorRT, TensorRT Inference Server, Tesla, TF-TRT, and Volta are trademarks and/or registered trademarks of NVIDIA Corporation in the United States and other countries. Other company and product names may be trademarks of the respective companies with which they are associated. + +Copyright +© 2020 NVIDIA Corporation. All rights reserved. +********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the License Agreement for Software License Agreement (SLA) for NVIDIA TensorRT and Other Licenses of the Third-party Components therein: +-------------------------------------------------------------------- +TensorRT +Copyright NVIDIA Corporation. All rights reserved. 
+
+
+Terms of License Agreement for Software License Agreement (SLA) for NVIDIA TensorRT
+----------------
+NVIDIA SOFTWARE LICENSE AGREEMENT
+Important: READ BEFORE DOWNLOADING, INSTALLING, COPYING OR USING THE LICENSED SOFTWARE
+This Software License Agreement ("SLA”), made and entered into as of the time and date of click through action (“Effective Date”), is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs the use of the NVIDIA computer software and the documentation made available for use with such NVIDIA software. By downloading, installing, copying, or otherwise using the NVIDIA software and/or documentation, you agree to be bound by the terms of this SLA. If you do not agree to the terms of this SLA, do not download, install, copy or use the NVIDIA software or documentation. IF YOU ARE ENTERING INTO THIS SLA ON BEHALF OF A COMPANY OR OTHER LEGAL ENTITY, YOU REPRESENT THAT YOU HAVE THE LEGAL AUTHORITY TO BIND THE ENTITY TO THIS SLA, IN WHICH CASE “YOU” WILL MEAN THE ENTITY YOU REPRESENT. IF YOU DON’T HAVE SUCH AUTHORITY, OR IF YOU DON’T ACCEPT ALL THE TERMS AND CONDITIONS OF THIS SLA, THEN NVIDIA DOES NOT AGREE TO LICENSE THE LICENSED SOFTWARE TO YOU, AND YOU MAY NOT DOWNLOAD, INSTALL, COPY OR USE IT.
+
+Preface
+This document is the Software License Agreement (SLA) for NVIDIA TensorRT. This document contains specific license terms and conditions for NVIDIA TensorRT. By accepting this agreement, you agree to comply with all the terms and conditions applicable to the specific product(s) included herein.
+
+If you are receiving TensorRT under the NVIDIA Prerelease License Agreement (also known as NPLA) or under the NVIDIA Software License Agreement (previously known as the NVIDIA Tegra Software License Agreement), your use of TensorRT is governed by such applicable terms and conditions. All other uses of TensorRT are governed by the terms and conditions of the below license agreement.
+
+NVIDIA SOFTWARE LICENSE AGREEMENT
+Important: READ BEFORE DOWNLOADING, INSTALLING, COPYING OR USING THE LICENSED SOFTWARE
+This Software License Agreement ("SLA”), made and entered into as of the time and date of click through action (“Effective Date”), is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs the use of the NVIDIA computer software and the documentation made available for use with such NVIDIA software. By downloading, installing, copying, or otherwise using the NVIDIA software and/or documentation, you agree to be bound by the terms of this SLA. If you do not agree to the terms of this SLA, do not download, install, copy or use the NVIDIA software or documentation. IF YOU ARE ENTERING INTO THIS SLA ON BEHALF OF A COMPANY OR OTHER LEGAL ENTITY, YOU REPRESENT THAT YOU HAVE THE LEGAL AUTHORITY TO BIND THE ENTITY TO THIS SLA, IN WHICH CASE “YOU” WILL MEAN THE ENTITY YOU REPRESENT. IF YOU DON’T HAVE SUCH AUTHORITY, OR IF YOU DON’T ACCEPT ALL THE TERMS AND CONDITIONS OF THIS SLA, THEN NVIDIA DOES NOT AGREE TO LICENSE THE LICENSED SOFTWARE TO YOU, AND YOU MAY NOT DOWNLOAD, INSTALL, COPY OR USE IT.
+
+1. LICENSE.
+1.1. License Grant
+Subject to the terms of the AGREEMENT, NVIDIA hereby grants you a non-exclusive, non-transferable license, without the right to sublicense (except as expressly set forth in a Supplement), during the applicable license term unless earlier terminated as provided below, to have Authorized Users install and use the Software, including modifications (if expressly permitted in a Supplement), in accordance with the Documentation.
You are only licensed to activate and use Licensed Software for which you have a valid license, even if during the download or installation you are presented with other product options. No Orders are binding on NVIDIA until accepted by NVIDIA. Your Orders are subject to the AGREEMENT.
+
+SLA Supplements: Certain Licensed Software licensed under this SLA may be subject to additional terms and conditions that will be presented to you in a Supplement for acceptance prior to the delivery of such Licensed Software under this SLA and the applicable Supplement. Licensed Software will only be delivered to you upon your acceptance of all applicable terms.
+
+1.2. Limited Purpose Licenses
+If your license is provided for one of the purposes indicated below, then notwithstanding contrary terms in License Grant or in a Supplement, such licenses are for internal use and do not include any right or license to sub-license and distribute the Licensed Software or its output in any way in any public release, however limited, and/or in any manner that provides third parties with use of or access to the Licensed Software or its functionality or output, including (but not limited to) external alpha or beta testing or development phases. Further:
+Evaluation License. You may use evaluation licenses solely for your internal evaluation of the Licensed Software for broader adoption within your Enterprise or in connection with a NVIDIA product purchase decision, and such licenses have an expiration date as indicated by NVIDIA in its sole discretion (or ninety days from the date of download if no other duration is indicated).
+Educational/Academic License. You may use educational/academic licenses solely for educational purposes and all users must be enrolled or employed by an academic institution. If you do not meet NVIDIA’s academic program requirements for educational institutions, you have no rights under this license.
+Test/Development License. You may use test/development licenses solely for your internal development, testing and/or debugging of your software applications or for interoperability testing with the Licensed Software, and such licenses have an expiration date as indicated by NVIDIA in its sole discretion (or one year from the date of download if no other duration is indicated). NVIDIA Confidential Information under the AGREEMENT includes output from Licensed Software developer tools identified as “Pro” versions, where the output reveals functionality or performance data pertinent to NVIDIA hardware or software products.
+
+1.3. Pre-Release Licenses
+With respect to alpha, beta, preview, and other pre-release Software and Documentation (“Pre-Release Licensed Software”) delivered to you under the AGREEMENT you acknowledge and agree that such Pre-Release Licensed Software (i) may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, accessibility, availability, and reliability standards relative to commercially provided NVIDIA software and documentation, and (ii) use of such Pre-Release Licensed Software may result in unexpected results, loss of data, project delays or other unpredictable damage or loss. THEREFORE, PRE-RELEASE LICENSED SOFTWARE IS NOT INTENDED FOR USE, AND SHOULD NOT BE USED, IN PRODUCTION OR BUSINESS-CRITICAL SYSTEMS.
NVIDIA has no obligation to make available a commercial version of any Pre-Release Licensed Software and NVIDIA has the right to abandon development of Pre-Release Licensed Software at any time without liability. + +1.4. Enterprise and Contractor Usage +You may allow your Enterprise employees and Contractors to access and use the Licensed Software pursuant to the terms of the AGREEMENT solely to perform work on your behalf, provided further that with respect to Contractors: (i) you obtain a written agreement from each Contractor which contains terms and obligations with respect to access to and use of Licensed Software no less protective of NVIDIA than those set forth in the AGREEMENT, and (ii) such Contractor’s access and use expressly excludes any sublicensing or distribution rights for the Licensed Software. You are responsible for the compliance with the terms and conditions of the AGREEMENT by your Enterprise and Contractors. Any act or omission that, if committed by you, would constitute a breach of the AGREEMENT shall be deemed to constitute a breach of the AGREEMENT if committed by your Enterprise or Contractors. + +1.5. Services +Except as expressly indicated in an Order, NVIDIA is under no obligation to provide support for the Licensed Software or to provide any patches, maintenance, updates or upgrades under the AGREEMENT. Unless patches, maintenance, updates or upgrades are provided with their separate governing terms and conditions, they constitute Licensed Software licensed to you under the AGREEMENT. + +2. LIMITATIONS. +2.1. License Restrictions +Except as expressly authorized in the AGREEMENT, you agree that you will not (nor authorize third parties to): (i) copy and use Software that was licensed to you for use in one or more NVIDIA hardware products in other unlicensed products (provided that copies solely for backup purposes are allowed); (ii) reverse engineer, decompile, disassemble (except to the extent applicable laws specifically require that such activities be permitted) or attempt to derive the source code, underlying ideas, algorithm or structure of Software provided to you in object code form; (iii) sell, transfer, assign, distribute, rent, loan, lease, sublicense or otherwise make available the Licensed Software or its functionality to third parties (a) as an application services provider or service bureau, (b) by operating hosted/virtual system environments, (c) by hosting, time sharing or providing any other type of services, or (d) otherwise by means of the internet; (iv) modify, translate or otherwise create any derivative works of any Licensed Software; (v) remove, alter, cover or obscure any proprietary notice that appears on or with the Licensed Software or any copies thereof; (vi) use the Licensed Software, or allow its use, transfer, transmission or export in violation of any applicable export control laws, rules or regulations; (vii) distribute, permit access to, or sublicense the Licensed Software as a stand-alone product; (viii) bypass, disable, circumvent or remove any form of copy protection, encryption, security or digital rights management or authentication mechanism used by NVIDIA in connection with the Licensed Software, or use the Licensed Software together with any authorization code, serial number, or other copy protection device not supplied by NVIDIA directly or through an authorized reseller; (ix) use the Licensed Software for the purpose of developing competing products or technologies or assisting a third party in such activities; (x) 
use the Licensed Software with any system or application where the use or failure of such system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss including, without limitation, use in connection with any nuclear, avionics, navigation, military, medical, life support or other life critical application (“Critical Applications”), unless the parties have entered into a Critical Applications agreement; (xi) distribute any modification or derivative work you make to the Licensed Software under or by reference to the same name as used by NVIDIA; or (xii) use the Licensed Software in any manner that would cause the Licensed Software to become subject to an Open Source License. Nothing in the AGREEMENT shall be construed to give you a right to use, or otherwise obtain access to, any source code from which the Software or any portion thereof is compiled or interpreted. You acknowledge that NVIDIA does not design, test, manufacture or certify the Licensed Software for use in the context of a Critical Application and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such use. You agree to defend, indemnify and hold harmless NVIDIA and its Affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to you and your Enterprise, and their respective employees, contractors, agents, distributors, resellers, end users, officers and directors use of Licensed Software outside of the scope of the AGREEMENT or any other breach of the terms of the AGREEMENT. + +2.2. Third Party License Obligations +You acknowledge and agree that the Licensed Software may include or incorporate third party technology (collectively “Third Party Components”), which is provided for use in or with the Software and not otherwise used separately. If the Licensed Software includes or incorporates Third Party Components, then the third-party pass-through terms and conditions (“Third Party Terms”) for the particular Third Party Component will be bundled with the Software or otherwise made available online as indicated by NVIDIA and will be incorporated by reference into the AGREEMENT. In the event of any conflict between the terms in the AGREEMENT and the Third Party Terms, the Third Party Terms shall govern. Copyright to Third Party Components are held by the copyright holders indicated in the copyright notices indicated in the Third Party Terms. + +Audio/Video Encoders and Decoders: You acknowledge and agree that it is your sole responsibility to obtain any additional third party licenses required to make, have made, use, have used, sell, import, and offer for sale your products or services that include or incorporate any Third Party Components and content relating to audio and/or video encoders and decoders from, including but not limited to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A., MPEG-LA, and Coding Technologies as NVIDIA does not grant to you under the AGREEMENT any necessary patent or other rights with respect to audio and/or video encoders and decoders. + +2.3. 
Limited Rights +Your rights in the Licensed Software are limited to those expressly granted under the AGREEMENT and no other licenses are granted whether by implication, estoppel or otherwise. NVIDIA reserves all rights, title and interest in and to the Licensed Software not expressly granted under the AGREEMENT. + +3. CONFIDENTIALITY +Neither party will use the other party’s Confidential Information, except as necessary for the performance of the AGREEMENT, nor will either party disclose such Confidential Information to any third party, except to personnel of NVIDIA and its Affiliates, you, your Enterprise, your Enterprise Contractors, and each party’s legal and financial advisors that have a need to know such Confidential Information for the performance of the AGREEMENT, provided that each such personnel, employee and Contractor is subject to a written agreement that includes confidentiality obligations consistent with those set forth herein. Each party will use all reasonable efforts to maintain the confidentiality of all of the other party’s Confidential Information in its possession or control, but in no event less than the efforts that it ordinarily uses with respect to its own Confidential Information of similar nature and importance. The foregoing obligations will not restrict either party from disclosing the other party’s Confidential Information or the terms and conditions of the AGREEMENT as required under applicable securities regulations or pursuant to the order or requirement of a court, administrative agency, or other governmental body, provided that the party required to make such disclosure (i) gives reasonable notice to the other party to enable it to contest such order or requirement prior to its disclosure (whether through protective orders or otherwise), (ii) uses reasonable effort to obtain confidential treatment or similar protection to the fullest extent possible to avoid such public disclosure, and (iii) discloses only the minimum amount of information necessary to comply with such requirements. + +4. OWNERSHIP +You are not obligated to disclose to NVIDIA any modifications that you, your Enterprise or your Contractors make to the Licensed Software as permitted under the AGREEMENT. As between the parties, all modifications are owned by NVIDIA and licensed to you under the AGREEMENT unless otherwise expressly provided in a Supplement. The Licensed Software and all modifications owned by NVIDIA, and the respective Intellectual Property Rights therein, are and will remain the sole and exclusive property of NVIDIA or its licensors, whether the Licensed Software is separate from or combined with any other products or materials. You shall not engage in any act or omission that would impair NVIDIA’s and/or its licensors’ Intellectual Property Rights in the Licensed Software or any other materials, information, processes or subject matter proprietary to NVIDIA. NVIDIA’s licensors are intended third party beneficiaries with the right to enforce provisions of the AGREEMENT with respect to their Confidential Information and/or Intellectual Property Rights. + +5. FEEDBACK +You have no obligation to provide Feedback to NVIDIA. However, NVIDIA and/or its Affiliates may use and include any Feedback that you provide to improve the Licensed Software or other NVIDIA products, technologies or materials. 
Accordingly, if you provide Feedback, you agree that NVIDIA and/or its Affiliates, at their option, may, and may permit their licensees, to make, have made, use, have used, reproduce, license, distribute and otherwise commercialize the Feedback in the Licensed Software or in other NVIDIA products, technologies or materials without the payment of any royalties or fees to you. All Feedback becomes the sole property of NVIDIA and may be used in any manner NVIDIA sees fit, and you hereby assign to NVIDIA all of your right, title and interest in and to any Feedback. NVIDIA has no obligation to respond to Feedback or to incorporate Feedback into the Licensed Software. + +6. NO WARRANTIES +THE LICENSED SOFTWARE AND ANY OTHER CONFIDENTIAL INFORMATION AND/OR SERVICES ARE PROVIDED BY NVIDIA “AS IS” AND “WITH ALL FAULTS,” AND NVIDIA EXPRESSLY DISCLAIMS ALL OTHER WARRANTIES OF ANY KIND OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF OPERABILITY, CONDITION, VALUE, ACCURACY OF DATA, OR QUALITY, AS WELL AS ANY WARRANTIES OF MERCHANTABILITY, SYSTEM INTEGRATION, WORKMANSHIP, SUITABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, OR THE ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO WARRANTY IS MADE BY NVIDIA ON THE BASIS OF TRADE USAGE, COURSE OF DEALING OR COURSE OF TRADE. NVIDIA DOES NOT WARRANT THAT THE LICENSED SOFTWARE OR ANY OTHER CONFIDENTIAL INFORMATION AND/OR SERVICES PROVIDED BY NVIDIA UNDER THE AGREEMENT WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. YOU ACKNOWLEDGE THAT NVIDIA’S OBLIGATIONS UNDER THE AGREEMENT ARE FOR THE BENEFIT OF YOU ONLY. Nothing in this warranty section affects any statutory rights of consumers or other recipients to the extent that they cannot be waived or limited by contract under applicable law. + +7. LIMITATION OF LIABILITY +TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA OR ITS LICENSORS SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THE AGREEMENT OR THE USE OR PERFORMANCE OF THE LICENSED SOFTWARE AND ANY OTHER CONFIDENTIAL INFORMATION AND/OR SERVICES PROVIDED BY NVIDIA UNDER THE AGREEMENT, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY. IN NO EVENT WILL NVIDIA’S TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THE AGREEMENT EXCEED THE NET AMOUNTS RECEIVED BY NVIDIA FOR YOUR USE OF THE PARTICULAR LICENSED SOFTWARE DURING THE TWELVE (12) MONTHS BEFORE THE LIABILITY AROSE (or up to US$10.00 if you acquired the Licensed Software for no charge). THE NATURE OF THE LIABILITY, THE NUMBER OF CLAIMS OR SUITS OR THE NUMBER OF PARTIES WITHIN YOUR ENTERPRISE THAT ACCEPTED THE TERMS OF THE AGREEMENT SHALL NOT ENLARGE OR EXTEND THIS LIMIT. THE FOREGOING LIMITATIONS SHALL APPLY REGARDLESS OF WHETHER NVIDIA OR ITS LICENSORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES AND REGARDLESS OF WHETHER ANY REMEDY FAILS ITS ESSENTIAL PURPOSE. 
The disclaimers, exclusions and limitations of liability set forth in the AGREEMENT form an essential basis of the bargain between the parties, and, absent any such disclaimers, exclusions or limitations of liability, the provisions of the AGREEMENT, including, without limitation, the economic terms, would be substantially different. + +8. TERM AND TERMINATION. +8.1. AGREEMENT, Licenses and Services +This SLA shall become effective upon the Effective Date, each Supplement upon their acceptance, and both this SLA and Supplements shall continue in effect until your last access or use of the Licensed Software and/or services hereunder, unless earlier terminated as provided in this “Term and Termination” section. Each Licensed Software license ends at the earlier of (a) the expiration of the applicable license term, or (b) termination of such license or the AGREEMENT. Each service ends at the earlier of (x) the expiration of the applicable service term, (y) termination of such service or the AGREEMENT, or (z) expiration or termination of the associated license and no credit or refund will be provided upon the expiration or termination of the associated license for any service fees paid. + +8.2. Termination and Effect of Expiration or Termination +NVIDIA may terminate the AGREEMENT in whole or in part: (i) if you breach any term of the AGREEMENT and fail to cure such breach within thirty (30) days following notice thereof from NVIDIA (or immediately if you violate NVIDIA’s Intellectual Property Rights); (ii) if you become the subject of a voluntary or involuntary petition in bankruptcy or any proceeding relating to insolvency, receivership, liquidation or composition for the benefit of creditors, if that petition or proceeding is not dismissed with prejudice within sixty (60) days after filing, or if you cease to do business; or (iii) if you commence or participate in any legal proceeding against NVIDIA, with respect to the Licensed Software that is the subject of the proceeding during the pendency of such legal proceeding. If you or your authorized NVIDIA reseller fail to pay license fees or service fees when due then NVIDIA may, in its sole discretion, suspend or terminate your license grants, services and any other rights provided under the AGREEMENT for the affected Licensed Software, in addition to any other remedies NVIDIA may have at law or equity. Upon any expiration or termination of the AGREEMENT, a license or a service provided hereunder, (a) any amounts owed to NVIDIA become immediately due and payable, (b) you must promptly discontinue use of the affected Licensed Software and/or service, and (c) you must promptly destroy or return to NVIDIA all copies of the affected Licensed Software and all portions thereof in your possession or control, and each party will promptly destroy or return to the other all of the other party’s Confidential Information within its possession or control. Upon written request, you will certify in writing that you have complied with your obligations under this section. Upon expiration or termination of the AGREEMENT all provisions survive except for the license grant provisions. + +9. CONSENT TO COLLECTION AND USE OF INFORMATION. +You hereby agree and acknowledge that the Software may access, collect non-personally identifiable information about your Enterprise computer systems in order to properly optimize such systems for use with the Software. 
To the extent that you use the Software, you hereby consent to all of the foregoing, and represent and warrant that you have the right to grant such consent. In addition, you agree that you are solely responsible for maintaining appropriate data backups and system restore points for your Enterprise systems, and that NVIDIA will have no responsibility for any damage or loss to such systems (including loss of data or access) arising from or relating to (a) any changes to the configuration, application settings, environment variables, registry, drivers, BIOS, or other attributes of the systems (or any part of such systems) initiated through the Software; or (b) installation of any Software or third party software patches initiated through the Software. In certain systems you may change your system update preferences by unchecking "Automatically check for updates" in the "Preferences" tab of the control panel for the Software. + +In connection with the receipt of the Licensed Software or services you may receive access to links to third party websites and services and the availability of those links does not imply any endorsement by NVIDIA. NVIDIA encourages you to review the privacy statements on those sites and services that you choose to visit so that you can understand how they may collect, use and share personal information of individuals. NVIDIA is not responsible or liable for: (i) the availability or accuracy of such links; or (ii) the products, services or information available on or through such links; or (iii) the privacy statements or practices of sites and services controlled by other companies or organizations. + +To the extent that you or members of your Enterprise provide to NVIDIA during registration or otherwise personal information, you acknowledge that such information will be collected, used and disclosed by NVIDIA in accordance with NVIDIA's privacy policy, available at URL http://www.nvidia.com/object/privacy_policy.html. + +10. GENERAL. +This SLA, any Supplements incorporated hereto, and Orders constitute the entire agreement of the parties with respect to the subject matter hereto and supersede all prior negotiations, conversations, or discussions between the parties relating to the subject matter hereto, oral or written, and all past dealings or industry custom. Any additional and/or conflicting terms and conditions on purchase order(s) or any other documents issued by you are null, void, and invalid. Any amendment or waiver under the AGREEMENT must be in writing and signed by representatives of both parties. + +The AGREEMENT and the rights and obligations thereunder may not be assigned by you, in whole or in part, including by merger, consolidation, dissolution, operation of law, or any other manner, without written consent of NVIDIA, and any purported assignment in violation of this provision shall be void and of no effect. NVIDIA may assign, delegate or transfer the AGREEMENT and its rights and obligations hereunder, and if to a non-Affiliate you will be notified. + +Each party acknowledges and agrees that the other is an independent contractor in the performance of the AGREEMENT, and each party is solely responsible for all of its employees, agents, contractors, and labor costs and expenses arising in connection therewith. The parties are not partners, joint ventures or otherwise affiliated, and neither has any authority to make any statements, representations or commitments of any kind to bind the other party without prior written consent. 
+ +Neither party will be responsible for any failure or delay in its performance under the AGREEMENT (except for any payment obligations) to the extent due to causes beyond its reasonable control for so long as such force majeure event continues in effect. + +The AGREEMENT will be governed by and construed under the laws of the State of Delaware and the United States without regard to the conflicts of law provisions thereof and without regard to the United Nations Convention on Contracts for the International Sale of Goods. The parties consent to the personal jurisdiction of the federal and state courts located in Santa Clara County, California. You acknowledge and agree that a breach of any of your promises or agreements contained in the AGREEMENT may result in irreparable and continuing injury to NVIDIA for which monetary damages may not be an adequate remedy and therefore NVIDIA is entitled to seek injunctive relief as well as such other and further relief as may be appropriate. If any court of competent jurisdiction determines that any provision of the AGREEMENT is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. Unless otherwise specified, remedies are cumulative. + +The Licensed Software has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions set forth in the AGREEMENT pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (c)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2701 San Tomas Expressway, Santa Clara, CA 95050. + +You acknowledge that the Licensed Software described under the AGREEMENT is subject to export control under the U.S. Export Administration Regulations (EAR) and economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC). Therefore, you may not export, reexport or transfer in-country the Licensed Software without first obtaining any license or other approval that may be required by BIS and/or OFAC. You are responsible for any violation of the U.S. or other applicable export control or economic sanctions laws, regulations and requirements related to the Licensed Software. By accepting this SLA, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the Licensed Software. + +Any notice delivered by NVIDIA to you under the AGREEMENT will be delivered via mail, email or fax. Please direct your legal notices or other correspondence to NVIDIA Corporation, 2701 San Tomas Expressway, Santa Clara, California 95050, United States of America, Attention: Legal Department. + +11. GLOSSARY OF TERMS +Certain capitalized terms, if not otherwise defined elsewhere in this SLA, shall have the meanings set forth below: +“Affiliate” +“Affiliate” means any legal entity that Owns, is Owned by, or is commonly Owned with a party. “Own” means having more than 50% ownership or the right to direct the management of the entity. +“AGREEMENT” +“AGREEMENT” means this SLA and all associated Supplements entered by the parties referencing this SLA. 
+“Authorized Users” +“Authorized Users” means your Enterprise individual employees and any of your Enterprise’s Contractors, subject to the terms of the “Enterprise and Contractors Usage” section. +“Confidential Information” +“Confidential Information” means the Licensed Software (unless made publicly available by NVIDIA without confidentiality obligations), and any NVIDIA business, marketing, pricing, research and development, know-how, technical, scientific, financial status, proposed new products or other information disclosed by NVIDIA to you which, at the time of disclosure, is designated in writing as confidential or proprietary (or like written designation), or orally identified as confidential or proprietary or is otherwise reasonably identifiable by parties exercising reasonable business judgment, as confidential. Confidential Information does not and will not include information that: (i) is or becomes generally known to the public through no fault of or breach of the AGREEMENT by the receiving party; (ii) is rightfully known by the receiving party at the time of disclosure without an obligation of confidentiality; (iii) is independently developed by the receiving party without use of the disclosing party’s Confidential Information; or (iv) is rightfully obtained by the receiving party from a third party without restriction on use or disclosure. +“Contractor” +“Contractor” means an individual who works primarily for your Enterprise on a contractor basis from your secure network. +“Documentation” +“Documentation” means the NVIDIA documentation made available for use with the Software, including (without limitation) user manuals, datasheets, operations instructions, installation guides, release notes and other materials provided to you under the AGREEMENT. +“Enterprise” +“Enterprise” means you or any company or legal entity for which you accepted the terms of this SLA, and their subsidiaries of which your company or legal entity owns more than fifty percent (50%) of the issued and outstanding equity. +“Feedback” +“Feedback” means any and all suggestions, feature requests, comments or other feedback regarding the Licensed Software, including possible enhancements or modifications thereto. +“Intellectual Property Rights” +“Intellectual Property Rights” means all patent, copyright, trademark, trade secret, trade dress, trade names, utility models, mask work, moral rights, rights of attribution or integrity service marks, master recording and music publishing rights, performance rights, author’s rights, database rights, registered design rights and any applications for the protection or registration of these rights, or other intellectual or industrial property rights or proprietary rights, howsoever arising and in whatever media, whether now known or hereafter devised, whether or not registered, (including all claims and causes of action for infringement, misappropriation or violation and all rights in any registrations and renewals), worldwide and whether existing now or in the future. +“Licensed Software” +“Licensed Software” means Software, Documentation and all modifications owned by NVIDIA.
+“Open Source License” +“Open Source License” includes, without limitation, a software license that requires as a condition of use, modification, and/or distribution of such software that the Software be (i) disclosed or distributed in source code form; (ii) be licensed for the purpose of making derivative works; or (iii) be redistributable at no charge. +“Order” +“Order” means a purchase order issued by you, a signed purchase agreement with you, or other ordering document issued by you to NVIDIA or a NVIDIA authorized reseller (including any on-line acceptance process) that references and incorporates the AGREEMENT and is accepted by NVIDIA. +“Software” +“Software” means the NVIDIA software programs licensed to you under the AGREEMENT including, without limitation, libraries, sample code, utility programs and programming code. +“Supplement” +“Supplement” means the additional terms and conditions beyond those stated in this SLA that apply to certain Licensed Software licensed hereunder. + +12. TensorRT SUPPLEMENT TO SOFTWARE LICENSE AGREEMENT +TensorRT SUPPLEMENT TO SOFTWARE LICENSE AGREEMENT +The terms set forth in this TensorRT Supplement (“Supplement”) govern your use of the NVIDIA GPU inference engine (the “TensorRT Licensed Software”) under the terms of your software license agreement (“SLA”) as modified by this Supplement. This Supplement is an exhibit to the SLA and is hereby incorporated as an integral part thereto. Capitalized terms used but not defined herein shall have the meaning assigned to them in the SLA. In the event of conflict between the terms in this Supplement and the terms in the SLA, this Supplement shall control. + +12.1. TensorRT DISTRIBUTION +Subject to the terms of the SLA and this Supplement, NVIDIA hereby grants you a non-exclusive, nontransferable license during the applicable license term unless earlier terminated pursuant to the SLA, to distribute the libnvinfer, libnvinfer_plugin, and libnvparsers libraries when delivered to you as part of the TensorRT Licensed Software in source code form or binary form (but not when provided to you as part of a hardware product), subject to the following: such distribution is solely in binary form to your licensees (“Customers”) only as a component of your own software products having additional material functionality beyond the TensorRT Licensed Software (each, a “Licensee Application"). Subject to the terms and conditions of the SLA and this Supplement, you may further authorize Customers to redistribute the libnvinfer, libnvinfer_plugin, and libnvparsers libraries as incorporated into a Licensee Application, solely in binary form, provided, however, that you shall require in your agreements with your Customers that their distributions be on terms at least as restrictive as those applicable for your use of such TensorRT Licensed Software within a Licensee Application. The expiration or termination of your licenses to the above described TensorRT Licensed Software under the SLA and this Supplement will not affect rights previously granted by you to recipients that were in compliance with the SLA and this Supplement. 
+ +In addition to the rights above, for parties that are developing software intended solely for use on Jetson development kits or Jetson modules and running Linux for Tegra software the following shall apply: TensorRT Licensed Software licensed hereunder may be distributed in its entirety, as provided by NVIDIA and without separation of its components, for you and/or your licensees to create software development kits for use only on the Jetson platform and running Linux for Tegra software. You shall require in your agreements with your licensees that their distributions be on terms at least as restrictive as those applicable for your distribution of TensorRT Licensed Software as described in this Section 1. + +12.2. LICENSE DURATION +Each TensorRT Licensed Software is licensed to you for an initial duration of one year starting from the date of delivery or download. The licenses granted will automatically renew for successive one year periods, provided that NVIDIA reserves the right to terminate licenses upon ninety (90) days written notice to you prior to the commencement of a renewal year in addition to the termination rights set forth in the SLA. + +12.3. EXPIRATION OR TERMINATION OF THIS SUPPLEMENT +Your failure to comply with the terms of this Supplement is ground for termination for breach by NVIDIA under the SLA. This Supplement will automatically expire or terminate upon the expiration or termination of your rights to TensorRT Licensed Software under the SLA or this Supplement. + +Notices +Notice +THE INFORMATION IN THIS GUIDE AND ALL OTHER INFORMATION CONTAINED IN NVIDIA DOCUMENTATION REFERENCED IN THIS GUIDE IS PROVIDED “AS IS.” NVIDIA MAKES NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO THE INFORMATION FOR THE PRODUCT, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. Notwithstanding any damages that customer might incur for any reason whatsoever, NVIDIA’s aggregate and cumulative liability towards customer for the product described in this guide shall be limited in accordance with the NVIDIA terms and conditions of sale for the product. + +THE NVIDIA PRODUCT DESCRIBED IN THIS GUIDE IS NOT FAULT TOLERANT AND IS NOT DESIGNED, MANUFACTURED OR INTENDED FOR USE IN CONNECTION WITH THE DESIGN, CONSTRUCTION, MAINTENANCE, AND/OR OPERATION OF ANY SYSTEM WHERE THE USE OR A FAILURE OF SUCH SYSTEM COULD RESULT IN A SITUATION THAT THREATENS THE SAFETY OF HUMAN LIFE OR SEVERE PHYSICAL HARM OR PROPERTY DAMAGE (INCLUDING, FOR EXAMPLE, USE IN CONNECTION WITH ANY NUCLEAR, AVIONICS, LIFE SUPPORT OR OTHER LIFE CRITICAL APPLICATION). NVIDIA EXPRESSLY DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY OF FITNESS FOR SUCH HIGH RISK USES.
NVIDIA SHALL NOT BE LIABLE TO CUSTOMER OR ANY THIRD PARTY, IN WHOLE OR IN PART, FOR ANY CLAIMS OR DAMAGES ARISING FROM SUCH HIGH RISK USES. + +NVIDIA makes no representation or warranty that the product described in this guide will be suitable for any specified use without further testing or modification. Testing of all parameters of each product is not necessarily performed by NVIDIA. It is customer’s sole responsibility to ensure the product is suitable and fit for the application planned by customer and to do the necessary testing for the application in order to avoid a default of the application or the product. Weaknesses in customer’s product designs may affect the quality and reliability of the NVIDIA product and may result in additional or different conditions and/or requirements beyond those contained in this guide. NVIDIA does not accept any liability related to any default, damage, costs or problem which may be based on or attributable to: (i) the use of the NVIDIA product in any manner that is contrary to this guide, or (ii) customer product designs. + +Other than the right for customer to use the information in this guide with the product, no other license, either expressed or implied, is hereby granted by NVIDIA under this guide. Reproduction of information in this guide is permissible only if reproduction is approved by NVIDIA in writing, is reproduced without alteration, and is accompanied by all associated conditions, limitations, and notices. + +Trademarks +NVIDIA, the NVIDIA logo, and cuBLAS, CUDA, cuDNN, DALI, DIGITS, DGX, DGX-1, DGX-2, DGX Station, DLProf, Jetson, Kepler, Maxwell, NCCL, Nsight Compute, Nsight Systems, NvCaffe, PerfWorks, Pascal, SDK Manager, Tegra, TensorRT, TensorRT Inference Server, Tesla, TF-TRT, and Volta are trademarks and/or registered trademarks of NVIDIA Corporation in the United States and other countries. Other company and product names may be trademarks of the respective companies with which they are associated. + +Copyright +© 2019 NVIDIA Corporation. All rights reserved. + +-------------------------------- +TensorRT uses elements from the following software, whose licenses are reproduced below + +Google Protobuf +--------------- +This license applies to all parts of Protocol Buffers except the following: + + - Atomicops support for generic gcc, located in + src/google/protobuf/stubs/atomicops_internals_generic_gcc.h. + This file is copyrighted by Red Hat Inc. + + - Atomicops support for AIX/POWER, located in + src/google/protobuf/stubs/atomicops_internals_power.h. + This file is copyrighted by Bloomberg Finance LP. + +Copyright 2014, Google Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Code generated by the Protocol Buffer compiler is owned by the owner of the input file used when generating it. This code is not standalone and requires a support library to be linked with it. This support library is itself covered by the above license. + +Google Flatbuffers +------------------ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and +You must cause any modified files to carry prominent notices stating that You changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + +BVLC caffe +---------- +COPYRIGHT + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. + +Caffe uses a shared copyright model: each contributor holds copyright over +their contributions to Caffe. The project versioning records all such +contribution and copyright details. If a contributor wants to further mark +their specific copyright on a particular contribution, they should indicate +their copyright solely in the commit message of the change when it is +committed. + +LICENSE + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CONTRIBUTION AGREEMENT + +By contributing to the BVLC/caffe repository through pull-request, comment, or otherwise, the contributor releases their content to the license and copyright terms herein. 
+ +half.h +------ +Copyright (c) 2012-2017 Christian Rau + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +jQuery.js +--------- +jQuery.js is generated automatically under doxygen. +In all cases TensorRT uses the functions under the MIT license. + +CRC +--- +TensorRT includes CRC routines from FreeBSD. + +# $FreeBSD: head/COPYRIGHT 260125 2013-12-31 12:18:10Z gjb $ +# @(#)COPYRIGHT 8.2 (Berkeley) 3/21/94 + +The compilation of software known as FreeBSD is distributed under the +following terms: + +Copyright (c) 1992-2014 The FreeBSD Project. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The 4.4BSD and 4.4BSD-Lite software is distributed under the following +terms: + +All of the documentation and software included in the 4.4BSD and 4.4BSD-Lite +Releases is copyrighted by The Regents of the University of California. + +Copyright 1979, 1980, 1983, 1986, 1988, 1989, 1991, 1992, 1993, 1994 +The Regents of the University of California. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software must display the following acknowledgement: +This product includes software developed by the University of California, Berkeley and its contributors. +4. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +The Institute of Electrical and Electronics Engineers and the American National Standards Committee X3, on Information Processing Systems have given us permission to reprint portions of their documentation. + +In the following statement, the phrase ``this text'' refers to portions of the system documentation. + +Portions of this text are reprinted and reproduced in electronic form in the second BSD Networking Software Release, from IEEE Std 1003.1-1988, IEEE Standard Portable Operating System Interface for Computer Environments (POSIX), copyright C 1988 by the Institute of Electrical and Electronics Engineers, Inc. In the event of any discrepancy between these versions and the original IEEE Standard, the original IEEE Standard is the referee document. + +In the following statement, the phrase ``This material'' refers to portions of the system documentation. + +This material is reproduced with permission from American National Standards Committee X3, on Information Processing Systems. Computer and Business Equipment Manufacturers Association (CBEMA), 311 First St., NW, Suite 500, Washington, DC 20001-2178. The developmental work of Programming Language C was completed by the X3J11 Technical Committee. + +The views and conclusions contained in the software and documentation are those of the authors and should not be interpreted as representing official policies, either expressed or implied, of the Regents of the University of California. + +NOTE: The copyright of UC Berkeley's Berkeley Software Distribution ("BSD") source has been updated. The copyright addendum may be found at ftp://ftp.cs.berkeley.edu/pub/4bsd/README.Impt.License.Change and is included below. + +July 22, 1999 + +To All Licensees, Distributors of Any Version of BSD: + +As you know, certain of the Berkeley Software Distribution ("BSD") source code files require that further distributions of products containing all or portions of the software, acknowledge within their advertising materials that such products contain software developed by UC Berkeley and its contributors. + +Specifically, the provision reads: + +" * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors." + +Effective immediately, licensees and distributors are no longer required to include the acknowledgement within advertising materials. Accordingly, the foregoing paragraph of those BSD Unix files containing it is hereby deleted in its entirety. + +William Hoskins +Director, Office of Technology Licensing +University of California, Berkeley + +getopt.c +-------- +$OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $ +$NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $ + +Copyright (c) 2002 Todd C. Miller + +Permission to use, copy, modify, and distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +Sponsored in part by the Defense Advanced Research Projects Agency (DARPA) and Air Force Research Laboratory, Air Force Materiel Command, USAF, under agreement number F39502-99-1-0512. + +Copyright (c) 2000 The NetBSD Foundation, Inc. +All rights reserved. + +This code is derived from software contributed to The NetBSD Foundation +by Dieter Baron and Thomas Klausner. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +ONNX Model Zoo +-------------- + +MIT License + +Copyright (c) ONNX Project Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +RESNET-50 Caffe models + +The MIT License (MIT) + +Copyright (c) 2016 Shaoqing Ren + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the Intel Simplified Software License and Other Licenses of the Third-party Components therein: +-------------------------------------------------------------------- +Intel(R) Math Kernel Library +Copyright NVIDIA Corporation. All rights reserved. + + +Terms of Intel Simplified Software License +--------------------------------------------------- +Use and Redistribution. You may use and redistribute the software (the “Software”), without modification, provided the following conditions are met: +* Redistributions must reproduce the above copyright notice and the following terms of use in the Software and in the documentation and/or other materials provided with the distribution. +* Neither the name of Intel nor the names of its suppliers may be used to endorse or promote products derived from this Software without specific prior written permission. +* No reverse engineering, decompilation, or disassembly of this Software is permitted. + +Limited patent license. 
Intel grants you a world-wide, royalty-free, non-exclusive license under patents it now or hereafter owns or controls to make, have made, use, import, offer to sell and sell (“Utilize”) this Software, but solely to the extent that any such patent is necessary to Utilize the Software alone. The patent license shall not apply to any combinations which include this software. No hardware per se is licensed hereunder. +Third party programs. The Software may contain Third Party Programs. “Third Party Programs” are third party software, open source software or other Intel software listed in the “third-party-programs.txt” or other similarly named text file that is included with the Software. Third Party Programs, even if included with the distribution of the Software, may be governed by separate license terms, including without limitation, third party license terms, open source software notices and terms, and/or other Intel software license terms. These separate license terms may govern your use of the Third Party Programs. +DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND ATTORNEYS’ FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE MATERIALS. +LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU AGREE TO INDEMNIFY AND HOLD INTEL HARMLESS AGAINST ANY CLAIMS AND EXPENSES RESULTING FROM YOUR USE OR UNAUTHORIZED USE OF THE SOFTWARE. +No support. Intel may make changes to the Software, at any time without notice, and is not obligated to support, update or provide training for the Software. +Termination. Intel may terminate your right to use the Software in the event of your breach of this Agreement and you fail to cure the breach within a reasonable period of time. +Feedback. Should you provide Intel with comments, modifications, corrections, enhancements or other input (“Feedback”) related to the Software Intel will be free to use, disclose, reproduce, license or otherwise distribute or exploit the Feedback in its sole discretion without any obligations or restrictions of any kind, including without limitation, intellectual property rights or licensing obligations. +Compliance with laws. You agree to comply with all relevant laws and regulations governing your use, transfer, import or export (or prohibition thereof) of the Software. +Governing law. 
All disputes will be governed by the laws of the United States of America and the State of Delaware without reference to conflict of law principles and subject to the exclusive jurisdiction of the state or federal courts sitting in the State of Delaware, and each party agrees that it submits to the personal jurisdiction and venue of those courts and waives any objections. The United Nations Convention on Contracts for the International Sale of Goods (1980) is specifically excluded and will not apply to the Software. +*Other names and brands may be claimed as the property of others. + + +---------------- +Please note that this product contains and uses libstdc++-v3 library which is distributed under version 2 of the GNU General Public License, with the "runtime exception,"; as follows (or see any header or implementation file): + +As a special exception, you may use this file as part of a free software library without restriction. Specifically, if other files instantiate templates or use macros or inline functions from this file, or you compile this file and link it with other files to produce an executable, this file does not by itself cause the resulting executable to be covered by the GNU General Public License. This exception does not however invalidate any other reasons why the executable file might be covered by the GNU General Public License. + +The source code for this library can be obtained at: http://software.intel.com/en-us/articles/libstdc-source-files + +---------------- +OpenSSL 1.0.2o 27 Mar 2018 + +Copyright (c) 1998-2015 The OpenSSL Project +Copyright (c) 1995-1998 Eric A. Young, Tim J. Hudson +All rights reserved. + +The OpenSSL toolkit stays under a double license, i.e. both the conditions of the OpenSSL License and the original SSLeay license apply to the toolkit. See below for the actual license texts. + +OpenSSL License + +* Copyright (c) 1998-2018 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + Original SSLeay License + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. 
If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ */ + +---------------- +BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 + +1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the Individual or Organization ("Licensee") accessing and otherwise using this software in source or binary form and its associated documentation ("the Software"). + +2. Subject to the terms and conditions of this BeOpen Python License Agreement, BeOpen hereby grants Licensee a non-exclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use the Software alone or in any derivative version, provided, however, that the BeOpen Python License is retained in the Software, alone or in any derivative version prepared by Licensee. + +3. BeOpen is making the Software available to Licensee on an "AS IS" basis. + BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. + +4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +5. This License Agreement will automatically terminate upon a material breach of its terms and conditions. + +6. This License Agreement shall be governed by and interpreted in all respects by the law of the State of California, excluding conflict of law provisions. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between BeOpen and Licensee. This License Agreement does not grant permission to use BeOpen trademarks or trade names in a trademark sense to endorse or promote products or services of Licensee, or any third party. As an exception, the "BeOpen Python" logos available at http://www.pythonlabs.com/logos.html may be used according to the permissions granted on that web page. + +7. 
By copying, installing or otherwise using the software, Licensee agrees to be bound by the terms and conditions of this License Agreement. + +---------------- +You can get Qt source code here: + +http://registrationcenter-download.intel.com/akdlm/irc_nas/13488/qt-src-5.6.2-windows.zip +http://registrationcenter-download.intel.com/akdlm/irc_nas/13488/qt-src-5.6.2-linux.tgz +http://registrationcenter-download.intel.com/akdlm/irc_nas/13488/qt-src-5.6.2-macosx.zip + + GNU LESSER GENERAL PUBLIC LICENSE +Version 3, 29 June 2007 + +Copyright (C) 2007 Free Software Foundation, Inc. + +Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. + +This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. + +0. Additional Definitions. + +As used herein, “this License” refers to version 3 of the GNU Lesser General Public License, and the “GNU GPL” refers to version 3 of the GNU General Public License. + +“The Library” refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. + +An “Application” is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. + +A “Combined Work” is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the “Linked Version”. + +The “Minimal Corresponding Source” for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. + +The “Corresponding Application Code” for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. + +1. Exception to Section 3 of the GNU GPL. + +You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. + +2. Conveying Modified Versions. + +If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: + +a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or +b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. + +3. Object Code Incorporating Material from Library Header Files. + +The object code form of an Application may incorporate material from a header file that is part of the Library. 
You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: + +a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. +b) Accompany the object code with a copy of the GNU GPL and this license document. + +4. Combined Works. + +You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: + +a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. +b) Accompany the Combined Work with a copy of the GNU GPL and this license document. +c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. +d) Do one of the following: +0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. +1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. +e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) + +5. Combined Libraries. + +You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: + +a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. +b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. + +6. Revised Versions of the GNU Lesser General Public License. 
+ +The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. + +If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. +********************************************************************************************************************************* + + + + + + +********************************************************************************************************************************* +Software Licensed under the BSD 2-Clause License: +-------------------------------------------------------------------- +ARM_NEON_2_x86_SSE + +created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com + +*** Copyright (C) 2012-2016 Intel Corporation. All rights reserved. + +IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. + +By downloading, copying, installing or using the software you agree to this license. +If you do not agree to this license, do not download, install, copy or use the software. + + License Agreement +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * The name of the copyright holders may not be used to endorse or promote products + derived from this software without specific prior written permission. + +This software is provided by the copyright holders and contributors "as is" and +any express or implied warranties, including, but not limited to, the implied +warranties of merchantability and fitness for a particular purpose are disclaimed. +In no event shall the Intel Corporation or contributors be liable for any direct, +indirect, incidental, special, exemplary, or consequential damages +(including, but not limited to, procurement of substitute goods or services; +loss of use, data, or profits; or business interruption) however caused +and on any theory of liability, whether in contract, strict liability, +or tort (including negligence or otherwise) arising in any way out of +the use of this software, even if advised of the possibility of such damage. 
+*********************************************************************************************************************************
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000..4504cd49
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,425 @@
+cmake_minimum_required(VERSION 3.9.0)
+project(MegEngine)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
+
+if(NOT MSVC)
+    set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> Dqc <TARGET> <LINK_FLAGS> <OBJECTS>")
+    set(CMAKE_CXX_ARCHIVE_APPEND "<CMAKE_AR> Dq <TARGET> <LINK_FLAGS> <OBJECTS>")
+    set(CMAKE_CXX_ARCHIVE_FINISH "<CMAKE_RANLIB> -D <TARGET>")
+endif()
+
+include(CheckCXXCompilerFlag)
+CHECK_CXX_COMPILER_FLAG(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS)
+
+set(MGE_ARCH AUTO CACHE STRING "Architecture on which MegEngine to be built.")
+set_property(CACHE MGE_ARCH PROPERTY STRINGS AUTO
+    x86_64 i386
+    naive fallback
+)
+
+
+if(${MGE_ARCH} STREQUAL "AUTO")
+    if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
+        set(MGE_ARCH "x86_64")
+    elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686")
+        set(MGE_ARCH "i386")
+    else()
+        message(FATAL_ERROR "Unknown machine architecture for MegEngine.")
+    endif()
+endif()
+
+CHECK_CXX_COMPILER_FLAG(-fuse-ld=gold CXX_SUPPORT_GOLD)
+if(CXX_SUPPORT_GOLD)
+    message("-- Using GNU gold linker.")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fuse-ld=gold")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold")
+endif()
+
+option(MGE_WITH_JIT "Build MegEngine with JIT." ON)
+option(MGE_WITH_HALIDE "Build MegEngine with Halide JIT" ON)
+option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF)
+option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON)
+option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
+option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
+option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
+option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
+
+if(MGE_WITH_CUDA)
+    include(CheckLanguage)
+    check_language(CUDA)
+    if(NOT CMAKE_CUDA_COMPILER)
+        message(FATAL_ERROR "CUDA compiler not found in PATH")
+    endif()
+    enable_language(CUDA)
+    set(CMAKE_CUDA_STANDARD 14)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+endif()
+
+if(NOT MGE_WITH_CUDA)
+    message("-- Disable JIT support, as CUDA is not enabled.")
+    set(MGE_WITH_JIT OFF)
+    set(MGE_WITH_HALIDE OFF)
+    message("-- Disable TensorRT support, as CUDA is not enabled.")
+    set(MGE_WITH_TRT OFF)
+endif()
+
+find_package(PythonInterp 3 REQUIRED)
+
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads)
+if(${CMAKE_THREAD_LIBS_INIT} STREQUAL "-pthread" AND MGE_WITH_CUDA)
+    set_property(TARGET Threads::Threads
+        PROPERTY INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-pthread>"
+        "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-pthread>")
+endif()
+if(CMAKE_THREAD_LIBS_INIT)
+    add_definitions(-DMGB_HAVE_THREAD=1)
+endif()
+
+
+set(MGE_BLAS MKL CACHE STRING "BLAS implementation used by MegEngine.")
+set_property(CACHE MGE_BLAS PROPERTY STRINGS MKL OpenBLAS)
+set(MGE_CUDA_GENCODE "" CACHE STRING "Overwrite -gencode specifications for CUDA")
+if(NOT CMAKE_CUDA_HOST_COMPILER)
+    set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+endif()
+
+option(MGE_ENABLE_RTTI "Build with RTTI" ON)
+option(MGE_ENABLE_LOGGING "Build with logging" ON)
+option(MGE_DEBUG_UTIL "Enable debug utility" ON)
+
+if(MGE_DEBUG_UTIL)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMGB_ENABLE_DEBUG_UTIL=1")
+else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMGB_ENABLE_DEBUG_UTIL=0")
+endif()
+
+if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
+    message(STATUS "Setting build type to 'RelWithDebInfo' as none was specified.")
+    set(CMAKE_BUILD_TYPE RelWithDebInfo)
+endif()
+
+if(NOT MGE_ENABLE_RTTI)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
+endif()
+
+option(MGE_ENABLE_EXCEPTIONS "Build with exceptions" ON)
+if(NOT MGE_ENABLE_EXCEPTIONS)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
+endif()
+
+# RTTI
+if(MGE_ENABLE_RTTI)
+    add_definitions(-DMEGDNN_ENABLE_MANGLING=0 -DMEGDNN_ENABLE_RTTI=1)
+else()
+    add_definitions(-DMEGDNN_ENABLE_MANGLING=1 -DMEGDNN_ENABLE_RTTI=0)
+endif()
+
+# Logging
+if(MGE_ENABLE_LOGGING)
+    add_definitions(-DMEGDNN_ENABLE_LOGGING=1 -DMGB_ENABLE_LOGGING=1 -DMGB_ENABLE_JSON=1)
+else()
+    add_definitions(-DMEGDNN_ENABLE_LOGGING=0 -DMGB_ENABLE_LOGGING=0 -DMGB_ENABLE_JSON=0)
+endif()
+
+# Exception
+if(MGE_ENABLE_EXCEPTIONS)
+    add_definitions(-DMEGDNN_ENABLE_EXCEPTIONS=1)
+else()
+    message(STATUS "Exceptions disabled; MegEngine would kill itself when it is supposed to throw an exception.")
+    add_definitions(-DMEGDNN_ENABLE_EXCEPTIONS=0)
+endif()
+
+if(MGE_WITH_JIT AND MGE_WITH_HALIDE)
+    set(HALIDE_SHARED_LIBRARY OFF CACHE BOOL "Build as a shared library")
+    include(cmake/Halide.cmake)
+    add_definitions(-DMGB_JIT_HALIDE=1)
+endif()
+
+option(MGE_WITH_TEST "Enable test for MegEngine." OFF)
+if(MGE_WITH_TEST)
+    include(cmake/gtest.cmake)
+endif()
+
+option(MGE_WITH_DISTRIBUTED "Build with distributed support" ON)
+
+if(NOT MGE_WITH_CUDA)
+    message("-- Disable distributed support, as CUDA is not enabled.")
+    set(MGE_WITH_DISTRIBUTED OFF)
+endif()
+
+option(MGE_INFERENCE_ONLY "Build inference only library." OFF)
+option(MGE_WITH_PYTHON_MODULE "Build MegEngine Python Module."
ON)
+if(MGE_INFERENCE_ONLY)
+    message("-- Disable distributed support for inference only build.")
+    set(MGE_WITH_DISTRIBUTED OFF)
+    message("-- Disable python module for inference only build.")
+    set(MGE_WITH_PYTHON_MODULE OFF)
+    message("-- Disable tests for inference only build.")
+    set(MGE_WITH_TEST OFF)
+endif()
+
+if(MGE_WITH_DISTRIBUTED)
+    include(cmake/protobuf.cmake)
+    include(cmake/zmq.cmake)
+endif()
+
+if(MGB_WITH_FLATBUFFERS)
+    include(cmake/flatbuffers.cmake)
+endif()
+
+if(MSVC)
+    add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1)
+else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
+    set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
+    set(CMAKE_CXX_FLAGS_RELEASE "-O2 -DNDEBUG")
+endif()
+
+if(MGE_WITH_CUDA)
+    include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES})
+        get_filename_component(_NAME ${path} NAME)
+        if(NOT ${_NAME} STREQUAL "stubs")
+            list(APPEND CUDA_LINK_DIRECTORIES ${path})
+        endif()
+    endforeach()
+    link_directories(${CUDA_LINK_DIRECTORIES})
+
+    set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3")
+    set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g")
+    set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os")
+    set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all")
+
+    if(NOT MGE_ENABLE_RTTI)
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti")
+    endif()
+    if(NOT MGE_ENABLE_EXCEPTIONS)
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions")
+    endif()
+
+    if(NOT MGE_CUDA_GENCODE)
+        if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386")
+            set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DMEGDNN_THREADS_512=0")
+            if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=compute_75")
+            elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "9.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "9.0.0")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=compute_70")
+            else()
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_35,code=sm_35")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
+                set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=compute_61")
+            endif()
+        else()
+            message(FATAL_ERROR "Unsupported CUDA host arch.")
+        endif()
+    else()
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DMEGDNN_THREADS_512=1")
+    endif()
+
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}")
+    include(cmake/cudnn.cmake)
+    if(MGE_WITH_TRT)
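+        # TensorRT support is optional: cmake/tensorrt.cmake is only included when MGE_WITH_TRT is ON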
+ include(cmake/tensorrt.cmake) + endif() + if(MGE_CUDA_USE_STATIC) + if(MGE_WITH_TRT) + list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) + else() + list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) + endif() + list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") + list(APPEND MGE_CUDA_LIBS cublasLt_static) + endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") + # mark all symbols from liblapack_static.a as weak to avoid + # duplicated definition with mkl + find_library( + LAPACK_STATIC_PATH lapack_static + HINTS ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) + if(NOT LAPACK_STATIC_PATH) + message(FATAL_ERROR "liblapack_static.a not found") + endif() + set(LAPACK_STATIC_COPY_PATH ${CMAKE_CURRENT_BINARY_DIR}/liblapack_static_copy.a) + + # add a target that run objcopy + add_custom_command( + OUTPUT ${LAPACK_STATIC_COPY_PATH} + COMMAND ${CMAKE_OBJCOPY} -w -W* ${LAPACK_STATIC_PATH} ${LAPACK_STATIC_COPY_PATH} + VERBATIM) + add_custom_target(lapack_static_weak_target DEPENDS ${LAPACK_STATIC_COPY_PATH}) + + # create a library named "lapack_static_weak" + add_library(lapack_static_weak STATIC IMPORTED GLOBAL) + add_dependencies(lapack_static_weak lapack_static_weak_target) + set_target_properties( + lapack_static_weak PROPERTIES + IMPORTED_LOCATION ${LAPACK_STATIC_COPY_PATH}) + list(APPEND MGE_CUDA_LIBS lapack_static_weak ${LAPACK_STATIC_COPY_PATH}) + endif() + else() + if(MGE_WITH_TRT) + list(APPEND MGE_CUDA_LIBS libnvinfer) + endif() + list(APPEND MGE_CUDA_LIBS libcudnn) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") + list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand) + endif() + endif() + + add_subdirectory(dnn/cuda-stub) + list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) + set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}") +endif() + +find_program(CCACHE_BIN ccache) +if(CCACHE_BIN) + set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_BIN}) + if(MGE_WITH_CUDA AND NOT ${CMAKE_VERSION} VERSION_LESS "3.10.0") + message("-- Using ccache as CMAKE_CUDA_COMPILER_LAUNCHER") + set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_BIN}) + endif() +endif() + +if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") + if(${MGE_BLAS} STREQUAL "MKL") + include(cmake/mkl.cmake) + set(MGE_BLAS_LIBS libmkl) + elseif(${MGE_BLAS} STREQUAL "OpenBLAS") + include(cmake/OpenBLAS.cmake) + set(MGE_BLAS_LIBS libopenblas) + else() + message(FATAL_ERROR "Unknown BLAS implementation ${MGE_BLAS}") + endif() +endif() + +option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support," ON) + +# MKLDNN build +if(MGE_WITH_MKLDNN AND ${MGE_ARCH} STREQUAL "x86_64") + add_definitions(-DMEGDNN_X86_WITH_MKL_DNN) + include(cmake/MKL_DNN.cmake) +endif() + + +add_subdirectory(dnn) + + +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DMGB_ASSERT_LOC=1") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DMGB_ASSERT_LOC=0") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DMGB_ASSERT_LOC=1") +set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} -DMGB_ASSERT_LOC=0") + +if(MGE_ENABLE_RTTI) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMGB_VERBOSE_TYPEINFO_NAME=1") +endif() + +if(MGE_ENABLE_EXCEPTIONS) + 
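+    # mirror the MGE_ENABLE_EXCEPTIONS option into the MGB_ENABLE_EXCEPTION macro used by the C++ sources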
add_definitions(-DMGB_ENABLE_EXCEPTION=1)
+else()
+    add_definitions(-DMGB_ENABLE_EXCEPTION=0)
+endif()
+
+list(APPEND MGB_OPR_PARAM_DEFS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py)
+set(MGB_OPR_PARAM_DEFS_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/dnn/scripts/gen_param_defs.py)
+
+set(MGB_OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/opr/include/)
+file(MAKE_DIRECTORY ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr)
+add_custom_command(
+    OUTPUT
+        ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
+    COMMAND ${PYTHON_EXECUTABLE} ${MGB_OPR_PARAM_DEFS_SCRIPT} ${MGB_OPR_PARAM_DEFS_SRCS}
+        ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
+    DEPENDS ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_SCRIPT}
+    VERBATIM
+)
+
+list(APPEND MGB_OPR_PARAM_DEFS_OUTS
+    ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
+)
+
+install(FILES ${MGB_OPR_PARAM_DEFS_OUTS} DESTINATION include/megbrain/opr/)
+
+list(APPEND MGB_OPR_PARAM_DEFS_INC ${MGB_OPR_PARAM_DEFS_OUT_DIR})
+add_custom_target(_mgb_opr_param_defs DEPENDS ${MGB_OPR_PARAM_DEFS_OUTS})
+add_library(mgb_opr_param_defs INTERFACE)
+target_include_directories(mgb_opr_param_defs INTERFACE ${MGB_OPR_PARAM_DEFS_INC})
+add_dependencies(mgb_opr_param_defs _mgb_opr_param_defs)
+
+if(MGE_WITH_DISTRIBUTED)
+    add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/MegRay)
+endif()
+
+add_subdirectory(src)
+add_subdirectory(sdk/load-and-run)
+
+if(MGE_WITH_PYTHON_MODULE)
+    add_subdirectory(python_module)
+endif()
+
+if(MGE_WITH_TEST AND MGE_ENABLE_RTTI)
+    add_subdirectory(test)
+endif()
+
+if(TARGET _mgb)
+    add_custom_target(
+        develop
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/$<TARGET_FILE_NAME:_mgb>
+            ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/$<TARGET_FILE_NAME:_mgb>
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/mgb.py
+            ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/mgb.py
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/opr.py
+            ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/opr.py
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/opr_param_defs.py
+            ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/opr_param_defs.py
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            ${CMAKE_CURRENT_BINARY_DIR}/python_module/megengine/_internal/include
+            ${CMAKE_CURRENT_SOURCE_DIR}/python_module/megengine/_internal/include
+        DEPENDS _mgb
+        VERBATIM
+    )
+endif()
+
+set(MGB_CUDA ${MGE_WITH_CUDA})
+if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" OR ${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo")
+    set(MGB_ASSERT_LOC 1)
+else()
+    set(MGB_ASSERT_LOC 0)
+endif()
+set(MGB_ENABLE_DEBUG_UTIL ${MGE_DEBUG_UTIL})
+set(MGB_ENABLE_LOGGING ${MGE_ENABLE_LOGGING})
+set(MGB_VERBOSE_TYPEINFO_NAME ${MGE_ENABLE_RTTI})
+set(MGB_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS})
+set(MGB_JIT ${MGE_WITH_JIT})
+set(MGB_JIT_HALIDE ${MGE_WITH_HALIDE})
+set(MGB_ENABLE_TENSOR_RT ${MGE_WITH_TRT})
+set(MGB_ENABLE_JSON ${MGE_ENABLE_LOGGING})
+set(MGB_ENABLE_GRAD NOT ${MGE_INFERENCE_ONLY})
+set(MGB_BUILD_SLIM_SERVING ${MGE_INFERENCE_ONLY})
+configure_file(src/core/include/megbrain_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h)
+file(READ src/core/include/megbrain_build_config.h _CONTENT)
+file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h ${_CONTENT})
+install(FILES
${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h DESTINATION include) + diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..05baf556 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,47 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our community include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others’ private information, such as a physical or email address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +All MegEngine forums and spaces are meant for professional interactions, and any behavior which could reasonably be considered inappropriate in a professional setting is unacceptable. + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at megengine@megvii.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
+
+## Attribution
+
+This Code of Conduct is updated from the Contributor Covenant, version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+
diff --git a/CONTRIBUTOR_LICENSE_AGREEMENT.md b/CONTRIBUTOR_LICENSE_AGREEMENT.md
new file mode 100644
index 00000000..d0d1e352
--- /dev/null
+++ b/CONTRIBUTOR_LICENSE_AGREEMENT.md
@@ -0,0 +1,29 @@
+# MegEngine Contributor License Agreement
+
+In order to clarify the intellectual property license granted with Contributions from any person or entity, the open source project MegEngine ("MegEngine") must have a Contributor License Agreement (CLA) on file that has been signed by each Contributor, indicating agreement to the license terms below. This license is for your protection as a Contributor as well as the protection of MegEngine and its users; it does not change your rights to use your own Contributions for any other purpose.
+
+This Agreement allows an individual or an entity to submit Contributions to MegEngine, to authorize Contributions submitted by its designated employees to MegEngine, and to grant copyright and patent licenses thereto.
+
+You accept and agree to the following terms and conditions for Your present and future Contributions submitted to MegEngine. Except for the license granted herein to MegEngine and recipients of software distributed by MegEngine, You reserve all right, title, and interest in and to Your Contributions.
+
+1. **Definitions**. "You" (or "Your") shall mean the copyright owner or legal entity authorized by the copyright owner that is making this Agreement with MegEngine. For legal entities, the entity making a Contribution and all other entities that control, are controlled by, or are under common control with that entity are considered to be a single Contributor.
+For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
+"Contribution" shall mean the code, documentation or any original work of authorship, including any modifications or additions to an existing work, that is intentionally submitted by You to MegEngine for inclusion in, or documentation of, any of the products owned or managed by MegEngine (the "Work").
+For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to MegEngine or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, MegEngine for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by You as "Not a Contribution."
+
+2. **Grant of Copyright License**. Subject to the terms and conditions of this Agreement, You hereby grant to MegEngine and to recipients of software distributed by MegEngine a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, sublicense, and distribute Your Contributions and such derivative works.
+
+3. **Grant of Patent License**.
Subject to the terms and conditions of this Agreement, You hereby grant to MegEngine and to recipients of software distributed by MegEngine a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by You that are necessarily infringed by Your Contribution(s) alone or by combination of Your Contribution(s) with the Work to which such Contribution(s) was submitted. If any entity institutes patent litigation against You or any other entity (including a crossclaim or counterclaim in a lawsuit) alleging that Your Contribution, or the Work to which You have contributed, constitutes direct or contributory patent infringement, then any patent licenses granted to that entity under this Agreement for that Contribution or Work shall terminate as of the date such litigation is filed. + +4. You represent that You are legally entitled to grant the above license. If You are an entity, You represent further that each of Your employee designated by You is authorized to submit Contributions on behalf of You. If You are an individual and Your employer(s) has rights to intellectual property that You create that includes Your Contributions, You represent further that You have received permission to make Contributions on behalf of that employer, that Your employer has waived such rights for Your Contributions to MegEngine, or that Your employer has executed a separate CLA with MegEngine. + +5. If you do post content or submit material on MegEngine and unless we indicate otherwise, you grant MegEngine a nonexclusive, royalty-free, perpetual, irrevocable, and fully sublicensable right to use, reproduce, modify, adapt, publish, perform, translate, create derivative works from, distribute, and display such content throughout the world in any media. You grant MegEngine and sublicensees the right to use your GitHub Public Profile, including but not limited to name, that you submit in connection with such content. You represent and warrant that you own or otherwise control all of the rights to the content that you post; that the content is accurate; that use of the content you supply does not violate this policy and will not cause injury to any person or entity; and that you will indemnify MegEngine for all claims resulting from content you supply. MegEngine has the right but not the obligation to monitor and edit or remove any activity or content. MegEngine takes no responsibility and assumes no liability for any content posted by you or any third party. + +6. You represent that each of Your Contributions is Your original creation. Should You wish to submit work that is not Your original creation, You may submit it to MegEngine separately from any Contribution, identifying the complete details of its source and of any license or other restriction (including, but not limited to, related patents, trademarks, and license agreements) of which You are personally aware, and conspicuously marking the work as "Submitted on behalf of a third party: [named here]". + +7. You are not expected to provide support for Your Contributions, except to the extent You desire to provide support. You may provide support for free, for a fee, or not at all. 
Unless required by applicable law or agreed to in writing, You provide Your Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.
+
+8. You agree to notify MegEngine of any facts or circumstances of which You become aware that would make these representations inaccurate in any respect.
+
+9. The effective date of this Contributor License Agreement is 2020/3/23. MegEngine reserves the right to update or change this Agreement at any time, by posting the most current version of the Agreement on MegEngine, with a new effective date. All such changes in the Agreement are effective from the effective date. Your continued use of MegEngine after we post any such changes signifies your agreement to those changes. If you do not agree to the then-current Agreement, you must immediately discontinue using MegEngine.
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..6badd60f
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,74 @@
+MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+
+Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
+
+"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship.
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. 
+ +You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and +You must cause any modified files to carry prominent notices stating that You changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability.
+
+While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..7f7e4810
--- /dev/null
+++ b/README.md
@@ -0,0 +1,139 @@
+# MegEngine
+
+![MegEngine Logo](logo.png)
+
+English | [中文](README_CN.md)
+
+MegEngine is a fast, scalable, and easy-to-use numerical evaluation framework with auto-differentiation.
+
+------
+
+## Installation
+
+**NOTE:** MegEngine currently supports only the Linux platform, with Python 3.5 or higher. On Windows 10 you can try [WSL (Windows Subsystem for Linux)](https://docs.microsoft.com/en-us/windows/wsl) to use Linux within Windows.
+
+### Binaries
+
+To install from pre-built binaries via pip wheels, run:
+
+```bash
+pip3 install megengine -f https://megengine.org.cn/whl/mge.html
+```
+
+## Build from Source
+
+### Prerequisites
+
+Most of MegEngine's dependencies are located in the `third_party` directory, and you do
+not need to install them yourself. You can prepare these repositories by executing:
+
+```bash
+./third_party/prepare.sh
+./third_party/install-mkl.sh
+```
+
+But some dependencies need to be installed manually:
+
+* [CUDA](https://developer.nvidia.com/cuda-toolkit-archive) (>=10.1) and [cuDNN](https://developer.nvidia.com/cudnn) (>=7.6) are required when building MegEngine with CUDA support (default ON)
+* [TensorRT](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html) (>=5.1.5) is required when building with TensorRT support (default ON)
+* LLVM/Clang (>=6.0) is required when building with Halide JIT support (default ON)
+* Python (>=3.5), NumPy, and SWIG (>=3.0) are required to build the Python module (default ON)
+
+### Build
+
+MegEngine prefers an `Out-Of-Source` build and is compiled in a `mostly-static` way.
+Here are the instructions:
+
+1. Make a directory for the build.
+   ```bash
+   mkdir -p build
+   cd build
+   ```
+
+2. Generate build configurations by `CMake`.
+
+   For a CUDA build:
+   ```bash
+   cmake .. -DMGE_WITH_TEST=ON
+   ```
+
+   For a CPU-only build, use `-DMGE_WITH_CUDA=OFF`:
+   ```bash
+   cmake .. -DMGE_WITH_CUDA=OFF -DMGE_WITH_TEST=ON
+   ```
+
+   For deployment with C++ only, use `-DMGE_INFERENCE_ONLY=ON`, and turn off tests with `-DMGE_WITH_TEST=OFF`:
+   ```bash
+   cmake .. -DMGE_INFERENCE_ONLY=ON -DMGE_WITH_TEST=OFF
+   ```
+
+   Use `-DCMAKE_INSTALL_PREFIX=YOUR_PATH` to specify the install path.
+
+
+3. Start the build.
+
+   ```bash
+   make -j$(nproc)
+   ```
+
+4. [optional] Install the library if it was configured for deployment in step 2.
+
+   ```bash
+   make install
+   ```
+
+Here are some other useful options for the build:
+
+* `MGE_ARCH` specifies the architecture MegEngine is built for. (default AUTO)
+* `MGE_WITH_DISTRIBUTED` whether to enable multi-machine distributed support. (default ON)
+* `MGE_WITH_PYTHON_MODULE` whether to build the Python module. (default ON)
+* `MGE_BLAS` chooses `MKL` or `OpenBLAS` as the BLAS library for MegEngine. (default `MKL`)
+* `MGE_CUDA_GENCODE` supplies the `-gencode` option for `nvcc`. (default not supplied)
+* `MGE_DISABLE_FLOAT16` whether to disable float16 support. (default OFF)
+* `MGE_ENABLE_EXCEPTIONS` whether to enable C++ exception support. (default ON)
+* `MGE_ENABLE_LOGGING` whether to enable logging in MegEngine. (default AUTO)
+
+More options can be found by:
+
+```bash
+cd build
+cmake -LAH .. 2>/dev/null| grep -B 1 'MGE_' | less
+```
+
+## How to Contribute
+
+* MegEngine adopts the [Contributor Covenant](https://contributor-covenant.org) to maintain our community. Please read the [Code of Conduct](CODE_OF_CONDUCT.md) for more information.
+* Every contributor of MegEngine must sign a Contributor License Agreement (CLA) to clarify the intellectual property license granted with the contributions. For more details, please refer to the [Contributor License Agreement](CONTRIBUTOR_LICENSE_AGREEMENT.md).
+* You can help MegEngine in many ways:
+    * Write code.
+    * Improve the [documentation](https://github.com/MegEngine/Docs).
+    * Answer questions on the [MegEngine Forum](https://discuss.megengine.org.cn) or Stack Overflow.
+    * Contribute new models to the [MegEngine Model Hub](https://github.com/megengine/hub).
+    * Try a new idea on [MegStudio](https://studio.brainpp.com).
+    * Report or investigate [bugs and issues](https://github.com/MegEngine/MegEngine/issues).
+    * Review [Pull Requests](https://github.com/MegEngine/MegEngine/pulls).
+    * Star the MegEngine repo.
+    * Reference MegEngine in your papers and articles.
+    * Recommend MegEngine to your friends.
+    * ...
+
+We believe we can build an open and friendly community and power humanity with AI.
+
+## How to contact us
+
+* Issue: [github.com/MegEngine/MegEngine/issues](https://github.com/MegEngine/MegEngine/issues)
+* Email: [megengine-support@megvii.com](mailto:megengine-support@megvii.com)
+* Forum: [discuss.megengine.org.cn](https://discuss.megengine.org.cn)
+* QQ: 1029741705
+
+## Resources
+
+- [MegEngine](https://megengine.org.cn)
+- [MegStudio](https://studio.brainpp.com)
+- [Brain++](https://brainpp.megvii.com)
+
+## License
+
+MegEngine is Licensed under the Apache License, Version 2.0
+
+Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
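+As a quick recap of the build options above, here is a minimal sketch of a CPU-only, inference-only build that installs the C++ library into a custom prefix. The prefix `/opt/megengine` and the choice of OpenBLAS are only example values, not requirements:
+
+```bash
+# configure an inference-only, CPU-only build with OpenBLAS and an example install prefix
+mkdir -p build && cd build
+cmake .. \
+    -DMGE_WITH_CUDA=OFF \
+    -DMGE_INFERENCE_ONLY=ON \
+    -DMGE_WITH_TEST=OFF \
+    -DMGE_BLAS=OpenBLAS \
+    -DCMAKE_INSTALL_PREFIX=/opt/megengine
+
+# build and install
+make -j$(nproc)
+make install
+```
+
+With `-DMGE_INFERENCE_ONLY=ON`, the Python module, the tests, and the distributed support are switched off automatically, so only the C++ inference library is built.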
diff --git a/README_CN.md b/README_CN.md new file mode 100644 index 00000000..093c9439 --- /dev/null +++ b/README_CN.md @@ -0,0 +1,137 @@ +# MegEngine + +![MegEngine Logo](logo.png) + +[English](README.md) | 中文 + +MegEngine 是一个快速、可拓展、易于使用且支持自动求导的数值计算框架。 + +------ + + +## 安装说明 + +**注意:** MegEngine 现在仅支持 Linux 平台安装,以及 Python3.5 及以上的版本(不支持 Python2 )。对于 Windows 10 用户,可以通过安装 [WSL(Windows Subsystem for Linux)](https://docs.microsoft.com/en-us/windows/wsl) 进行体验。 + +### 通过包管理器安装 + +通过 pip 安装的命令如下: + +```bash +pip3 install megengine -f https://megengine.org.cn/whl/mge.html +``` + +## 通过源码编译安装 + +### 环境依赖 + +大多数编译 MegEngine 的依赖位于 `third_party` 目录,可以通过以下命令自动安装: + +```bash +$ ./third_party/prepare.sh +$ ./third_party/install-mkl.sh +``` + +但是有一些依赖需要手动安装: + +* [CUDA](https://developer.nvidia.com/cuda-toolkit-archive)(>=10.1), [cuDNN](https://developer.nvidia.com/cudnn)(>=7.6) ,如果需要编译支持 CUDA 的版本(默认开启) +* [TensorRT](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html)(>=5.1.5) ,如果需要编译支持 TensorRT 的版本(默认开启) +* LLVM/Clang(>=6.0) ,如果需要编译支持 Halide JIT 的版本(默认开启) +* Python(>=3.5), Numpy, SWIG(>=3.0) ,如果需要编译生成 Python 模块(默认开启) + +### 开始编译 + +MegEngine 遵循“源外构建”([Out-of-Source Build](https://zh.m.wikibooks.org/zh-hans/CMake_%E5%85%A5%E9%96%80/Out-of-source_Build))原则,并且使用静态编译方式。编译的具体流程如下: + +1. 创建用于编译的目录: + ```bash + mkdir -p build + cd build + ``` + +2. 使用 `CMake` 生成编译配置: + + 生成支持 CUDA 环境的配置: + ```bash + cmake .. -DMGE_WITH_TEST=ON + ``` + + 生成仅支持 CPU 环境的配置,使用 `-DMGE_WITH_CUDA=OFF` 选项: + ```bash + cmake .. -DMGE_WITH_CUDA=OFF -DMGE_WITH_TEST=ON + ``` + + 生成仅用于 C++ 环境部署的配置,使用 `-DMGE_INFERENCE_ONLY=ON` ,并可用 `-DMGE_WITH_TEST=OFF` 关闭测试: + ```bash + cmake .. -DMGE_INFERENCE_ONLY=ON -DMGE_WITH_TEST=OFF + ``` + + 可以使用 `-DCMAKE_INSTALL_PREFIX=YOUR_PATH` 指定具体安装目录。 + +3. 开始编译: + + ```bash + make -j$(nproc) + ``` + +4. [可选] 如果需要用于部署,可以安装 MegEngine 的 C++ 库: + + ```bash + make install + ``` + +以下是其它常用编译选项: + +* `MGE_ARCH` 指定编译的目标平台(默认自动检测当前平台) +* `MGE_WITH_DISTRIBUTED` 是否开启多机分布式支持(默认开启) +* `MGE_WITH_PYTHON_MODULE` 是否编译生成 Python 模块(默认开启) +* `MGE_BLAS` 选择 BLAS 的后端实现,可以是 `MKL` 或 `OpenBLAS` (默认 `MKL`) +* `MGE_CUDA_GENCODE` 指定提供给 `nvcc` 的 `-gencode` 选项(默认不指定) +* `MGE_DISABLE_FLOAT16` 是否不提供 `float16` 类型支持(默认关闭) +* `MGE_ENABLE_EXCEPTIONS` 是否开启 C++ 报错支持(默认开启) +* `MGE_ENABLE_LOGGING` 是否开启 MegEngine 日志信息(默认自动检测) + +更多选项可以通过以下命令查看: + +```bash +cd build +cmake -LAH .. 2>/dev/null| grep -B 1 'MGE_' | less +``` + +## 如何参与贡献 + +* MegEngine 依据 [贡献者公约(Contributor Covenant)](https://contributor-covenant.org)来管理开源社区。请阅读 [行为准则](CODE_OF_CONDUCT.md) 了解更多信息。 +* 每一名 MegEngine 的贡献者都需要签署贡献者许可协议(Contributor License Agreement,CLA)来明确贡献内容相关的知识产权许可。更多细节请参考 [协议内容](CONTRIBUTOR_LICENSE_AGREEMENT.md)。 +* 我们欢迎你通过以下方式来帮助 MegEngine 变得更好: + * 贡献代码; + * 完善[文档](https://github.com/MegEngine/Docs); + * 在 [MegEngine 论坛](https://discuss.megengine.org.cn) 和 Stack Overflow 回答问题; + * 在 [MegEngine Model Hub](https://github.com/megengine/hub) 贡献新模型; + * 在 [MegStudio](https://studio.brainpp.com) 平台尝试新想法; + * 报告使用中的 [Bugs 和 Issues](https://github.com/MegEngine/MegEngine/issues); + * 审查 [Pull Requests](https://github.com/MegEngine/MegEngine/pulls); + * 给 MegEngine 点亮小星星; + * 在你的论文和文章中引用 MegEngine; + * 向你的好友推荐 MegEngine; + * ... 
+ +我们相信我们能够搭建一个开放友善的开源社区环境,用人工智能造福人类。 + +## 联系我们 + +* 问题: [github.com/MegEngine/MegEngine/issues](https://github.com/MegEngine/MegEngine/issues) +* 邮箱: [megengine-support@megvii.com](mailto:megengine-support@megvii.com) +* 论坛: [discuss.megengine.org.cn](https://discuss.megengine.org.cn) +* QQ: 1029741705 + +## 资源 + +- [MegEngine](https://megengine.org.cn) +- [MegStudio](https://studio.brainpp.com) +- [Brain++](https://brainpp.megvii.com) + +## 开源许可 + +MegEngine 使用 Apache License, Version 2.0 + +Copyright (c) 2014-2020 Megvii Inc. All rights reserved. diff --git a/ci/docker_env/manylinux2010/.dockerignore b/ci/docker_env/manylinux2010/.dockerignore new file mode 100644 index 00000000..b8342df1 --- /dev/null +++ b/ci/docker_env/manylinux2010/.dockerignore @@ -0,0 +1,3 @@ +/output/ +/build_image.sh +/build_wheel.sh diff --git a/ci/docker_env/manylinux2010/.gitignore b/ci/docker_env/manylinux2010/.gitignore new file mode 100644 index 00000000..16be8f21 --- /dev/null +++ b/ci/docker_env/manylinux2010/.gitignore @@ -0,0 +1 @@ +/output/ diff --git a/ci/docker_env/manylinux2010/Dockerfile b/ci/docker_env/manylinux2010/Dockerfile new file mode 100644 index 00000000..6f563617 --- /dev/null +++ b/ci/docker_env/manylinux2010/Dockerfile @@ -0,0 +1,11 @@ +FROM quay.io/pypa/manylinux2010_x86_64:2020-01-31-046f791 + +ENV UID=1024 \ + PATH=${PATH}:/usr/local/cuda/bin \ + LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ + LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ + CPATH=${CPATH}:/usr/local/cuda/include:/opt/cudnn/include:/opt/tensorrt/include + +ADD init_image.sh /tmp +RUN /tmp/init_image.sh && rm -f /tmp/init_image.sh + diff --git a/ci/docker_env/manylinux2010/build_image.sh b/ci/docker_env/manylinux2010/build_image.sh new file mode 100755 index 00000000..fd686fd4 --- /dev/null +++ b/ci/docker_env/manylinux2010/build_image.sh @@ -0,0 +1,5 @@ +#!/bin/bash -e + +cd $(dirname $0) + +docker build -t env_manylinux2010:latest . diff --git a/ci/docker_env/manylinux2010/build_wheel.sh b/ci/docker_env/manylinux2010/build_wheel.sh new file mode 100755 index 00000000..d1be8d3a --- /dev/null +++ b/ci/docker_env/manylinux2010/build_wheel.sh @@ -0,0 +1,31 @@ +#!/bin/bash -e + +CWD=$(dirname $0) +BASEDIR=$(readlink -f ${CWD}/../../..) +OUTPUTDIR=$(readlink -f ${CWD}/output) +USERID=$(id -u) +TMPFS_ARGS="--tmpfs /tmp:exec" + +pushd ${BASEDIR}/third_party >/dev/null + ./prepare.sh +popd >/dev/null + +cd ${CWD} +mkdir -p ${OUTPUTDIR} + +if [[ -z ${CUDA_ROOT_DIR} ]]; then +echo "Environment variable CUDA_ROOT_DIR not set." +exit -1 +fi +if [[ -z ${CUDNN_ROOT_DIR} ]]; then +echo "Environment variable CUDNN_ROOT_DIR not set." +exit -1 +fi +if [[ -z ${TENSORRT_ROOT_DIR} ]]; then +echo "Environment variable TENSORRT_ROOT_DIR not set." 
+exit -1 +fi + +docker run -it --rm $TMPFS_ARGS -e UID=${USERID} -e LOCAL_VERSION=${LOCAL_VERSION} -e ALL_PYTHON=${ALL_PYTHON} -v ${CUDA_ROOT_DIR}:/usr/local/cuda -v ${CUDNN_ROOT_DIR}:/opt/cudnn -v ${TENSORRT_ROOT_DIR}:/opt/tensorrt -v ${BASEDIR}:/home/code -v ${OUTPUTDIR}:/home/output:rw env_manylinux2010:latest /home/code/ci/docker_env/manylinux2010/do_build.sh + + diff --git a/ci/docker_env/manylinux2010/do_build.sh b/ci/docker_env/manylinux2010/do_build.sh new file mode 100755 index 00000000..384f509f --- /dev/null +++ b/ci/docker_env/manylinux2010/do_build.sh @@ -0,0 +1,56 @@ +#!/bin/bash -e +ALL_PYTHON=${ALL_PYTHON} +if [[ -z ${ALL_PYTHON} ]] +then + ALL_PYTHON="35m 36m 37m 38" +fi + +EXTRA_CMAKE_ARGS= + +for ver in ${ALL_PYTHON} +do + python_ver=${ver:0:2} + BUILD_DIR=/tmp/build_megengine/python${python_ver} + MAJOR=${python_ver:0:1} + MINOR=${ver:1} + PYTHON_DIR=/opt/python/cp${python_ver}-cp${ver}/ + EXT_NAME=_mgb.cpython-${ver}-x86_64-linux-gnu.so + mkdir -p ${BUILD_DIR} + pushd ${BUILD_DIR} >/dev/null + cmake /home/code -DMGE_WITH_DISTRIBUTED=ON -DMGE_WITH_CUDA=ON \ + -DCMAKE_PREFIX_PATH=${PYTHON_DIR} \ + -DMGE_WITH_TEST=ON -DCMAKE_INSTALL_PREFIX=/home/output \ + -DPYTHON_LIBRARY=${PYTHON_DIR}lib/ \ + -DPYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python${MAJOR}.${MINOR}/ \ + ${EXTRA_CMAKE_ARGS} + make -j$(nproc) + make install + mkdir -p staging + mkdir -p /home/output/debug + cp -a python_module/{megengine,setup.py} staging/ + pushd dnn/cuda-stub/ >/dev/null + strip -s libcuda.so + ln -sf libcuda.so libcuda.so.1 + popd >/dev/null + pushd staging >/dev/null + pushd megengine/_internal >/dev/null + objcopy --only-keep-debug _mgb.so ${EXT_NAME}.dbg + strip -s _mgb.so + objcopy --add-gnu-debuglink=${EXT_NAME}.dbg _mgb.so + cp -a ${EXT_NAME}.dbg /home/output/debug + mkdir -p lib/ucx + cp -L /usr/local/cuda/lib*/libnvrtc-builtins.so lib + cp -L ${BUILD_DIR}/third_party/MegRay/third_party/ucx/lib/ucx/*.so lib/ucx/ + strip -s lib/ucx/*.so + popd >/dev/null + ${PYTHON_DIR}/bin/python setup.py bdist_wheel + popd >/dev/null + popd >/dev/null + pushd /home/output >/dev/null + LD_LIBRARY_PATH=${BUILD_DIR}/dnn/cuda-stub:$LD_LIBRARY_PATH auditwheel repair -L _internal/lib ${BUILD_DIR}/staging/dist/Meg*.whl + chown -R ${UID}.${UID} . 
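+    # the repaired wheel under /home/output now belongs to the host user; remove this Python version's build tree before the next loop iteration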
+ popd >/dev/null + rm -rf ${BUILD_DIR} +done + + diff --git a/ci/docker_env/manylinux2010/init_image.sh b/ci/docker_env/manylinux2010/init_image.sh new file mode 100755 index 00000000..37511884 --- /dev/null +++ b/ci/docker_env/manylinux2010/init_image.sh @@ -0,0 +1,97 @@ +#!/bin/bash -e + +GET_PIP_URL='https://bootstrap.pypa.io/get-pip.py' +SWIG_URL='https://downloads.sourceforge.net/project/swig/swig/swig-3.0.12/swig-3.0.12.tar.gz?use_mirror=autoselect' +LLVM_URL='https://github.com/llvm-mirror/llvm/archive/release_60.tar.gz' +CLANG_URL='https://github.com/llvm-mirror/clang/archive/release_60.tar.gz' + +yum erase -y cmake cmake28 +yum install -y python34-pip pcre-devel + +pip3 install --no-cache-dir --only-binary :all: -U pip==19.1 +pip3 install --no-cache-dir --only-binary :all: cmake==3.16.3 + +for ver in 35m 36m 37m 38 +do + python_ver=${ver:0:2} + curl ${GET_PIP_URL} | /opt/python/cp${python_ver}-cp${ver}/bin/python - \ + --no-cache-dir --only-binary :all: + /opt/python/cp${python_ver}-cp${ver}/bin/pip install \ + --no-cache-dir --only-binary :all: numpy==1.18.1 +done + +pushd /home >/dev/null + curl -sSL ${SWIG_URL} | tar xz + pushd swig-3.0.12 >/dev/null + mkdir build + pushd build >/dev/null + ../configure + make -j$(nproc) + make install + popd >/dev/null + popd >/dev/null + rm -rf swig-3.0.12 + + curl -sSL ${LLVM_URL} | tar xz + pushd llvm-release_60 >/dev/null + mkdir build + pushd build >/dev/null + cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ + -DCMAKE_BUILD_TYPE=Release + make -j$(nproc) + make install + popd >/dev/null + popd >/dev/null + rm -rf llvm-release_60 + + curl -sSL ${CLANG_URL} | tar xz + pushd clang-release_60 >/dev/null + mkdir build + pushd build >/dev/null + cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ + -DCMAKE_BUILD_TYPE=Release + make -j$(nproc) + make install + popd >/dev/null + popd >/dev/null + rm -rf clang-release_60 +popd >/dev/null + +pushd /tmp >/dev/null + curl -sSL https://github.com/NixOS/patchelf/archive/0.10.tar.gz | tar xz + pushd /tmp/patchelf-0.10 >/dev/null + patch -p1 <<'EOF' +diff --git a/src/patchelf.cc b/src/patchelf.cc +index 0b4965a..7aae7a4 100644 +--- a/src/patchelf.cc ++++ b/src/patchelf.cc +@@ -1074,13 +1074,6 @@ void ElfFile::modifySoname(sonameMode op, const std::string & + return; + } + +- /* Zero out the previous SONAME */ +- unsigned int sonameSize = 0; +- if (soname) { +- sonameSize = strlen(soname); +- memset(soname, 'X', sonameSize); +- } +- + debug("new SONAME is '%s'\n", newSoname.c_str()); + + /* Grow the .dynstr section to make room for the new SONAME. */ +@@ -1264,7 +1257,6 @@ void ElfFile::modifyRPath(RPathOp op, + unsigned int rpathSize = 0; + if (rpath) { + rpathSize = strlen(rpath); +- memset(rpath, 'X', rpathSize); + } + + debug("new rpath is '%s'\n", newRPath.c_str()); + +EOF + ./bootstrap.sh && ./configure && make install-strip + popd + rm -rf /tmp/patchelf-0.10 +popd + +yum clean all diff --git a/cmake/Halide.cmake b/cmake/Halide.cmake new file mode 100644 index 00000000..4b145daf --- /dev/null +++ b/cmake/Halide.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) +find_package(LLVM 6.0 REQUIRED CONFIG) + +STRING(REPLACE "." 
";" LLVM_VERSION_LIST ${LLVM_PACKAGE_VERSION}) +list(GET LLVM_VERSION_LIST 0 LLVM_VERSION_MAJOR) +list(GET LLVM_VERSION_LIST 1 LLVM_VERSION_MINOR) + +set(HALIDE_DIR "${PROJECT_SOURCE_DIR}/third_party/Halide" CACHE STRING "halide directory") +set(HALIDE_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/Halide) +set(HALIDE_LIB ${HALIDE_BUILD_DIR}/lib/libHalide.a) +ExternalProject_add( + halide + SOURCE_DIR ${HALIDE_DIR} + PREFIX ${HALIDE_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DCMAKE_INSTALL_PREFIX=${HALIDE_BUILD_DIR} -DWITH_APPS=OFF -DWITH_TESTS=OFF -DWITH_TUTORIALS=OFF -DHALIDE_SHARED_LIBRARY=OFF -DHALIDE_REQUIRE_LLVM_VERSION=${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DTARGET_MIPS=OFF -DTARGET_POWERPC=OFF + BUILD_BYPRODUCTS ${HALIDE_LIB} +) + +set(HALIDE_INC ${HALIDE_BUILD_DIR}/include) +file(MAKE_DIRECTORY ${HALIDE_INC}) +add_library(libhalide STATIC IMPORTED GLOBAL) +add_dependencies(libhalide halide) +set_target_properties( + libhalide PROPERTIES + IMPORTED_LOCATION ${HALIDE_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${HALIDE_INC} +) + +set(LLVM_COMPONENTS mcjit;bitwriter;linker;passes;X86;ARM;AArch64;Hexagon;NVPTX;AMDGPU) +llvm_map_components_to_libnames(HALIDE_LLVM_LIBS ${LLVM_COMPONENTS}) + diff --git a/cmake/MKL_DNN.cmake b/cmake/MKL_DNN.cmake new file mode 100644 index 00000000..a564f303 --- /dev/null +++ b/cmake/MKL_DNN.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) +include(GNUInstallDirs) + +set(MKLDNN_DIR "${PROJECT_SOURCE_DIR}/third_party/intel-mkl-dnn" CACHE STRING "mkldnn directory") +set(MKLDNN_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/intel-mkl-dnn) +set(MKLDNN_LIB ${MKLDNN_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libdnnl.a) + +if(MGE_BLAS STREQUAL "MKL") + list(APPEND MKLDNN_BUILD_ARGS -D_DNNL_USE_MKL=ON -DMKLROOT=${MKL_ROOT_DIR}) +else() + list(APPEND MKLDNN_BUILD_ARGS -D_DNNL_USE_MKL=OFF) +endif() + +ExternalProject_add( + mkl_dnn + SOURCE_DIR ${MKLDNN_DIR} + PREFIX ${MKLDNN_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${MKLDNN_BUILD_DIR} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DDNNL_LIBRARY_TYPE=STATIC -DDNNL_CPU_RUNTIME=DNNL_RUNTIME_SEQ ${MKLDNN_BUILD_ARGS} + BUILD_BYPRODUCTS ${MKLDNN_LIB} +) + +set(MKLDNN_INC ${MKLDNN_BUILD_DIR}/include) +file(MAKE_DIRECTORY ${MKLDNN_INC}) + +add_library(libmkl_dnn STATIC IMPORTED GLOBAL) +add_dependencies(libmkl_dnn mkl_dnn) +set_target_properties( + libmkl_dnn PROPERTIES + IMPORTED_LOCATION ${MKLDNN_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${MKLDNN_INC} +) diff --git a/cmake/Modules/FindNumpy.cmake b/cmake/Modules/FindNumpy.cmake new file mode 100644 index 00000000..248f8c21 --- /dev/null +++ b/cmake/Modules/FindNumpy.cmake @@ -0,0 +1,55 @@ +# - Find the NumPy libraries +# This module finds if NumPy is installed, and sets the following variables +# indicating where it is. +# +# TODO: Update to provide the libraries and paths for linking npymath lib. 
+# +# NUMPY_FOUND - was NumPy found +# NUMPY_VERSION - the version of NumPy found as a string +# NUMPY_VERSION_MAJOR - the major version number of NumPy +# NUMPY_VERSION_MINOR - the minor version number of NumPy +# NUMPY_VERSION_PATCH - the patch version number of NumPy +# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601 +# NUMPY_INCLUDE_DIR - path to the NumPy include files + +unset(NUMPY_VERSION) +unset(NUMPY_INCLUDE_DIR) + +if(PYTHONINTERP_FOUND) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import numpy as n; print(n.__version__); print(n.get_include());" + RESULT_VARIABLE __result + OUTPUT_VARIABLE __output + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(__result MATCHES 0) + string(REGEX REPLACE ";" "\\\\;" __values ${__output}) + string(REGEX REPLACE "\r?\n" ";" __values ${__values}) + list(GET __values 0 NUMPY_VERSION) + list(GET __values 1 NUMPY_INCLUDE_DIR) + + string(REGEX MATCH "^([0-9])+\\.([0-9])+\\.([0-9])+" __ver_check "${NUMPY_VERSION}") + if(NOT "${__ver_check}" STREQUAL "") + set(NUMPY_VERSION_MAJOR ${CMAKE_MATCH_1}) + set(NUMPY_VERSION_MINOR ${CMAKE_MATCH_2}) + set(NUMPY_VERSION_PATCH ${CMAKE_MATCH_3}) + math(EXPR NUMPY_VERSION_DECIMAL + "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}") + string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR}) + else() + unset(NUMPY_VERSION) + unset(NUMPY_INCLUDE_DIR) + message(STATUS "Requested NumPy version and include path, but got instead:\n${__output}\n") + endif() + endif() +else() + message(STATUS "To find NumPy Python interpretator is required to be found.") +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NumPy REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION + VERSION_VAR NUMPY_VERSION) + +if(NUMPY_FOUND) + message(STATUS "NumPy ver. 
${NUMPY_VERSION} found (include: ${NUMPY_INCLUDE_DIR})") +endif() diff --git a/cmake/OpenBLAS.cmake b/cmake/OpenBLAS.cmake new file mode 100644 index 00000000..37fbfa65 --- /dev/null +++ b/cmake/OpenBLAS.cmake @@ -0,0 +1,34 @@ +include(ExternalProject) +include(GNUInstallDirs) + +set(OPENBLAS_DIR "${PROJECT_SOURCE_DIR}/third_party/OpenBLAS" CACHE STRING "OpenBLAS directory") +set(OPENBLAS_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/OpenBLAS) + +set(OPENBLAS_INC ${OPENBLAS_BUILD_DIR}/include) +set(OPENBLAS_LIB ${OPENBLAS_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libopenblas.a) + +if(${CMAKE_GENERATOR} STREQUAL "Ninja") + set(MAKE_COMMAND make) +else() + set(MAKE_COMMAND "$(MAKE)") +endif() + +ExternalProject_add( + openblas + SOURCE_DIR ${OPENBLAS_DIR} + PREFIX ${OPENBLAS_BUILD_DIR} + CMAKE_GENERATOR "Unix Makefiles" + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${OPENBLAS_BUILD_DIR} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DCMAKE_POSITION_INDEPENDENT_CODE=ON + BUILD_COMMAND ${MAKE_COMMAND} + BUILD_BYPRODUCTS ${OPENBLAS_LIB} ${OPENBLAS_PROTOC_EXECUTABLE} +) + +file(MAKE_DIRECTORY ${OPENBLAS_INC}) + +add_library(libopenblas STATIC IMPORTED GLOBAL) +add_dependencies(libopenblas openblas) +set_target_properties( + libopenblas PROPERTIES + IMPORTED_LOCATION ${OPENBLAS_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${OPENBLAS_BUILD_DIR}/include +) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake new file mode 100644 index 00000000..b8cef397 --- /dev/null +++ b/cmake/cudnn.cmake @@ -0,0 +1,66 @@ +find_package(PkgConfig) +if(${PkgConfig_FOUND}) + pkg_check_modules(PC_CUDNN QUIET CUDNN) +endif() + +if(NOT "$ENV{LIBRARY_PATH}" STREQUAL "") + string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH}) +endif() + +if(MGE_CUDA_USE_STATIC) + find_library(CUDNN_LIBRARY + NAMES libcudnn_static.a libcudnn_static.lib + PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} + HINTS ${SYSTEM_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CUDNN library." ) +else() + find_library(CUDNN_LIBRARY + NAMES libcudnn.so libcudnn.dylib cudnn64.dll + PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} + HINTS ${SYSTEM_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CUDNN library." ) +endif() + +if(CUDNN_LIBRARY STREQUAL "CUDNN_LIBRARY-NOTFOUND") + message(FATAL_ERROR "Can not find CuDNN Library") +endif() + +get_filename_component(__found_cudnn_root ${CUDNN_LIBRARY}/../.. REALPATH) +find_path(CUDNN_INCLUDE_DIR + NAMES cudnn.h + HINTS ${PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} + PATH_SUFFIXES include + DOC "Path to CUDNN include directory." 
) + +if(CUDNN_INCLUDE_DIR STREQUAL "CUDNN_INCLUDE_DIR-NOTFOUND") + message(FATAL_ERROR "Can not find CuDNN Library") +endif() + +file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) +string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" + CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") +string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" + CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}") +string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)" + CUDNN_MINOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") +string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1" + CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}") +string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" + CUDNN_PATCH_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") +string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" + CUDNN_PATCH_VERSION "${CUDNN_PATCH_VERSION}") +set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}) + +if(MGE_CUDA_USE_STATIC) + add_library(libcudnn STATIC IMPORTED) +else() + add_library(libcudnn SHARED IMPORTED) +endif() + +set_target_properties(libcudnn PROPERTIES + IMPORTED_LOCATION ${CUDNN_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR}) + +message("-- Found CuDNN: ${__found_cudnn_root} (found version: ${CUDNN_VERSION})") diff --git a/cmake/flatbuffers.cmake b/cmake/flatbuffers.cmake new file mode 100644 index 00000000..47818998 --- /dev/null +++ b/cmake/flatbuffers.cmake @@ -0,0 +1,9 @@ +if (MGE_USE_SYSTEM_LIB) + find_package(FlatBuffers REQUIRED) + return() +endif() + +option(FLATBUFFERS_BUILD_TESTS "" OFF) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/flatbuffers + ${CMAKE_CURRENT_BINARY_DIR}/flatbuffers + EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/cmake/gtest.cmake b/cmake/gtest.cmake new file mode 100644 index 00000000..d2be2f35 --- /dev/null +++ b/cmake/gtest.cmake @@ -0,0 +1,2 @@ +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gtest ${CMAKE_CURRENT_BINARY_DIR}/gtest EXCLUDE_FROM_ALL) + diff --git a/cmake/mkl.cmake b/cmake/mkl.cmake new file mode 100644 index 00000000..c2653fb6 --- /dev/null +++ b/cmake/mkl.cmake @@ -0,0 +1,70 @@ +find_path(MKL_ROOT_DIR + include/mkl_cblas.h + PATHS + ${PROJECT_SOURCE_DIR}/third_party/mkl/${MGE_ARCH} + $ENV{MKLDIR} + /opt/intel/mkl/*/ + /opt/intel/cmkl/*/ + /Library/Frameworks/Intel_MKL.framework/Versions/Current/lib/universal +) + +if(${MKL_ROOT_DIR} STREQUAL "MKL_ROOT_DIR-NOTFOUND") + message(FATAL_ERROR "Can not find MKL") +endif() +message("-- Build with MKL in ${MKL_ROOT_DIR}") + +find_path(MKL_INCLUDE_DIR + mkl_cblas.h + PATHS + ${MKL_ROOT_DIR}/include + ${INCLUDE_INSTALL_DIR} +) + +option(MGE_MKL_USE_STATIC "Build MegEngine with static MKL" ON) +if(MGE_MKL_USE_STATIC) + find_library(MKL_CORE_LIBRARY + NAMES libmkl_core.a libmkl_core.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + + find_library(MKL_SEQUENTIAL_LIBRARY + NAMES libmkl_sequential.a libmkl_sequential.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + + if(${MGE_ARCH} STREQUAL "x86_64") + find_library(MKL_IPL_LIBRARY + NAMES libmkl_intel_ilp64.a libmkl_intel_ilp64.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + elseif(${MGE_ARCH} STREQUAL "x86_32") + find_library(MKL_IPL_LIBRARY + NAMES libmkl_intel_32.a libmkl_intel_32.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + endif() + + add_library(libmkl INTERFACE) + target_link_libraries(libmkl INTERFACE -Wl,--start-group ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY} 
-Wl,--end-group) + target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) +else() + find_library(MKL_CORE_LIBRARY + NAMES libmkl_core.so libmkl_core.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + + find_library(MKL_SEQUENTIAL_LIBRARY + NAMES libmkl_sequential.so libmkl_sequential.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + + if(${MGE_ARCH} STREQUAL "x86_64") + find_library(MKL_IPL_LIBRARY + NAMES libmkl_intel_ilp64.so libmkl_intel_ilp64.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + elseif(${MGE_ARCH} STREQUAL "x86_32") + find_library(MKL_IPL_LIBRARY + NAMES libmkl_intel_32.so libmkl_intel_32.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + endif() + target_link_libraries(libmkl INTERFACE ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY}) + target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) +endif() + +if(${MGE_ARCH} STREQUAL "x86_64") + target_compile_definitions(libmkl INTERFACE -DMKL_ILP64) +endif() diff --git a/cmake/protobuf.cmake b/cmake/protobuf.cmake new file mode 100644 index 00000000..5802b25f --- /dev/null +++ b/cmake/protobuf.cmake @@ -0,0 +1,90 @@ +function(PROTOBUF_GENERATE_CPP_WITH_ROOT SRCS HDRS ROOT_DIR) + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP_WITH_ROOT() called without any proto files") + return() + endif() + + set(${SRCS}) + set(${HDRS}) + foreach(FIL ${ARGN}) + set(ABS_FIL ${ROOT_DIR}/${FIL}) + get_filename_component(FIL_WE ${FIL} NAME_WE) + get_filename_component(FIL_DIR ${ABS_FIL} PATH) + file(RELATIVE_PATH REL_DIR ${ROOT_DIR} ${FIL_DIR}) + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") + list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") + + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} -I ${FIL_DIR} ${ABS_FIL} -I ${PROTOBUF_INCLUDE_DIRS} + DEPENDS ${ABS_FIL} libprotobuf + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM) + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() + +if(MGE_USE_SYSTEM_LIB) + find_package(Protobuf) + if(Protobuf_FOUND) + add_library(libprotobuf INTERFACE) + target_link_libraries(libprotobuf INTERFACE ${Protobuf_LIBRARIES}) + target_include_directories(libprotobuf INTERFACE ${Protobuf_INCLUDE_DIRS}) + get_filename_component(Protobuf_ROOT ${Protobuf_INCLUDE_DIR} DIRECTORY) + set(PROTOBUF_ROOT ${Protobuf_ROOT}) + set(PROTOBUF_PROTOC_EXECUTABLE ${Protobuf_PROTOC_EXECUTABLE}) + set(PROTOBUF_INCLUDE_DIRS ${Protobuf_INCLUDE_DIRS}) + return() + endif() +endif() + + +include(ExternalProject) +include(GNUInstallDirs) + +set(PROTOBUF_DIR "${PROJECT_SOURCE_DIR}/third_party/protobuf" CACHE STRING "protobuf directory") +set(PROTOBUF_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/protobuf) + +if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobufd.a) +else() + set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobuf.a) +endif() +set(PROTOBUF_PROTOC_EXECUTABLE ${PROTOBUF_BUILD_DIR}/bin/protoc) + +ExternalProject_add( + protobuf + SOURCE_DIR ${PROTOBUF_DIR}/cmake + PREFIX ${PROTOBUF_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} 
-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_BUILD_DIR} -Dprotobuf_BUILD_EXAMPLES=OFF -Dprotobuf_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON + BUILD_BYPRODUCTS ${PROTOBUF_LIB} ${PROTOBUF_PROTOC_EXECUTABLE} +) + +set(PROTOBUF_INC ${PROTOBUF_BUILD_DIR}/include) +file(MAKE_DIRECTORY ${PROTOBUF_INC}) + +add_library(libprotobuf STATIC IMPORTED GLOBAL) +add_dependencies(libprotobuf protobuf) +set_target_properties( + libprotobuf PROPERTIES + IMPORTED_LOCATION ${PROTOBUF_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${PROTOBUF_BUILD_DIR}/include +) + +add_executable(protoc IMPORTED GLOBAL) +add_dependencies(protoc protobuf) +set_target_properties( + protoc PROPERTIES + IMPORTED_LOCATION ${PROTOBUF_BUILD_DIR}/bin/protoc +) + +set(PROTOBUF_ROOT ${PROTOBUF_BUILD_DIR}) +set(PROTOBUF_PROTOC_EXECUTABLE protoc) +set(PROTOBUF_INCLUDE_DIRS ${PROTOBUF_BUILD_DIR}/include) + diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake new file mode 100644 index 00000000..7205f907 --- /dev/null +++ b/cmake/tensorrt.cmake @@ -0,0 +1,63 @@ +if($ENV{LIBRARY_PATH}) + string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH}) +endif() + +if(MGE_CUDA_USE_STATIC) + find_library(TRT_LIBRARY + NAMES libnvinfer_static.a libnvinfer_static.lib + PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${SYSTEM_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT library." ) +else() + find_library(TRT_LIBRARY + NAMES libnvinfer.so libnvinfer.dylib + PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${SYSTEM_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT library." ) +endif() + +if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") + message(FATAL_ERROR "Can not find TensorRT Library") +endif() + +get_filename_component(__found_trt_root ${TRT_LIBRARY}/../.. REALPATH) +find_path(TRT_INCLUDE_DIR + NAMES NvInfer.h + HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root} + PATH_SUFFIXES include + DOC "Path to TRT include directory." 
) + +if(TRT_INCLUDE_DIR STREQUAL "TRT_INCLUDE_DIR-NOTFOUND") + message(FATAL_ERROR "Can not find TensorRT Library") +endif() + +file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") +file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") +file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") + +if (TensorRT_MAJOR STREQUAL "") + file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") + file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") + file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") +endif() + +string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") +string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") +string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") +set(TRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") + +if(MGE_CUDA_USE_STATIC) + add_library(libnvinfer STATIC IMPORTED) +else() + add_library(libnvinfer SHARED IMPORTED) +endif() + +set_target_properties(libnvinfer PROPERTIES + IMPORTED_LOCATION ${TRT_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${TRT_INCLUDE_DIR} +) + +message("-- Found TensorRT: ${__found_trt_root} (found version: ${TRT_VERSION_STRING})") + diff --git a/cmake/zmq.cmake b/cmake/zmq.cmake new file mode 100644 index 00000000..92a90bac --- /dev/null +++ b/cmake/zmq.cmake @@ -0,0 +1,25 @@ +include(ExternalProject) +include(GNUInstallDirs) + +set(ZMQ_DIR ${PROJECT_SOURCE_DIR}/third_party/libzmq CACHE STRING "ZMQ directory") +set(ZMQ_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/libzmq) +set(ZMQ_LIB ${ZMQ_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libzmq.a) + +ExternalProject_add( + zmq + SOURCE_DIR ${ZMQ_DIR} + PREFIX ${ZMQ_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DCMAKE_INSTALL_PREFIX=${ZMQ_BUILD_DIR} -DWITH_PERF_TOOL=OFF -DZMQ_BUILD_TESTS=OFF -DENABLE_CPACK=OFF -DENABLE_CURVE=OFF + BUILD_BYPRODUCTS ${ZMQ_LIB} +) + +set(ZMQ_INC ${ZMQ_BUILD_DIR}/include) +file(MAKE_DIRECTORY ${ZMQ_INC}) + +add_library(libzmq STATIC IMPORTED GLOBAL) +add_dependencies(libzmq zmq) +set_target_properties( + libzmq PROPERTIES + IMPORTED_LOCATION ${ZMQ_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${ZMQ_INC} +) diff --git a/dnn/CMakeLists.txt b/dnn/CMakeLists.txt new file mode 100644 index 00000000..fa9dcd84 --- /dev/null +++ b/dnn/CMakeLists.txt @@ -0,0 +1,97 @@ +if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") + if(${MGE_BLAS} STREQUAL "MKL") + add_definitions(-DMEGDNN_X86_WITH_MKL) + elseif(${MGE_BLAS} STREQUAL "OpenBLAS") + add_definitions(-DMEGDNN_X86_WITH_OPENBLAS) + endif() +endif() + +# Enable Naive +if(${MGE_ARCH} STREQUAL "naive") + add_definitions(-DMEGDNN_NAIVE=1) + message(WARNING "MEGDNN_NAIVE is enabled; MegDNN performance is degraded.") +else() + add_definitions(-DMEGDNN_NAIVE=0) +endif() + + +if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") + add_definitions(-DMEGDNN_X86=1) + if(${MGE_ARCH} STREQUAL "x86_64") + 
add_definitions(-DMEGDNN_X86_64 -DMEGDNN_64_BIT) + if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") + endif() + else() + add_definitions(-DMEGDNN_X86_32) + if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") + endif() + endif() + if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mfpmath=sse") + endif() +endif() + + +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MARCH}") + +list(APPEND OPR_PARAM_DEFS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/scripts/opr_param_defs.py) +set(OPR_PARAM_DEFS_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/scripts/gen_param_defs.py) + +set(OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/include/) +file(MAKE_DIRECTORY ${OPR_PARAM_DEFS_OUT_DIR}/megdnn) +add_custom_command( + OUTPUT + ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h + COMMAND ${PYTHON_EXECUTABLE} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} + ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + COMMAND ${PYTHON_EXECUTABLE} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} + /dev/null --write-cppjson ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h + DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} + VERBATIM +) + +list(APPEND OPR_PARAM_DEFS_OUTS + ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h +) +list(APPEND OPR_PARAM_DEFS_INC ${OPR_PARAM_DEFS_OUT_DIR}) + +set(OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) +file(MAKE_DIRECTORY ${OPR_PARAM_DEFS_OUT_DIR}/src/common) +add_custom_command( + OUTPUT + ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh + COMMAND ${PYTHON_EXECUTABLE} ${OPR_PARAM_DEFS_SCRIPT} + --enumv ${OPR_PARAM_DEFS_SRCS} + ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh + DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} + VERBATIM +) + +list(APPEND OPR_PARAM_DEFS_OUTS + ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh +) +list(APPEND OPR_PARAM_DEFS_INC ${OPR_PARAM_DEFS_OUT_DIR}) + + +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/megdnn DESTINATION include FILES_MATCHING PATTERN "*.h") + +add_custom_target(_opr_param_defs DEPENDS ${OPR_PARAM_DEFS_OUTS}) +add_library(opr_param_defs INTERFACE) +target_include_directories(opr_param_defs INTERFACE ${OPR_PARAM_DEFS_INC}) +add_dependencies(opr_param_defs _opr_param_defs) + + + + + +if(MGE_WITH_TEST) + # use multi threads + add_definitions (-DMEGDNN_ENABLE_MULTI_THREADS=1) + add_subdirectory(test) +endif() + +add_subdirectory(src) diff --git a/dnn/cuda-stub/CMakeLists.txt b/dnn/cuda-stub/CMakeLists.txt new file mode 100644 index 00000000..090e8509 --- /dev/null +++ b/dnn/cuda-stub/CMakeLists.txt @@ -0,0 +1,6 @@ +file (GLOB_RECURSE SOURCES src/*.cpp) + +add_library (cuda-stub SHARED ${SOURCES}) +set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda) +target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) +target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) diff --git a/dnn/cuda-stub/src/libcuda-wrap.h b/dnn/cuda-stub/src/libcuda-wrap.h new file mode 100644 index 00000000..a0954ad6 --- /dev/null +++ b/dnn/cuda-stub/src/libcuda-wrap.h @@ -0,0 +1,5054 @@ +// generated by wraplib.py +// --- begin functions to be implemented +#ifndef _WRAPLIB_API_CALL +#define _WRAPLIB_API_CALL +#endif +#ifndef _WRAPLIB_CALLBACK +#define _WRAPLIB_CALLBACK +#endif +#ifndef ON_ENTRY +#define ON_ENTRY(x) +#endif +static void* get_library_handle(); +static void* resolve_library_func(void* , const char*); +namespace { +template T on_init_failed(int 
func_idx); +} +// --- end functions to be implemented +#include +#include +static void load_library(); +static CUresult _WRAPLIB_API_CALL cuGetErrorString_init(CUresult arg0, const char **arg1) { + load_library(); + return cuGetErrorString(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuGetErrorString_error(CUresult, const char **) { + return on_init_failed(0); +} +static CUresult _WRAPLIB_API_CALL cuGetErrorName_init(CUresult arg0, const char **arg1) { + load_library(); + return cuGetErrorName(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuGetErrorName_error(CUresult, const char **) { + return on_init_failed(1); +} +static CUresult _WRAPLIB_API_CALL cuInit_init(unsigned int arg0) { + load_library(); + return cuInit(arg0); +} +static CUresult _WRAPLIB_API_CALL cuInit_error(unsigned int) { + return on_init_failed(2); +} +static CUresult _WRAPLIB_API_CALL cuDriverGetVersion_init(int *arg0) { + load_library(); + return cuDriverGetVersion(arg0); +} +static CUresult _WRAPLIB_API_CALL cuDriverGetVersion_error(int *) { + return on_init_failed(3); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGet_init(CUdevice *arg0, int arg1) { + load_library(); + return cuDeviceGet(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGet_error(CUdevice *, int) { + return on_init_failed(4); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetCount_init(int *arg0) { + load_library(); + return cuDeviceGetCount(arg0); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetCount_error(int *) { + return on_init_failed(5); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetName_init(char *arg0, int arg1, CUdevice arg2) { + load_library(); + return cuDeviceGetName(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetName_error(char *, int, CUdevice) { + return on_init_failed(6); +} +static CUresult _WRAPLIB_API_CALL cuDeviceTotalMem_v2_init(size_t *arg0, CUdevice arg1) { + load_library(); + return cuDeviceTotalMem_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuDeviceTotalMem_v2_error(size_t *, CUdevice) { + return on_init_failed(7); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetAttribute_init(int *arg0, CUdevice_attribute arg1, CUdevice arg2) { + load_library(); + return cuDeviceGetAttribute(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetAttribute_error(int *, CUdevice_attribute, CUdevice) { + return on_init_failed(8); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetProperties_init(CUdevprop *arg0, CUdevice arg1) { + load_library(); + return cuDeviceGetProperties(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetProperties_error(CUdevprop *, CUdevice) { + return on_init_failed(9); +} +static CUresult _WRAPLIB_API_CALL cuDeviceComputeCapability_init(int *arg0, int *arg1, CUdevice arg2) { + load_library(); + return cuDeviceComputeCapability(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuDeviceComputeCapability_error(int *, int *, CUdevice) { + return on_init_failed(10); +} +static CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxRetain_init(CUcontext *arg0, CUdevice arg1) { + load_library(); + return cuDevicePrimaryCtxRetain(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxRetain_error(CUcontext *, CUdevice) { + return on_init_failed(11); +} +static CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxRelease_init(CUdevice arg0) { + load_library(); + return cuDevicePrimaryCtxRelease(arg0); +} +static CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxRelease_error(CUdevice) { + return on_init_failed(12); +} +static CUresult _WRAPLIB_API_CALL 
cuDevicePrimaryCtxSetFlags_init(CUdevice arg0, unsigned int arg1) { + load_library(); + return cuDevicePrimaryCtxSetFlags(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxSetFlags_error(CUdevice, unsigned int) { + return on_init_failed(13); +} +static CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxGetState_init(CUdevice arg0, unsigned int *arg1, int *arg2) { + load_library(); + return cuDevicePrimaryCtxGetState(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxGetState_error(CUdevice, unsigned int *, int *) { + return on_init_failed(14); +} +static CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxReset_init(CUdevice arg0) { + load_library(); + return cuDevicePrimaryCtxReset(arg0); +} +static CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxReset_error(CUdevice) { + return on_init_failed(15); +} +static CUresult _WRAPLIB_API_CALL cuCtxCreate_v2_init(CUcontext *arg0, unsigned int arg1, CUdevice arg2) { + load_library(); + return cuCtxCreate_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuCtxCreate_v2_error(CUcontext *, unsigned int, CUdevice) { + return on_init_failed(16); +} +static CUresult _WRAPLIB_API_CALL cuCtxDestroy_v2_init(CUcontext arg0) { + load_library(); + return cuCtxDestroy_v2(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxDestroy_v2_error(CUcontext) { + return on_init_failed(17); +} +static CUresult _WRAPLIB_API_CALL cuCtxPushCurrent_v2_init(CUcontext arg0) { + load_library(); + return cuCtxPushCurrent_v2(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxPushCurrent_v2_error(CUcontext) { + return on_init_failed(18); +} +static CUresult _WRAPLIB_API_CALL cuCtxPopCurrent_v2_init(CUcontext *arg0) { + load_library(); + return cuCtxPopCurrent_v2(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxPopCurrent_v2_error(CUcontext *) { + return on_init_failed(19); +} +static CUresult _WRAPLIB_API_CALL cuCtxSetCurrent_init(CUcontext arg0) { + load_library(); + return cuCtxSetCurrent(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxSetCurrent_error(CUcontext) { + return on_init_failed(20); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetCurrent_init(CUcontext *arg0) { + load_library(); + return cuCtxGetCurrent(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetCurrent_error(CUcontext *) { + return on_init_failed(21); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetDevice_init(CUdevice *arg0) { + load_library(); + return cuCtxGetDevice(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetDevice_error(CUdevice *) { + return on_init_failed(22); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetFlags_init(unsigned int *arg0) { + load_library(); + return cuCtxGetFlags(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetFlags_error(unsigned int *) { + return on_init_failed(23); +} +static CUresult _WRAPLIB_API_CALL cuCtxSynchronize_init() { + load_library(); + return cuCtxSynchronize(); +} +static CUresult _WRAPLIB_API_CALL cuCtxSynchronize_error() { + return on_init_failed(24); +} +static CUresult _WRAPLIB_API_CALL cuCtxSetLimit_init(CUlimit arg0, size_t arg1) { + load_library(); + return cuCtxSetLimit(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuCtxSetLimit_error(CUlimit, size_t) { + return on_init_failed(25); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetLimit_init(size_t *arg0, CUlimit arg1) { + load_library(); + return cuCtxGetLimit(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetLimit_error(size_t *, CUlimit) { + return on_init_failed(26); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetCacheConfig_init(CUfunc_cache *arg0) { + 
load_library(); + return cuCtxGetCacheConfig(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetCacheConfig_error(CUfunc_cache *) { + return on_init_failed(27); +} +static CUresult _WRAPLIB_API_CALL cuCtxSetCacheConfig_init(CUfunc_cache arg0) { + load_library(); + return cuCtxSetCacheConfig(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxSetCacheConfig_error(CUfunc_cache) { + return on_init_failed(28); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetSharedMemConfig_init(CUsharedconfig *arg0) { + load_library(); + return cuCtxGetSharedMemConfig(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetSharedMemConfig_error(CUsharedconfig *) { + return on_init_failed(29); +} +static CUresult _WRAPLIB_API_CALL cuCtxSetSharedMemConfig_init(CUsharedconfig arg0) { + load_library(); + return cuCtxSetSharedMemConfig(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxSetSharedMemConfig_error(CUsharedconfig) { + return on_init_failed(30); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetApiVersion_init(CUcontext arg0, unsigned int *arg1) { + load_library(); + return cuCtxGetApiVersion(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetApiVersion_error(CUcontext, unsigned int *) { + return on_init_failed(31); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetStreamPriorityRange_init(int *arg0, int *arg1) { + load_library(); + return cuCtxGetStreamPriorityRange(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuCtxGetStreamPriorityRange_error(int *, int *) { + return on_init_failed(32); +} +static CUresult _WRAPLIB_API_CALL cuCtxAttach_init(CUcontext *arg0, unsigned int arg1) { + load_library(); + return cuCtxAttach(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuCtxAttach_error(CUcontext *, unsigned int) { + return on_init_failed(33); +} +static CUresult _WRAPLIB_API_CALL cuCtxDetach_init(CUcontext arg0) { + load_library(); + return cuCtxDetach(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxDetach_error(CUcontext) { + return on_init_failed(34); +} +static CUresult _WRAPLIB_API_CALL cuModuleLoad_init(CUmodule *arg0, const char *arg1) { + load_library(); + return cuModuleLoad(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuModuleLoad_error(CUmodule *, const char *) { + return on_init_failed(35); +} +static CUresult _WRAPLIB_API_CALL cuModuleLoadData_init(CUmodule *arg0, const void *arg1) { + load_library(); + return cuModuleLoadData(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuModuleLoadData_error(CUmodule *, const void *) { + return on_init_failed(36); +} +static CUresult _WRAPLIB_API_CALL cuModuleLoadDataEx_init(CUmodule *arg0, const void *arg1, unsigned int arg2, CUjit_option *arg3, void **arg4) { + load_library(); + return cuModuleLoadDataEx(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuModuleLoadDataEx_error(CUmodule *, const void *, unsigned int, CUjit_option *, void **) { + return on_init_failed(37); +} +static CUresult _WRAPLIB_API_CALL cuModuleLoadFatBinary_init(CUmodule *arg0, const void *arg1) { + load_library(); + return cuModuleLoadFatBinary(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuModuleLoadFatBinary_error(CUmodule *, const void *) { + return on_init_failed(38); +} +static CUresult _WRAPLIB_API_CALL cuModuleUnload_init(CUmodule arg0) { + load_library(); + return cuModuleUnload(arg0); +} +static CUresult _WRAPLIB_API_CALL cuModuleUnload_error(CUmodule) { + return on_init_failed(39); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetFunction_init(CUfunction *arg0, CUmodule arg1, const char *arg2) { + load_library(); + return 
cuModuleGetFunction(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetFunction_error(CUfunction *, CUmodule, const char *) { + return on_init_failed(40); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetGlobal_v2_init(CUdeviceptr *arg0, size_t *arg1, CUmodule arg2, const char *arg3) { + load_library(); + return cuModuleGetGlobal_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetGlobal_v2_error(CUdeviceptr *, size_t *, CUmodule, const char *) { + return on_init_failed(41); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetTexRef_init(CUtexref *arg0, CUmodule arg1, const char *arg2) { + load_library(); + return cuModuleGetTexRef(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetTexRef_error(CUtexref *, CUmodule, const char *) { + return on_init_failed(42); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetSurfRef_init(CUsurfref *arg0, CUmodule arg1, const char *arg2) { + load_library(); + return cuModuleGetSurfRef(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetSurfRef_error(CUsurfref *, CUmodule, const char *) { + return on_init_failed(43); +} +static CUresult _WRAPLIB_API_CALL cuLinkCreate_v2_init(unsigned int arg0, CUjit_option *arg1, void **arg2, CUlinkState *arg3) { + load_library(); + return cuLinkCreate_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuLinkCreate_v2_error(unsigned int, CUjit_option *, void **, CUlinkState *) { + return on_init_failed(44); +} +static CUresult _WRAPLIB_API_CALL cuLinkAddData_v2_init(CUlinkState arg0, CUjitInputType arg1, void *arg2, size_t arg3, const char *arg4, unsigned int arg5, CUjit_option *arg6, void **arg7) { + load_library(); + return cuLinkAddData_v2(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7); +} +static CUresult _WRAPLIB_API_CALL cuLinkAddData_v2_error(CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned int, CUjit_option *, void **) { + return on_init_failed(45); +} +static CUresult _WRAPLIB_API_CALL cuLinkAddFile_v2_init(CUlinkState arg0, CUjitInputType arg1, const char *arg2, unsigned int arg3, CUjit_option *arg4, void **arg5) { + load_library(); + return cuLinkAddFile_v2(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuLinkAddFile_v2_error(CUlinkState, CUjitInputType, const char *, unsigned int, CUjit_option *, void **) { + return on_init_failed(46); +} +static CUresult _WRAPLIB_API_CALL cuLinkComplete_init(CUlinkState arg0, void **arg1, size_t *arg2) { + load_library(); + return cuLinkComplete(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuLinkComplete_error(CUlinkState, void **, size_t *) { + return on_init_failed(47); +} +static CUresult _WRAPLIB_API_CALL cuLinkDestroy_init(CUlinkState arg0) { + load_library(); + return cuLinkDestroy(arg0); +} +static CUresult _WRAPLIB_API_CALL cuLinkDestroy_error(CUlinkState) { + return on_init_failed(48); +} +static CUresult _WRAPLIB_API_CALL cuMemGetInfo_v2_init(size_t *arg0, size_t *arg1) { + load_library(); + return cuMemGetInfo_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemGetInfo_v2_error(size_t *, size_t *) { + return on_init_failed(49); +} +static CUresult _WRAPLIB_API_CALL cuMemAlloc_v2_init(CUdeviceptr *arg0, size_t arg1) { + load_library(); + return cuMemAlloc_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemAlloc_v2_error(CUdeviceptr *, size_t) { + return on_init_failed(50); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocPitch_v2_init(CUdeviceptr *arg0, size_t *arg1, size_t arg2, size_t arg3, unsigned int arg4) 
{ + load_library(); + return cuMemAllocPitch_v2(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocPitch_v2_error(CUdeviceptr *, size_t *, size_t, size_t, unsigned int) { + return on_init_failed(51); +} +static CUresult _WRAPLIB_API_CALL cuMemFree_v2_init(CUdeviceptr arg0) { + load_library(); + return cuMemFree_v2(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemFree_v2_error(CUdeviceptr) { + return on_init_failed(52); +} +static CUresult _WRAPLIB_API_CALL cuMemGetAddressRange_v2_init(CUdeviceptr *arg0, size_t *arg1, CUdeviceptr arg2) { + load_library(); + return cuMemGetAddressRange_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemGetAddressRange_v2_error(CUdeviceptr *, size_t *, CUdeviceptr) { + return on_init_failed(53); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocHost_v2_init(void **arg0, size_t arg1) { + load_library(); + return cuMemAllocHost_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocHost_v2_error(void **, size_t) { + return on_init_failed(54); +} +static CUresult _WRAPLIB_API_CALL cuMemFreeHost_init(void *arg0) { + load_library(); + return cuMemFreeHost(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemFreeHost_error(void *) { + return on_init_failed(55); +} +static CUresult _WRAPLIB_API_CALL cuMemHostAlloc_init(void **arg0, size_t arg1, unsigned int arg2) { + load_library(); + return cuMemHostAlloc(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemHostAlloc_error(void **, size_t, unsigned int) { + return on_init_failed(56); +} +static CUresult _WRAPLIB_API_CALL cuMemHostGetDevicePointer_v2_init(CUdeviceptr *arg0, void *arg1, unsigned int arg2) { + load_library(); + return cuMemHostGetDevicePointer_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemHostGetDevicePointer_v2_error(CUdeviceptr *, void *, unsigned int) { + return on_init_failed(57); +} +static CUresult _WRAPLIB_API_CALL cuMemHostGetFlags_init(unsigned int *arg0, void *arg1) { + load_library(); + return cuMemHostGetFlags(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemHostGetFlags_error(unsigned int *, void *) { + return on_init_failed(58); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocManaged_init(CUdeviceptr *arg0, size_t arg1, unsigned int arg2) { + load_library(); + return cuMemAllocManaged(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocManaged_error(CUdeviceptr *, size_t, unsigned int) { + return on_init_failed(59); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetByPCIBusId_init(CUdevice *arg0, const char *arg1) { + load_library(); + return cuDeviceGetByPCIBusId(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetByPCIBusId_error(CUdevice *, const char *) { + return on_init_failed(60); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetPCIBusId_init(char *arg0, int arg1, CUdevice arg2) { + load_library(); + return cuDeviceGetPCIBusId(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetPCIBusId_error(char *, int, CUdevice) { + return on_init_failed(61); +} +static CUresult _WRAPLIB_API_CALL cuIpcGetEventHandle_init(CUipcEventHandle *arg0, CUevent arg1) { + load_library(); + return cuIpcGetEventHandle(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuIpcGetEventHandle_error(CUipcEventHandle *, CUevent) { + return on_init_failed(62); +} +static CUresult _WRAPLIB_API_CALL cuIpcOpenEventHandle_init(CUevent *arg0, CUipcEventHandle arg1) { + load_library(); + return cuIpcOpenEventHandle(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuIpcOpenEventHandle_error(CUevent 
*, CUipcEventHandle) { + return on_init_failed(63); +} +static CUresult _WRAPLIB_API_CALL cuIpcGetMemHandle_init(CUipcMemHandle *arg0, CUdeviceptr arg1) { + load_library(); + return cuIpcGetMemHandle(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuIpcGetMemHandle_error(CUipcMemHandle *, CUdeviceptr) { + return on_init_failed(64); +} +static CUresult _WRAPLIB_API_CALL cuIpcOpenMemHandle_init(CUdeviceptr *arg0, CUipcMemHandle arg1, unsigned int arg2) { + load_library(); + return cuIpcOpenMemHandle(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuIpcOpenMemHandle_error(CUdeviceptr *, CUipcMemHandle, unsigned int) { + return on_init_failed(65); +} +static CUresult _WRAPLIB_API_CALL cuIpcCloseMemHandle_init(CUdeviceptr arg0) { + load_library(); + return cuIpcCloseMemHandle(arg0); +} +static CUresult _WRAPLIB_API_CALL cuIpcCloseMemHandle_error(CUdeviceptr) { + return on_init_failed(66); +} +static CUresult _WRAPLIB_API_CALL cuMemHostRegister_v2_init(void *arg0, size_t arg1, unsigned int arg2) { + load_library(); + return cuMemHostRegister_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemHostRegister_v2_error(void *, size_t, unsigned int) { + return on_init_failed(67); +} +static CUresult _WRAPLIB_API_CALL cuMemHostUnregister_init(void *arg0) { + load_library(); + return cuMemHostUnregister(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemHostUnregister_error(void *) { + return on_init_failed(68); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy_ptds_init(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2) { + load_library(); + return cuMemcpy_ptds(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy_ptds_error(CUdeviceptr, CUdeviceptr, size_t) { + return on_init_failed(69); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyPeer_ptds_init(CUdeviceptr arg0, CUcontext arg1, CUdeviceptr arg2, CUcontext arg3, size_t arg4) { + load_library(); + return cuMemcpyPeer_ptds(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyPeer_ptds_error(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t) { + return on_init_failed(70); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoD_v2_ptds_init(CUdeviceptr arg0, const void *arg1, size_t arg2) { + load_library(); + return cuMemcpyHtoD_v2_ptds(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoD_v2_ptds_error(CUdeviceptr, const void *, size_t) { + return on_init_failed(71); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoH_v2_ptds_init(void *arg0, CUdeviceptr arg1, size_t arg2) { + load_library(); + return cuMemcpyDtoH_v2_ptds(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoH_v2_ptds_error(void *, CUdeviceptr, size_t) { + return on_init_failed(72); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoD_v2_ptds_init(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2) { + load_library(); + return cuMemcpyDtoD_v2_ptds(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoD_v2_ptds_error(CUdeviceptr, CUdeviceptr, size_t) { + return on_init_failed(73); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoA_v2_ptds_init(CUarray arg0, size_t arg1, CUdeviceptr arg2, size_t arg3) { + load_library(); + return cuMemcpyDtoA_v2_ptds(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoA_v2_ptds_error(CUarray, size_t, CUdeviceptr, size_t) { + return on_init_failed(74); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoD_v2_ptds_init(CUdeviceptr arg0, CUarray arg1, size_t arg2, size_t arg3) { + load_library(); + return cuMemcpyAtoD_v2_ptds(arg0, arg1, arg2, 
arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoD_v2_ptds_error(CUdeviceptr, CUarray, size_t, size_t) { + return on_init_failed(75); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoA_v2_ptds_init(CUarray arg0, size_t arg1, const void *arg2, size_t arg3) { + load_library(); + return cuMemcpyHtoA_v2_ptds(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoA_v2_ptds_error(CUarray, size_t, const void *, size_t) { + return on_init_failed(76); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoH_v2_ptds_init(void *arg0, CUarray arg1, size_t arg2, size_t arg3) { + load_library(); + return cuMemcpyAtoH_v2_ptds(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoH_v2_ptds_error(void *, CUarray, size_t, size_t) { + return on_init_failed(77); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoA_v2_ptds_init(CUarray arg0, size_t arg1, CUarray arg2, size_t arg3, size_t arg4) { + load_library(); + return cuMemcpyAtoA_v2_ptds(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoA_v2_ptds_error(CUarray, size_t, CUarray, size_t, size_t) { + return on_init_failed(78); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2D_v2_ptds_init(const CUDA_MEMCPY2D *arg0) { + load_library(); + return cuMemcpy2D_v2_ptds(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2D_v2_ptds_error(const CUDA_MEMCPY2D *) { + return on_init_failed(79); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DUnaligned_v2_ptds_init(const CUDA_MEMCPY2D *arg0) { + load_library(); + return cuMemcpy2DUnaligned_v2_ptds(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DUnaligned_v2_ptds_error(const CUDA_MEMCPY2D *) { + return on_init_failed(80); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3D_v2_ptds_init(const CUDA_MEMCPY3D *arg0) { + load_library(); + return cuMemcpy3D_v2_ptds(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3D_v2_ptds_error(const CUDA_MEMCPY3D *) { + return on_init_failed(81); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DPeer_ptds_init(const CUDA_MEMCPY3D_PEER *arg0) { + load_library(); + return cuMemcpy3DPeer_ptds(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DPeer_ptds_error(const CUDA_MEMCPY3D_PEER *) { + return on_init_failed(82); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAsync_ptsz_init(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemcpyAsync_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAsync_ptsz_error(CUdeviceptr, CUdeviceptr, size_t, CUstream) { + return on_init_failed(83); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyPeerAsync_ptsz_init(CUdeviceptr arg0, CUcontext arg1, CUdeviceptr arg2, CUcontext arg3, size_t arg4, CUstream arg5) { + load_library(); + return cuMemcpyPeerAsync_ptsz(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyPeerAsync_ptsz_error(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream) { + return on_init_failed(84); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoDAsync_v2_ptsz_init(CUdeviceptr arg0, const void *arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemcpyHtoDAsync_v2_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoDAsync_v2_ptsz_error(CUdeviceptr, const void *, size_t, CUstream) { + return on_init_failed(85); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoHAsync_v2_ptsz_init(void *arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemcpyDtoHAsync_v2_ptsz(arg0, arg1, arg2, arg3); +} 
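// ---------------------------------------------------------------------------
// Editor's sketch (not part of this patch): libcuda-wrap.h is generated by
// wraplib.py and only *declares* the hooks it needs -- get_library_handle(),
// resolve_library_func() and on_init_failed<T>() under "functions to be
// implemented" at the top of the header. Their real definitions live in the
// cuda-stub source that includes this header and are not shown in this hunk.
// The snippet below is a minimal illustration of that contract only; the
// driver name "libcuda.so.1" and the CUDA_ERROR_NOT_INITIALIZED fallback are
// assumptions, not the shipped implementation.
#include <cuda.h>
#include <dlfcn.h>
#include <cstdio>

static void* get_library_handle() {
    // Open the real CUDA driver once; nullptr if it is not installed.
    static void* handle = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_GLOBAL);
    return handle;
}

static void* resolve_library_func(void* handle, const char* name) {
    if (!handle)
        return nullptr;  // driver missing: the *_error thunks stay in place
    void* fn = dlsym(handle, name);
    if (!fn)
        std::fprintf(stderr, "cuda-stub: symbol %s not found\n", name);
    return fn;
}

namespace {
template <typename T>
T on_init_failed(int func_idx) {
    // Reached from a *_error thunk when load_library() could not bind the
    // real function; func_idx identifies which wrapper was hit.
    std::fprintf(stderr, "cuda-stub: driver call #%d unavailable\n", func_idx);
    return T(CUDA_ERROR_NOT_INITIALIZED);
}
}  // namespace
// ---------------------------------------------------------------------------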
+static CUresult _WRAPLIB_API_CALL cuMemcpyDtoHAsync_v2_ptsz_error(void *, CUdeviceptr, size_t, CUstream) { + return on_init_failed(86); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoDAsync_v2_ptsz_init(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemcpyDtoDAsync_v2_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoDAsync_v2_ptsz_error(CUdeviceptr, CUdeviceptr, size_t, CUstream) { + return on_init_failed(87); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoAAsync_v2_ptsz_init(CUarray arg0, size_t arg1, const void *arg2, size_t arg3, CUstream arg4) { + load_library(); + return cuMemcpyHtoAAsync_v2_ptsz(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoAAsync_v2_ptsz_error(CUarray, size_t, const void *, size_t, CUstream) { + return on_init_failed(88); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoHAsync_v2_ptsz_init(void *arg0, CUarray arg1, size_t arg2, size_t arg3, CUstream arg4) { + load_library(); + return cuMemcpyAtoHAsync_v2_ptsz(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoHAsync_v2_ptsz_error(void *, CUarray, size_t, size_t, CUstream) { + return on_init_failed(89); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DAsync_v2_ptsz_init(const CUDA_MEMCPY2D *arg0, CUstream arg1) { + load_library(); + return cuMemcpy2DAsync_v2_ptsz(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DAsync_v2_ptsz_error(const CUDA_MEMCPY2D *, CUstream) { + return on_init_failed(90); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DAsync_v2_ptsz_init(const CUDA_MEMCPY3D *arg0, CUstream arg1) { + load_library(); + return cuMemcpy3DAsync_v2_ptsz(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DAsync_v2_ptsz_error(const CUDA_MEMCPY3D *, CUstream) { + return on_init_failed(91); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DPeerAsync_ptsz_init(const CUDA_MEMCPY3D_PEER *arg0, CUstream arg1) { + load_library(); + return cuMemcpy3DPeerAsync_ptsz(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DPeerAsync_ptsz_error(const CUDA_MEMCPY3D_PEER *, CUstream) { + return on_init_failed(92); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD8_v2_ptds_init(CUdeviceptr arg0, unsigned char arg1, size_t arg2) { + load_library(); + return cuMemsetD8_v2_ptds(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD8_v2_ptds_error(CUdeviceptr, unsigned char, size_t) { + return on_init_failed(93); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16_v2_ptds_init(CUdeviceptr arg0, unsigned short arg1, size_t arg2) { + load_library(); + return cuMemsetD16_v2_ptds(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16_v2_ptds_error(CUdeviceptr, unsigned short, size_t) { + return on_init_failed(94); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32_v2_ptds_init(CUdeviceptr arg0, unsigned int arg1, size_t arg2) { + load_library(); + return cuMemsetD32_v2_ptds(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32_v2_ptds_error(CUdeviceptr, unsigned int, size_t) { + return on_init_failed(95); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8_v2_ptds_init(CUdeviceptr arg0, size_t arg1, unsigned char arg2, size_t arg3, size_t arg4) { + load_library(); + return cuMemsetD2D8_v2_ptds(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8_v2_ptds_error(CUdeviceptr, size_t, unsigned char, size_t, size_t) { + return on_init_failed(96); +} +static CUresult _WRAPLIB_API_CALL 
cuMemsetD2D16_v2_ptds_init(CUdeviceptr arg0, size_t arg1, unsigned short arg2, size_t arg3, size_t arg4) { + load_library(); + return cuMemsetD2D16_v2_ptds(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D16_v2_ptds_error(CUdeviceptr, size_t, unsigned short, size_t, size_t) { + return on_init_failed(97); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32_v2_ptds_init(CUdeviceptr arg0, size_t arg1, unsigned int arg2, size_t arg3, size_t arg4) { + load_library(); + return cuMemsetD2D32_v2_ptds(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32_v2_ptds_error(CUdeviceptr, size_t, unsigned int, size_t, size_t) { + return on_init_failed(98); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD8Async_ptsz_init(CUdeviceptr arg0, unsigned char arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemsetD8Async_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD8Async_ptsz_error(CUdeviceptr, unsigned char, size_t, CUstream) { + return on_init_failed(99); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16Async_ptsz_init(CUdeviceptr arg0, unsigned short arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemsetD16Async_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16Async_ptsz_error(CUdeviceptr, unsigned short, size_t, CUstream) { + return on_init_failed(100); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32Async_ptsz_init(CUdeviceptr arg0, unsigned int arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemsetD32Async_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32Async_ptsz_error(CUdeviceptr, unsigned int, size_t, CUstream) { + return on_init_failed(101); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8Async_ptsz_init(CUdeviceptr arg0, size_t arg1, unsigned char arg2, size_t arg3, size_t arg4, CUstream arg5) { + load_library(); + return cuMemsetD2D8Async_ptsz(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8Async_ptsz_error(CUdeviceptr, size_t, unsigned char, size_t, size_t, CUstream) { + return on_init_failed(102); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D16Async_ptsz_init(CUdeviceptr arg0, size_t arg1, unsigned short arg2, size_t arg3, size_t arg4, CUstream arg5) { + load_library(); + return cuMemsetD2D16Async_ptsz(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D16Async_ptsz_error(CUdeviceptr, size_t, unsigned short, size_t, size_t, CUstream) { + return on_init_failed(103); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32Async_ptsz_init(CUdeviceptr arg0, size_t arg1, unsigned int arg2, size_t arg3, size_t arg4, CUstream arg5) { + load_library(); + return cuMemsetD2D32Async_ptsz(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32Async_ptsz_error(CUdeviceptr, size_t, unsigned int, size_t, size_t, CUstream) { + return on_init_failed(104); +} +static CUresult _WRAPLIB_API_CALL cuArrayCreate_v2_init(CUarray *arg0, const CUDA_ARRAY_DESCRIPTOR *arg1) { + load_library(); + return cuArrayCreate_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuArrayCreate_v2_error(CUarray *, const CUDA_ARRAY_DESCRIPTOR *) { + return on_init_failed(105); +} +static CUresult _WRAPLIB_API_CALL cuArrayGetDescriptor_v2_init(CUDA_ARRAY_DESCRIPTOR *arg0, CUarray arg1) { + load_library(); + return cuArrayGetDescriptor_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL 
cuArrayGetDescriptor_v2_error(CUDA_ARRAY_DESCRIPTOR *, CUarray) { + return on_init_failed(106); +} +static CUresult _WRAPLIB_API_CALL cuArrayDestroy_init(CUarray arg0) { + load_library(); + return cuArrayDestroy(arg0); +} +static CUresult _WRAPLIB_API_CALL cuArrayDestroy_error(CUarray) { + return on_init_failed(107); +} +static CUresult _WRAPLIB_API_CALL cuArray3DCreate_v2_init(CUarray *arg0, const CUDA_ARRAY3D_DESCRIPTOR *arg1) { + load_library(); + return cuArray3DCreate_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuArray3DCreate_v2_error(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *) { + return on_init_failed(108); +} +static CUresult _WRAPLIB_API_CALL cuArray3DGetDescriptor_v2_init(CUDA_ARRAY3D_DESCRIPTOR *arg0, CUarray arg1) { + load_library(); + return cuArray3DGetDescriptor_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuArray3DGetDescriptor_v2_error(CUDA_ARRAY3D_DESCRIPTOR *, CUarray) { + return on_init_failed(109); +} +static CUresult _WRAPLIB_API_CALL cuMipmappedArrayCreate_init(CUmipmappedArray *arg0, const CUDA_ARRAY3D_DESCRIPTOR *arg1, unsigned int arg2) { + load_library(); + return cuMipmappedArrayCreate(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMipmappedArrayCreate_error(CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int) { + return on_init_failed(110); +} +static CUresult _WRAPLIB_API_CALL cuMipmappedArrayGetLevel_init(CUarray *arg0, CUmipmappedArray arg1, unsigned int arg2) { + load_library(); + return cuMipmappedArrayGetLevel(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMipmappedArrayGetLevel_error(CUarray *, CUmipmappedArray, unsigned int) { + return on_init_failed(111); +} +static CUresult _WRAPLIB_API_CALL cuMipmappedArrayDestroy_init(CUmipmappedArray arg0) { + load_library(); + return cuMipmappedArrayDestroy(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMipmappedArrayDestroy_error(CUmipmappedArray) { + return on_init_failed(112); +} +static CUresult _WRAPLIB_API_CALL cuPointerGetAttribute_init(void *arg0, CUpointer_attribute arg1, CUdeviceptr arg2) { + load_library(); + return cuPointerGetAttribute(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuPointerGetAttribute_error(void *, CUpointer_attribute, CUdeviceptr) { + return on_init_failed(113); +} +static CUresult _WRAPLIB_API_CALL cuMemPrefetchAsync_ptsz_init(CUdeviceptr arg0, size_t arg1, CUdevice arg2, CUstream arg3) { + load_library(); + return cuMemPrefetchAsync_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemPrefetchAsync_ptsz_error(CUdeviceptr, size_t, CUdevice, CUstream) { + return on_init_failed(114); +} +static CUresult _WRAPLIB_API_CALL cuMemAdvise_init(CUdeviceptr arg0, size_t arg1, CUmem_advise arg2, CUdevice arg3) { + load_library(); + return cuMemAdvise(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemAdvise_error(CUdeviceptr, size_t, CUmem_advise, CUdevice) { + return on_init_failed(115); +} +static CUresult _WRAPLIB_API_CALL cuMemRangeGetAttribute_init(void *arg0, size_t arg1, CUmem_range_attribute arg2, CUdeviceptr arg3, size_t arg4) { + load_library(); + return cuMemRangeGetAttribute(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemRangeGetAttribute_error(void *, size_t, CUmem_range_attribute, CUdeviceptr, size_t) { + return on_init_failed(116); +} +static CUresult _WRAPLIB_API_CALL cuMemRangeGetAttributes_init(void **arg0, size_t *arg1, CUmem_range_attribute *arg2, size_t arg3, CUdeviceptr arg4, size_t arg5) { + load_library(); + return 
cuMemRangeGetAttributes(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuMemRangeGetAttributes_error(void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t) { + return on_init_failed(117); +} +static CUresult _WRAPLIB_API_CALL cuPointerSetAttribute_init(const void *arg0, CUpointer_attribute arg1, CUdeviceptr arg2) { + load_library(); + return cuPointerSetAttribute(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuPointerSetAttribute_error(const void *, CUpointer_attribute, CUdeviceptr) { + return on_init_failed(118); +} +static CUresult _WRAPLIB_API_CALL cuPointerGetAttributes_init(unsigned int arg0, CUpointer_attribute *arg1, void **arg2, CUdeviceptr arg3) { + load_library(); + return cuPointerGetAttributes(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuPointerGetAttributes_error(unsigned int, CUpointer_attribute *, void **, CUdeviceptr) { + return on_init_failed(119); +} +static CUresult _WRAPLIB_API_CALL cuStreamCreate_init(CUstream *arg0, unsigned int arg1) { + load_library(); + return cuStreamCreate(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuStreamCreate_error(CUstream *, unsigned int) { + return on_init_failed(120); +} +static CUresult _WRAPLIB_API_CALL cuStreamCreateWithPriority_init(CUstream *arg0, unsigned int arg1, int arg2) { + load_library(); + return cuStreamCreateWithPriority(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuStreamCreateWithPriority_error(CUstream *, unsigned int, int) { + return on_init_failed(121); +} +static CUresult _WRAPLIB_API_CALL cuStreamGetPriority_ptsz_init(CUstream arg0, int *arg1) { + load_library(); + return cuStreamGetPriority_ptsz(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuStreamGetPriority_ptsz_error(CUstream, int *) { + return on_init_failed(122); +} +static CUresult _WRAPLIB_API_CALL cuStreamGetFlags_ptsz_init(CUstream arg0, unsigned int *arg1) { + load_library(); + return cuStreamGetFlags_ptsz(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuStreamGetFlags_ptsz_error(CUstream, unsigned int *) { + return on_init_failed(123); +} +static CUresult _WRAPLIB_API_CALL cuStreamWaitEvent_ptsz_init(CUstream arg0, CUevent arg1, unsigned int arg2) { + load_library(); + return cuStreamWaitEvent_ptsz(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuStreamWaitEvent_ptsz_error(CUstream, CUevent, unsigned int) { + return on_init_failed(124); +} +static CUresult _WRAPLIB_API_CALL cuStreamAddCallback_ptsz_init(CUstream arg0, CUstreamCallback arg1, void *arg2, unsigned int arg3) { + load_library(); + return cuStreamAddCallback_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamAddCallback_ptsz_error(CUstream, CUstreamCallback, void *, unsigned int) { + return on_init_failed(125); +} +static CUresult _WRAPLIB_API_CALL cuStreamAttachMemAsync_ptsz_init(CUstream arg0, CUdeviceptr arg1, size_t arg2, unsigned int arg3) { + load_library(); + return cuStreamAttachMemAsync_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamAttachMemAsync_ptsz_error(CUstream, CUdeviceptr, size_t, unsigned int) { + return on_init_failed(126); +} +static CUresult _WRAPLIB_API_CALL cuStreamQuery_ptsz_init(CUstream arg0) { + load_library(); + return cuStreamQuery_ptsz(arg0); +} +static CUresult _WRAPLIB_API_CALL cuStreamQuery_ptsz_error(CUstream) { + return on_init_failed(127); +} +static CUresult _WRAPLIB_API_CALL cuStreamSynchronize_ptsz_init(CUstream arg0) { + load_library(); + return cuStreamSynchronize_ptsz(arg0); +} 
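// ---------------------------------------------------------------------------
// Editor's note (not part of this patch): from the caller's side, linking
// against this cuda-stub instead of the real libcuda means the first driver
// call goes through a *_init thunk, which runs load_library(); if no driver
// can be loaded, later calls hit the *_error thunks and return a CUresult
// error rather than failing at process start-up. A hypothetical check,
// assuming the stub replaces libcuda at link time:
#include <cuda.h>
#include <cstdio>

int main() {
    CUresult rc = cuInit(0);       // dispatched through cuInit_init on first use
    if (rc != CUDA_SUCCESS) {
        const char* name = nullptr;
        cuGetErrorName(rc, &name); // also routed through the stub
        std::printf("CUDA driver unavailable: %s\n", name ? name : "unknown");
        return 0;                  // e.g. fall back to a CPU-only code path
    }
    std::printf("CUDA driver loaded via cuda-stub\n");
    return 0;
}
// ---------------------------------------------------------------------------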
+static CUresult _WRAPLIB_API_CALL cuStreamSynchronize_ptsz_error(CUstream) { + return on_init_failed(128); +} +static CUresult _WRAPLIB_API_CALL cuStreamDestroy_v2_init(CUstream arg0) { + load_library(); + return cuStreamDestroy_v2(arg0); +} +static CUresult _WRAPLIB_API_CALL cuStreamDestroy_v2_error(CUstream) { + return on_init_failed(129); +} +static CUresult _WRAPLIB_API_CALL cuEventCreate_init(CUevent *arg0, unsigned int arg1) { + load_library(); + return cuEventCreate(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuEventCreate_error(CUevent *, unsigned int) { + return on_init_failed(130); +} +static CUresult _WRAPLIB_API_CALL cuEventRecord_ptsz_init(CUevent arg0, CUstream arg1) { + load_library(); + return cuEventRecord_ptsz(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuEventRecord_ptsz_error(CUevent, CUstream) { + return on_init_failed(131); +} +static CUresult _WRAPLIB_API_CALL cuEventQuery_init(CUevent arg0) { + load_library(); + return cuEventQuery(arg0); +} +static CUresult _WRAPLIB_API_CALL cuEventQuery_error(CUevent) { + return on_init_failed(132); +} +static CUresult _WRAPLIB_API_CALL cuEventSynchronize_init(CUevent arg0) { + load_library(); + return cuEventSynchronize(arg0); +} +static CUresult _WRAPLIB_API_CALL cuEventSynchronize_error(CUevent) { + return on_init_failed(133); +} +static CUresult _WRAPLIB_API_CALL cuEventDestroy_v2_init(CUevent arg0) { + load_library(); + return cuEventDestroy_v2(arg0); +} +static CUresult _WRAPLIB_API_CALL cuEventDestroy_v2_error(CUevent) { + return on_init_failed(134); +} +static CUresult _WRAPLIB_API_CALL cuEventElapsedTime_init(float *arg0, CUevent arg1, CUevent arg2) { + load_library(); + return cuEventElapsedTime(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuEventElapsedTime_error(float *, CUevent, CUevent) { + return on_init_failed(135); +} +static CUresult _WRAPLIB_API_CALL cuStreamWaitValue32_ptsz_init(CUstream arg0, CUdeviceptr arg1, cuuint32_t arg2, unsigned int arg3) { + load_library(); + return cuStreamWaitValue32_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamWaitValue32_ptsz_error(CUstream, CUdeviceptr, cuuint32_t, unsigned int) { + return on_init_failed(136); +} +static CUresult _WRAPLIB_API_CALL cuStreamWriteValue32_ptsz_init(CUstream arg0, CUdeviceptr arg1, cuuint32_t arg2, unsigned int arg3) { + load_library(); + return cuStreamWriteValue32_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamWriteValue32_ptsz_error(CUstream, CUdeviceptr, cuuint32_t, unsigned int) { + return on_init_failed(137); +} +static CUresult _WRAPLIB_API_CALL cuStreamBatchMemOp_ptsz_init(CUstream arg0, unsigned int arg1, CUstreamBatchMemOpParams *arg2, unsigned int arg3) { + load_library(); + return cuStreamBatchMemOp_ptsz(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamBatchMemOp_ptsz_error(CUstream, unsigned int, CUstreamBatchMemOpParams *, unsigned int) { + return on_init_failed(138); +} +static CUresult _WRAPLIB_API_CALL cuFuncGetAttribute_init(int *arg0, CUfunction_attribute arg1, CUfunction arg2) { + load_library(); + return cuFuncGetAttribute(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuFuncGetAttribute_error(int *, CUfunction_attribute, CUfunction) { + return on_init_failed(139); +} +static CUresult _WRAPLIB_API_CALL cuFuncSetCacheConfig_init(CUfunction arg0, CUfunc_cache arg1) { + load_library(); + return cuFuncSetCacheConfig(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuFuncSetCacheConfig_error(CUfunction, CUfunc_cache) 
{ + return on_init_failed(140); +} +static CUresult _WRAPLIB_API_CALL cuFuncSetSharedMemConfig_init(CUfunction arg0, CUsharedconfig arg1) { + load_library(); + return cuFuncSetSharedMemConfig(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuFuncSetSharedMemConfig_error(CUfunction, CUsharedconfig) { + return on_init_failed(141); +} +static CUresult _WRAPLIB_API_CALL cuLaunchKernel_ptsz_init(CUfunction arg0, unsigned int arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4, unsigned int arg5, unsigned int arg6, unsigned int arg7, CUstream arg8, void **arg9, void **arg10) { + load_library(); + return cuLaunchKernel_ptsz(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10); +} +static CUresult _WRAPLIB_API_CALL cuLaunchKernel_ptsz_error(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **) { + return on_init_failed(142); +} +static CUresult _WRAPLIB_API_CALL cuFuncSetBlockShape_init(CUfunction arg0, int arg1, int arg2, int arg3) { + load_library(); + return cuFuncSetBlockShape(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuFuncSetBlockShape_error(CUfunction, int, int, int) { + return on_init_failed(143); +} +static CUresult _WRAPLIB_API_CALL cuFuncSetSharedSize_init(CUfunction arg0, unsigned int arg1) { + load_library(); + return cuFuncSetSharedSize(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuFuncSetSharedSize_error(CUfunction, unsigned int) { + return on_init_failed(144); +} +static CUresult _WRAPLIB_API_CALL cuParamSetSize_init(CUfunction arg0, unsigned int arg1) { + load_library(); + return cuParamSetSize(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuParamSetSize_error(CUfunction, unsigned int) { + return on_init_failed(145); +} +static CUresult _WRAPLIB_API_CALL cuParamSeti_init(CUfunction arg0, int arg1, unsigned int arg2) { + load_library(); + return cuParamSeti(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuParamSeti_error(CUfunction, int, unsigned int) { + return on_init_failed(146); +} +static CUresult _WRAPLIB_API_CALL cuParamSetf_init(CUfunction arg0, int arg1, float arg2) { + load_library(); + return cuParamSetf(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuParamSetf_error(CUfunction, int, float) { + return on_init_failed(147); +} +static CUresult _WRAPLIB_API_CALL cuParamSetv_init(CUfunction arg0, int arg1, void *arg2, unsigned int arg3) { + load_library(); + return cuParamSetv(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuParamSetv_error(CUfunction, int, void *, unsigned int) { + return on_init_failed(148); +} +static CUresult _WRAPLIB_API_CALL cuLaunch_init(CUfunction arg0) { + load_library(); + return cuLaunch(arg0); +} +static CUresult _WRAPLIB_API_CALL cuLaunch_error(CUfunction) { + return on_init_failed(149); +} +static CUresult _WRAPLIB_API_CALL cuLaunchGrid_init(CUfunction arg0, int arg1, int arg2) { + load_library(); + return cuLaunchGrid(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuLaunchGrid_error(CUfunction, int, int) { + return on_init_failed(150); +} +static CUresult _WRAPLIB_API_CALL cuLaunchGridAsync_init(CUfunction arg0, int arg1, int arg2, CUstream arg3) { + load_library(); + return cuLaunchGridAsync(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuLaunchGridAsync_error(CUfunction, int, int, CUstream) { + return on_init_failed(151); +} +static CUresult _WRAPLIB_API_CALL cuParamSetTexRef_init(CUfunction arg0, int arg1, CUtexref arg2) { + 
load_library(); + return cuParamSetTexRef(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuParamSetTexRef_error(CUfunction, int, CUtexref) { + return on_init_failed(152); +} +static CUresult _WRAPLIB_API_CALL cuOccupancyMaxActiveBlocksPerMultiprocessor_init(int *arg0, CUfunction arg1, int arg2, size_t arg3) { + load_library(); + return cuOccupancyMaxActiveBlocksPerMultiprocessor(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuOccupancyMaxActiveBlocksPerMultiprocessor_error(int *, CUfunction, int, size_t) { + return on_init_failed(153); +} +static CUresult _WRAPLIB_API_CALL cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_init(int *arg0, CUfunction arg1, int arg2, size_t arg3, unsigned int arg4) { + load_library(); + return cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_error(int *, CUfunction, int, size_t, unsigned int) { + return on_init_failed(154); +} +static CUresult _WRAPLIB_API_CALL cuOccupancyMaxPotentialBlockSize_init(int *arg0, int *arg1, CUfunction arg2, CUoccupancyB2DSize arg3, size_t arg4, int arg5) { + load_library(); + return cuOccupancyMaxPotentialBlockSize(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuOccupancyMaxPotentialBlockSize_error(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int) { + return on_init_failed(155); +} +static CUresult _WRAPLIB_API_CALL cuOccupancyMaxPotentialBlockSizeWithFlags_init(int *arg0, int *arg1, CUfunction arg2, CUoccupancyB2DSize arg3, size_t arg4, int arg5, unsigned int arg6) { + load_library(); + return cuOccupancyMaxPotentialBlockSizeWithFlags(arg0, arg1, arg2, arg3, arg4, arg5, arg6); +} +static CUresult _WRAPLIB_API_CALL cuOccupancyMaxPotentialBlockSizeWithFlags_error(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int) { + return on_init_failed(156); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetArray_init(CUtexref arg0, CUarray arg1, unsigned int arg2) { + load_library(); + return cuTexRefSetArray(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetArray_error(CUtexref, CUarray, unsigned int) { + return on_init_failed(157); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMipmappedArray_init(CUtexref arg0, CUmipmappedArray arg1, unsigned int arg2) { + load_library(); + return cuTexRefSetMipmappedArray(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMipmappedArray_error(CUtexref, CUmipmappedArray, unsigned int) { + return on_init_failed(158); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress_v2_init(size_t *arg0, CUtexref arg1, CUdeviceptr arg2, size_t arg3) { + load_library(); + return cuTexRefSetAddress_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress_v2_error(size_t *, CUtexref, CUdeviceptr, size_t) { + return on_init_failed(159); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress2D_v3_init(CUtexref arg0, const CUDA_ARRAY_DESCRIPTOR *arg1, CUdeviceptr arg2, size_t arg3) { + load_library(); + return cuTexRefSetAddress2D_v3(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress2D_v3_error(CUtexref, const CUDA_ARRAY_DESCRIPTOR *, CUdeviceptr, size_t) { + return on_init_failed(160); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetFormat_init(CUtexref arg0, CUarray_format arg1, int arg2) { + load_library(); + return cuTexRefSetFormat(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL 
cuTexRefSetFormat_error(CUtexref, CUarray_format, int) { + return on_init_failed(161); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddressMode_init(CUtexref arg0, int arg1, CUaddress_mode arg2) { + load_library(); + return cuTexRefSetAddressMode(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddressMode_error(CUtexref, int, CUaddress_mode) { + return on_init_failed(162); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetFilterMode_init(CUtexref arg0, CUfilter_mode arg1) { + load_library(); + return cuTexRefSetFilterMode(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetFilterMode_error(CUtexref, CUfilter_mode) { + return on_init_failed(163); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMipmapFilterMode_init(CUtexref arg0, CUfilter_mode arg1) { + load_library(); + return cuTexRefSetMipmapFilterMode(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMipmapFilterMode_error(CUtexref, CUfilter_mode) { + return on_init_failed(164); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMipmapLevelBias_init(CUtexref arg0, float arg1) { + load_library(); + return cuTexRefSetMipmapLevelBias(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMipmapLevelBias_error(CUtexref, float) { + return on_init_failed(165); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMipmapLevelClamp_init(CUtexref arg0, float arg1, float arg2) { + load_library(); + return cuTexRefSetMipmapLevelClamp(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMipmapLevelClamp_error(CUtexref, float, float) { + return on_init_failed(166); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMaxAnisotropy_init(CUtexref arg0, unsigned int arg1) { + load_library(); + return cuTexRefSetMaxAnisotropy(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetMaxAnisotropy_error(CUtexref, unsigned int) { + return on_init_failed(167); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetBorderColor_init(CUtexref arg0, float *arg1) { + load_library(); + return cuTexRefSetBorderColor(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetBorderColor_error(CUtexref, float *) { + return on_init_failed(168); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetFlags_init(CUtexref arg0, unsigned int arg1) { + load_library(); + return cuTexRefSetFlags(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetFlags_error(CUtexref, unsigned int) { + return on_init_failed(169); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetAddress_v2_init(CUdeviceptr *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetAddress_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetAddress_v2_error(CUdeviceptr *, CUtexref) { + return on_init_failed(170); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetArray_init(CUarray *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetArray(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetArray_error(CUarray *, CUtexref) { + return on_init_failed(171); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMipmappedArray_init(CUmipmappedArray *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetMipmappedArray(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMipmappedArray_error(CUmipmappedArray *, CUtexref) { + return on_init_failed(172); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetAddressMode_init(CUaddress_mode *arg0, CUtexref arg1, int arg2) { + load_library(); + return cuTexRefGetAddressMode(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL 
cuTexRefGetAddressMode_error(CUaddress_mode *, CUtexref, int) { + return on_init_failed(173); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetFilterMode_init(CUfilter_mode *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetFilterMode(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetFilterMode_error(CUfilter_mode *, CUtexref) { + return on_init_failed(174); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetFormat_init(CUarray_format *arg0, int *arg1, CUtexref arg2) { + load_library(); + return cuTexRefGetFormat(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetFormat_error(CUarray_format *, int *, CUtexref) { + return on_init_failed(175); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMipmapFilterMode_init(CUfilter_mode *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetMipmapFilterMode(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMipmapFilterMode_error(CUfilter_mode *, CUtexref) { + return on_init_failed(176); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMipmapLevelBias_init(float *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetMipmapLevelBias(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMipmapLevelBias_error(float *, CUtexref) { + return on_init_failed(177); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMipmapLevelClamp_init(float *arg0, float *arg1, CUtexref arg2) { + load_library(); + return cuTexRefGetMipmapLevelClamp(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMipmapLevelClamp_error(float *, float *, CUtexref) { + return on_init_failed(178); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMaxAnisotropy_init(int *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetMaxAnisotropy(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetMaxAnisotropy_error(int *, CUtexref) { + return on_init_failed(179); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetBorderColor_init(float *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetBorderColor(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetBorderColor_error(float *, CUtexref) { + return on_init_failed(180); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetFlags_init(unsigned int *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetFlags(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetFlags_error(unsigned int *, CUtexref) { + return on_init_failed(181); +} +static CUresult _WRAPLIB_API_CALL cuTexRefCreate_init(CUtexref *arg0) { + load_library(); + return cuTexRefCreate(arg0); +} +static CUresult _WRAPLIB_API_CALL cuTexRefCreate_error(CUtexref *) { + return on_init_failed(182); +} +static CUresult _WRAPLIB_API_CALL cuTexRefDestroy_init(CUtexref arg0) { + load_library(); + return cuTexRefDestroy(arg0); +} +static CUresult _WRAPLIB_API_CALL cuTexRefDestroy_error(CUtexref) { + return on_init_failed(183); +} +static CUresult _WRAPLIB_API_CALL cuSurfRefSetArray_init(CUsurfref arg0, CUarray arg1, unsigned int arg2) { + load_library(); + return cuSurfRefSetArray(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuSurfRefSetArray_error(CUsurfref, CUarray, unsigned int) { + return on_init_failed(184); +} +static CUresult _WRAPLIB_API_CALL cuSurfRefGetArray_init(CUarray *arg0, CUsurfref arg1) { + load_library(); + return cuSurfRefGetArray(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuSurfRefGetArray_error(CUarray *, CUsurfref) { + return on_init_failed(185); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectCreate_init(CUtexObject *arg0, const 
CUDA_RESOURCE_DESC *arg1, const CUDA_TEXTURE_DESC *arg2, const CUDA_RESOURCE_VIEW_DESC *arg3) { + load_library(); + return cuTexObjectCreate(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectCreate_error(CUtexObject *, const CUDA_RESOURCE_DESC *, const CUDA_TEXTURE_DESC *, const CUDA_RESOURCE_VIEW_DESC *) { + return on_init_failed(186); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectDestroy_init(CUtexObject arg0) { + load_library(); + return cuTexObjectDestroy(arg0); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectDestroy_error(CUtexObject) { + return on_init_failed(187); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectGetResourceDesc_init(CUDA_RESOURCE_DESC *arg0, CUtexObject arg1) { + load_library(); + return cuTexObjectGetResourceDesc(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectGetResourceDesc_error(CUDA_RESOURCE_DESC *, CUtexObject) { + return on_init_failed(188); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectGetTextureDesc_init(CUDA_TEXTURE_DESC *arg0, CUtexObject arg1) { + load_library(); + return cuTexObjectGetTextureDesc(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectGetTextureDesc_error(CUDA_TEXTURE_DESC *, CUtexObject) { + return on_init_failed(189); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectGetResourceViewDesc_init(CUDA_RESOURCE_VIEW_DESC *arg0, CUtexObject arg1) { + load_library(); + return cuTexObjectGetResourceViewDesc(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexObjectGetResourceViewDesc_error(CUDA_RESOURCE_VIEW_DESC *, CUtexObject) { + return on_init_failed(190); +} +static CUresult _WRAPLIB_API_CALL cuSurfObjectCreate_init(CUsurfObject *arg0, const CUDA_RESOURCE_DESC *arg1) { + load_library(); + return cuSurfObjectCreate(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuSurfObjectCreate_error(CUsurfObject *, const CUDA_RESOURCE_DESC *) { + return on_init_failed(191); +} +static CUresult _WRAPLIB_API_CALL cuSurfObjectDestroy_init(CUsurfObject arg0) { + load_library(); + return cuSurfObjectDestroy(arg0); +} +static CUresult _WRAPLIB_API_CALL cuSurfObjectDestroy_error(CUsurfObject) { + return on_init_failed(192); +} +static CUresult _WRAPLIB_API_CALL cuSurfObjectGetResourceDesc_init(CUDA_RESOURCE_DESC *arg0, CUsurfObject arg1) { + load_library(); + return cuSurfObjectGetResourceDesc(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuSurfObjectGetResourceDesc_error(CUDA_RESOURCE_DESC *, CUsurfObject) { + return on_init_failed(193); +} +static CUresult _WRAPLIB_API_CALL cuDeviceCanAccessPeer_init(int *arg0, CUdevice arg1, CUdevice arg2) { + load_library(); + return cuDeviceCanAccessPeer(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuDeviceCanAccessPeer_error(int *, CUdevice, CUdevice) { + return on_init_failed(194); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetP2PAttribute_init(int *arg0, CUdevice_P2PAttribute arg1, CUdevice arg2, CUdevice arg3) { + load_library(); + return cuDeviceGetP2PAttribute(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuDeviceGetP2PAttribute_error(int *, CUdevice_P2PAttribute, CUdevice, CUdevice) { + return on_init_failed(195); +} +static CUresult _WRAPLIB_API_CALL cuCtxEnablePeerAccess_init(CUcontext arg0, unsigned int arg1) { + load_library(); + return cuCtxEnablePeerAccess(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuCtxEnablePeerAccess_error(CUcontext, unsigned int) { + return on_init_failed(196); +} +static CUresult _WRAPLIB_API_CALL cuCtxDisablePeerAccess_init(CUcontext arg0) { + load_library(); + return 
cuCtxDisablePeerAccess(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxDisablePeerAccess_error(CUcontext) { + return on_init_failed(197); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsUnregisterResource_init(CUgraphicsResource arg0) { + load_library(); + return cuGraphicsUnregisterResource(arg0); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsUnregisterResource_error(CUgraphicsResource) { + return on_init_failed(198); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsSubResourceGetMappedArray_init(CUarray *arg0, CUgraphicsResource arg1, unsigned int arg2, unsigned int arg3) { + load_library(); + return cuGraphicsSubResourceGetMappedArray(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsSubResourceGetMappedArray_error(CUarray *, CUgraphicsResource, unsigned int, unsigned int) { + return on_init_failed(199); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceGetMappedMipmappedArray_init(CUmipmappedArray *arg0, CUgraphicsResource arg1) { + load_library(); + return cuGraphicsResourceGetMappedMipmappedArray(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceGetMappedMipmappedArray_error(CUmipmappedArray *, CUgraphicsResource) { + return on_init_failed(200); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceGetMappedPointer_v2_init(CUdeviceptr *arg0, size_t *arg1, CUgraphicsResource arg2) { + load_library(); + return cuGraphicsResourceGetMappedPointer_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceGetMappedPointer_v2_error(CUdeviceptr *, size_t *, CUgraphicsResource) { + return on_init_failed(201); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceSetMapFlags_v2_init(CUgraphicsResource arg0, unsigned int arg1) { + load_library(); + return cuGraphicsResourceSetMapFlags_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceSetMapFlags_v2_error(CUgraphicsResource, unsigned int) { + return on_init_failed(202); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsMapResources_ptsz_init(unsigned int arg0, CUgraphicsResource *arg1, CUstream arg2) { + load_library(); + return cuGraphicsMapResources_ptsz(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsMapResources_ptsz_error(unsigned int, CUgraphicsResource *, CUstream) { + return on_init_failed(203); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsUnmapResources_ptsz_init(unsigned int arg0, CUgraphicsResource *arg1, CUstream arg2) { + load_library(); + return cuGraphicsUnmapResources_ptsz(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsUnmapResources_ptsz_error(unsigned int, CUgraphicsResource *, CUstream) { + return on_init_failed(204); +} +static CUresult _WRAPLIB_API_CALL cuGetExportTable_init(const void **arg0, const CUuuid *arg1) { + load_library(); + return cuGetExportTable(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuGetExportTable_error(const void **, const CUuuid *) { + return on_init_failed(205); +} +static CUresult _WRAPLIB_API_CALL cuMemHostRegister_init(void *arg0, size_t arg1, unsigned int arg2) { + load_library(); + return cuMemHostRegister(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemHostRegister_error(void *, size_t, unsigned int) { + return on_init_failed(206); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceSetMapFlags_init(CUgraphicsResource arg0, unsigned int arg1) { + load_library(); + return cuGraphicsResourceSetMapFlags(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceSetMapFlags_error(CUgraphicsResource, unsigned int) { + return 
on_init_failed(207); +} +static CUresult _WRAPLIB_API_CALL cuLinkCreate_init(unsigned int arg0, CUjit_option *arg1, void **arg2, CUlinkState *arg3) { + load_library(); + return cuLinkCreate(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuLinkCreate_error(unsigned int, CUjit_option *, void **, CUlinkState *) { + return on_init_failed(208); +} +static CUresult _WRAPLIB_API_CALL cuLinkAddData_init(CUlinkState arg0, CUjitInputType arg1, void *arg2, size_t arg3, const char *arg4, unsigned int arg5, CUjit_option *arg6, void **arg7) { + load_library(); + return cuLinkAddData(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7); +} +static CUresult _WRAPLIB_API_CALL cuLinkAddData_error(CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned int, CUjit_option *, void **) { + return on_init_failed(209); +} +static CUresult _WRAPLIB_API_CALL cuLinkAddFile_init(CUlinkState arg0, CUjitInputType arg1, const char *arg2, unsigned int arg3, CUjit_option *arg4, void **arg5) { + load_library(); + return cuLinkAddFile(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuLinkAddFile_error(CUlinkState, CUjitInputType, const char *, unsigned int, CUjit_option *, void **) { + return on_init_failed(210); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress2D_v2_init(CUtexref arg0, const CUDA_ARRAY_DESCRIPTOR *arg1, CUdeviceptr arg2, size_t arg3) { + load_library(); + return cuTexRefSetAddress2D_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress2D_v2_error(CUtexref, const CUDA_ARRAY_DESCRIPTOR *, CUdeviceptr, size_t) { + return on_init_failed(211); +} +static CUresult _WRAPLIB_API_CALL cuDeviceTotalMem_init(unsigned int *arg0, CUdevice arg1) { + load_library(); + return cuDeviceTotalMem(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuDeviceTotalMem_error(unsigned int *, CUdevice) { + return on_init_failed(212); +} +static CUresult _WRAPLIB_API_CALL cuCtxCreate_init(CUcontext *arg0, unsigned int arg1, CUdevice arg2) { + load_library(); + return cuCtxCreate(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuCtxCreate_error(CUcontext *, unsigned int, CUdevice) { + return on_init_failed(213); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetGlobal_init(CUdeviceptr_v1 *arg0, unsigned int *arg1, CUmodule arg2, const char *arg3) { + load_library(); + return cuModuleGetGlobal(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuModuleGetGlobal_error(CUdeviceptr_v1 *, unsigned int *, CUmodule, const char *) { + return on_init_failed(214); +} +static CUresult _WRAPLIB_API_CALL cuMemGetInfo_init(unsigned int *arg0, unsigned int *arg1) { + load_library(); + return cuMemGetInfo(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemGetInfo_error(unsigned int *, unsigned int *) { + return on_init_failed(215); +} +static CUresult _WRAPLIB_API_CALL cuMemAlloc_init(CUdeviceptr_v1 *arg0, unsigned int arg1) { + load_library(); + return cuMemAlloc(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemAlloc_error(CUdeviceptr_v1 *, unsigned int) { + return on_init_failed(216); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocPitch_init(CUdeviceptr_v1 *arg0, unsigned int *arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4) { + load_library(); + return cuMemAllocPitch(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocPitch_error(CUdeviceptr_v1 *, unsigned int *, unsigned int, unsigned int, unsigned int) { + return on_init_failed(217); +} +static CUresult _WRAPLIB_API_CALL 
cuMemFree_init(CUdeviceptr_v1 arg0) { + load_library(); + return cuMemFree(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemFree_error(CUdeviceptr_v1) { + return on_init_failed(218); +} +static CUresult _WRAPLIB_API_CALL cuMemGetAddressRange_init(CUdeviceptr_v1 *arg0, unsigned int *arg1, CUdeviceptr_v1 arg2) { + load_library(); + return cuMemGetAddressRange(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemGetAddressRange_error(CUdeviceptr_v1 *, unsigned int *, CUdeviceptr_v1) { + return on_init_failed(219); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocHost_init(void **arg0, unsigned int arg1) { + load_library(); + return cuMemAllocHost(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemAllocHost_error(void **, unsigned int) { + return on_init_failed(220); +} +static CUresult _WRAPLIB_API_CALL cuMemHostGetDevicePointer_init(CUdeviceptr_v1 *arg0, void *arg1, unsigned int arg2) { + load_library(); + return cuMemHostGetDevicePointer(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemHostGetDevicePointer_error(CUdeviceptr_v1 *, void *, unsigned int) { + return on_init_failed(221); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoD_init(CUdeviceptr_v1 arg0, const void *arg1, unsigned int arg2) { + load_library(); + return cuMemcpyHtoD(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoD_error(CUdeviceptr_v1, const void *, unsigned int) { + return on_init_failed(222); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoH_init(void *arg0, CUdeviceptr_v1 arg1, unsigned int arg2) { + load_library(); + return cuMemcpyDtoH(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoH_error(void *, CUdeviceptr_v1, unsigned int) { + return on_init_failed(223); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoD_init(CUdeviceptr_v1 arg0, CUdeviceptr_v1 arg1, unsigned int arg2) { + load_library(); + return cuMemcpyDtoD(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoD_error(CUdeviceptr_v1, CUdeviceptr_v1, unsigned int) { + return on_init_failed(224); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoA_init(CUarray arg0, unsigned int arg1, CUdeviceptr_v1 arg2, unsigned int arg3) { + load_library(); + return cuMemcpyDtoA(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoA_error(CUarray, unsigned int, CUdeviceptr_v1, unsigned int) { + return on_init_failed(225); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoD_init(CUdeviceptr_v1 arg0, CUarray arg1, unsigned int arg2, unsigned int arg3) { + load_library(); + return cuMemcpyAtoD(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoD_error(CUdeviceptr_v1, CUarray, unsigned int, unsigned int) { + return on_init_failed(226); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoA_init(CUarray arg0, unsigned int arg1, const void *arg2, unsigned int arg3) { + load_library(); + return cuMemcpyHtoA(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoA_error(CUarray, unsigned int, const void *, unsigned int) { + return on_init_failed(227); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoH_init(void *arg0, CUarray arg1, unsigned int arg2, unsigned int arg3) { + load_library(); + return cuMemcpyAtoH(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoH_error(void *, CUarray, unsigned int, unsigned int) { + return on_init_failed(228); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoA_init(CUarray arg0, unsigned int arg1, CUarray arg2, unsigned int arg3, unsigned int arg4) { + load_library(); + return 
cuMemcpyAtoA(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoA_error(CUarray, unsigned int, CUarray, unsigned int, unsigned int) { + return on_init_failed(229); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoAAsync_init(CUarray arg0, unsigned int arg1, const void *arg2, unsigned int arg3, CUstream arg4) { + load_library(); + return cuMemcpyHtoAAsync(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoAAsync_error(CUarray, unsigned int, const void *, unsigned int, CUstream) { + return on_init_failed(230); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoHAsync_init(void *arg0, CUarray arg1, unsigned int arg2, unsigned int arg3, CUstream arg4) { + load_library(); + return cuMemcpyAtoHAsync(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoHAsync_error(void *, CUarray, unsigned int, unsigned int, CUstream) { + return on_init_failed(231); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2D_init(const CUDA_MEMCPY2D_v1 *arg0) { + load_library(); + return cuMemcpy2D(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2D_error(const CUDA_MEMCPY2D_v1 *) { + return on_init_failed(232); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DUnaligned_init(const CUDA_MEMCPY2D_v1 *arg0) { + load_library(); + return cuMemcpy2DUnaligned(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DUnaligned_error(const CUDA_MEMCPY2D_v1 *) { + return on_init_failed(233); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3D_init(const CUDA_MEMCPY3D_v1 *arg0) { + load_library(); + return cuMemcpy3D(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3D_error(const CUDA_MEMCPY3D_v1 *) { + return on_init_failed(234); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoDAsync_init(CUdeviceptr_v1 arg0, const void *arg1, unsigned int arg2, CUstream arg3) { + load_library(); + return cuMemcpyHtoDAsync(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoDAsync_error(CUdeviceptr_v1, const void *, unsigned int, CUstream) { + return on_init_failed(235); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoHAsync_init(void *arg0, CUdeviceptr_v1 arg1, unsigned int arg2, CUstream arg3) { + load_library(); + return cuMemcpyDtoHAsync(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoHAsync_error(void *, CUdeviceptr_v1, unsigned int, CUstream) { + return on_init_failed(236); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoDAsync_init(CUdeviceptr_v1 arg0, CUdeviceptr_v1 arg1, unsigned int arg2, CUstream arg3) { + load_library(); + return cuMemcpyDtoDAsync(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoDAsync_error(CUdeviceptr_v1, CUdeviceptr_v1, unsigned int, CUstream) { + return on_init_failed(237); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DAsync_init(const CUDA_MEMCPY2D_v1 *arg0, CUstream arg1) { + load_library(); + return cuMemcpy2DAsync(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DAsync_error(const CUDA_MEMCPY2D_v1 *, CUstream) { + return on_init_failed(238); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DAsync_init(const CUDA_MEMCPY3D_v1 *arg0, CUstream arg1) { + load_library(); + return cuMemcpy3DAsync(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DAsync_error(const CUDA_MEMCPY3D_v1 *, CUstream) { + return on_init_failed(239); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD8_init(CUdeviceptr_v1 arg0, unsigned char arg1, unsigned int arg2) { + load_library(); + return cuMemsetD8(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL 
cuMemsetD8_error(CUdeviceptr_v1, unsigned char, unsigned int) { + return on_init_failed(240); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16_init(CUdeviceptr_v1 arg0, unsigned short arg1, unsigned int arg2) { + load_library(); + return cuMemsetD16(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16_error(CUdeviceptr_v1, unsigned short, unsigned int) { + return on_init_failed(241); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32_init(CUdeviceptr_v1 arg0, unsigned int arg1, unsigned int arg2) { + load_library(); + return cuMemsetD32(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32_error(CUdeviceptr_v1, unsigned int, unsigned int) { + return on_init_failed(242); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8_init(CUdeviceptr_v1 arg0, unsigned int arg1, unsigned char arg2, unsigned int arg3, unsigned int arg4) { + load_library(); + return cuMemsetD2D8(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8_error(CUdeviceptr_v1, unsigned int, unsigned char, unsigned int, unsigned int) { + return on_init_failed(243); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D16_init(CUdeviceptr_v1 arg0, unsigned int arg1, unsigned short arg2, unsigned int arg3, unsigned int arg4) { + load_library(); + return cuMemsetD2D16(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D16_error(CUdeviceptr_v1, unsigned int, unsigned short, unsigned int, unsigned int) { + return on_init_failed(244); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32_init(CUdeviceptr_v1 arg0, unsigned int arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4) { + load_library(); + return cuMemsetD2D32(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32_error(CUdeviceptr_v1, unsigned int, unsigned int, unsigned int, unsigned int) { + return on_init_failed(245); +} +static CUresult _WRAPLIB_API_CALL cuArrayCreate_init(CUarray *arg0, const CUDA_ARRAY_DESCRIPTOR_v1 *arg1) { + load_library(); + return cuArrayCreate(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuArrayCreate_error(CUarray *, const CUDA_ARRAY_DESCRIPTOR_v1 *) { + return on_init_failed(246); +} +static CUresult _WRAPLIB_API_CALL cuArrayGetDescriptor_init(CUDA_ARRAY_DESCRIPTOR_v1 *arg0, CUarray arg1) { + load_library(); + return cuArrayGetDescriptor(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuArrayGetDescriptor_error(CUDA_ARRAY_DESCRIPTOR_v1 *, CUarray) { + return on_init_failed(247); +} +static CUresult _WRAPLIB_API_CALL cuArray3DCreate_init(CUarray *arg0, const CUDA_ARRAY3D_DESCRIPTOR_v1 *arg1) { + load_library(); + return cuArray3DCreate(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuArray3DCreate_error(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR_v1 *) { + return on_init_failed(248); +} +static CUresult _WRAPLIB_API_CALL cuArray3DGetDescriptor_init(CUDA_ARRAY3D_DESCRIPTOR_v1 *arg0, CUarray arg1) { + load_library(); + return cuArray3DGetDescriptor(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuArray3DGetDescriptor_error(CUDA_ARRAY3D_DESCRIPTOR_v1 *, CUarray) { + return on_init_failed(249); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress_init(unsigned int *arg0, CUtexref arg1, CUdeviceptr_v1 arg2, unsigned int arg3) { + load_library(); + return cuTexRefSetAddress(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress_error(unsigned int *, CUtexref, CUdeviceptr_v1, unsigned int) { + return on_init_failed(250); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress2D_init(CUtexref 
arg0, const CUDA_ARRAY_DESCRIPTOR_v1 *arg1, CUdeviceptr_v1 arg2, unsigned int arg3) { + load_library(); + return cuTexRefSetAddress2D(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuTexRefSetAddress2D_error(CUtexref, const CUDA_ARRAY_DESCRIPTOR_v1 *, CUdeviceptr_v1, unsigned int) { + return on_init_failed(251); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetAddress_init(CUdeviceptr_v1 *arg0, CUtexref arg1) { + load_library(); + return cuTexRefGetAddress(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuTexRefGetAddress_error(CUdeviceptr_v1 *, CUtexref) { + return on_init_failed(252); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceGetMappedPointer_init(CUdeviceptr_v1 *arg0, unsigned int *arg1, CUgraphicsResource arg2) { + load_library(); + return cuGraphicsResourceGetMappedPointer(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsResourceGetMappedPointer_error(CUdeviceptr_v1 *, unsigned int *, CUgraphicsResource) { + return on_init_failed(253); +} +static CUresult _WRAPLIB_API_CALL cuCtxDestroy_init(CUcontext arg0) { + load_library(); + return cuCtxDestroy(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxDestroy_error(CUcontext) { + return on_init_failed(254); +} +static CUresult _WRAPLIB_API_CALL cuCtxPopCurrent_init(CUcontext *arg0) { + load_library(); + return cuCtxPopCurrent(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxPopCurrent_error(CUcontext *) { + return on_init_failed(255); +} +static CUresult _WRAPLIB_API_CALL cuCtxPushCurrent_init(CUcontext arg0) { + load_library(); + return cuCtxPushCurrent(arg0); +} +static CUresult _WRAPLIB_API_CALL cuCtxPushCurrent_error(CUcontext) { + return on_init_failed(256); +} +static CUresult _WRAPLIB_API_CALL cuStreamDestroy_init(CUstream arg0) { + load_library(); + return cuStreamDestroy(arg0); +} +static CUresult _WRAPLIB_API_CALL cuStreamDestroy_error(CUstream) { + return on_init_failed(257); +} +static CUresult _WRAPLIB_API_CALL cuEventDestroy_init(CUevent arg0) { + load_library(); + return cuEventDestroy(arg0); +} +static CUresult _WRAPLIB_API_CALL cuEventDestroy_error(CUevent) { + return on_init_failed(258); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoD_v2_init(CUdeviceptr arg0, const void *arg1, size_t arg2) { + load_library(); + return cuMemcpyHtoD_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoD_v2_error(CUdeviceptr, const void *, size_t) { + return on_init_failed(259); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoH_v2_init(void *arg0, CUdeviceptr arg1, size_t arg2) { + load_library(); + return cuMemcpyDtoH_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoH_v2_error(void *, CUdeviceptr, size_t) { + return on_init_failed(260); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoD_v2_init(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2) { + load_library(); + return cuMemcpyDtoD_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoD_v2_error(CUdeviceptr, CUdeviceptr, size_t) { + return on_init_failed(261); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoA_v2_init(CUarray arg0, size_t arg1, CUdeviceptr arg2, size_t arg3) { + load_library(); + return cuMemcpyDtoA_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoA_v2_error(CUarray, size_t, CUdeviceptr, size_t) { + return on_init_failed(262); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoD_v2_init(CUdeviceptr arg0, CUarray arg1, size_t arg2, size_t arg3) { + load_library(); + return cuMemcpyAtoD_v2(arg0, arg1, arg2, arg3); +} +static CUresult 
_WRAPLIB_API_CALL cuMemcpyAtoD_v2_error(CUdeviceptr, CUarray, size_t, size_t) { + return on_init_failed(263); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoA_v2_init(CUarray arg0, size_t arg1, const void *arg2, size_t arg3) { + load_library(); + return cuMemcpyHtoA_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoA_v2_error(CUarray, size_t, const void *, size_t) { + return on_init_failed(264); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoH_v2_init(void *arg0, CUarray arg1, size_t arg2, size_t arg3) { + load_library(); + return cuMemcpyAtoH_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoH_v2_error(void *, CUarray, size_t, size_t) { + return on_init_failed(265); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoA_v2_init(CUarray arg0, size_t arg1, CUarray arg2, size_t arg3, size_t arg4) { + load_library(); + return cuMemcpyAtoA_v2(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoA_v2_error(CUarray, size_t, CUarray, size_t, size_t) { + return on_init_failed(266); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoAAsync_v2_init(CUarray arg0, size_t arg1, const void *arg2, size_t arg3, CUstream arg4) { + load_library(); + return cuMemcpyHtoAAsync_v2(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoAAsync_v2_error(CUarray, size_t, const void *, size_t, CUstream) { + return on_init_failed(267); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoHAsync_v2_init(void *arg0, CUarray arg1, size_t arg2, size_t arg3, CUstream arg4) { + load_library(); + return cuMemcpyAtoHAsync_v2(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAtoHAsync_v2_error(void *, CUarray, size_t, size_t, CUstream) { + return on_init_failed(268); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2D_v2_init(const CUDA_MEMCPY2D *arg0) { + load_library(); + return cuMemcpy2D_v2(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2D_v2_error(const CUDA_MEMCPY2D *) { + return on_init_failed(269); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DUnaligned_v2_init(const CUDA_MEMCPY2D *arg0) { + load_library(); + return cuMemcpy2DUnaligned_v2(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DUnaligned_v2_error(const CUDA_MEMCPY2D *) { + return on_init_failed(270); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3D_v2_init(const CUDA_MEMCPY3D *arg0) { + load_library(); + return cuMemcpy3D_v2(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3D_v2_error(const CUDA_MEMCPY3D *) { + return on_init_failed(271); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoDAsync_v2_init(CUdeviceptr arg0, const void *arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemcpyHtoDAsync_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyHtoDAsync_v2_error(CUdeviceptr, const void *, size_t, CUstream) { + return on_init_failed(272); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoHAsync_v2_init(void *arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemcpyDtoHAsync_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoHAsync_v2_error(void *, CUdeviceptr, size_t, CUstream) { + return on_init_failed(273); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoDAsync_v2_init(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemcpyDtoDAsync_v2(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyDtoDAsync_v2_error(CUdeviceptr, CUdeviceptr, size_t, CUstream) { + return 
on_init_failed(274); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DAsync_v2_init(const CUDA_MEMCPY2D *arg0, CUstream arg1) { + load_library(); + return cuMemcpy2DAsync_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy2DAsync_v2_error(const CUDA_MEMCPY2D *, CUstream) { + return on_init_failed(275); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DAsync_v2_init(const CUDA_MEMCPY3D *arg0, CUstream arg1) { + load_library(); + return cuMemcpy3DAsync_v2(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DAsync_v2_error(const CUDA_MEMCPY3D *, CUstream) { + return on_init_failed(276); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD8_v2_init(CUdeviceptr arg0, unsigned char arg1, size_t arg2) { + load_library(); + return cuMemsetD8_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD8_v2_error(CUdeviceptr, unsigned char, size_t) { + return on_init_failed(277); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16_v2_init(CUdeviceptr arg0, unsigned short arg1, size_t arg2) { + load_library(); + return cuMemsetD16_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16_v2_error(CUdeviceptr, unsigned short, size_t) { + return on_init_failed(278); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32_v2_init(CUdeviceptr arg0, unsigned int arg1, size_t arg2) { + load_library(); + return cuMemsetD32_v2(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32_v2_error(CUdeviceptr, unsigned int, size_t) { + return on_init_failed(279); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8_v2_init(CUdeviceptr arg0, size_t arg1, unsigned char arg2, size_t arg3, size_t arg4) { + load_library(); + return cuMemsetD2D8_v2(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8_v2_error(CUdeviceptr, size_t, unsigned char, size_t, size_t) { + return on_init_failed(280); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D16_v2_init(CUdeviceptr arg0, size_t arg1, unsigned short arg2, size_t arg3, size_t arg4) { + load_library(); + return cuMemsetD2D16_v2(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D16_v2_error(CUdeviceptr, size_t, unsigned short, size_t, size_t) { + return on_init_failed(281); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32_v2_init(CUdeviceptr arg0, size_t arg1, unsigned int arg2, size_t arg3, size_t arg4) { + load_library(); + return cuMemsetD2D32_v2(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32_v2_error(CUdeviceptr, size_t, unsigned int, size_t, size_t) { + return on_init_failed(282); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy_init(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2) { + load_library(); + return cuMemcpy(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy_error(CUdeviceptr, CUdeviceptr, size_t) { + return on_init_failed(283); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAsync_init(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemcpyAsync(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyAsync_error(CUdeviceptr, CUdeviceptr, size_t, CUstream) { + return on_init_failed(284); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyPeer_init(CUdeviceptr arg0, CUcontext arg1, CUdeviceptr arg2, CUcontext arg3, size_t arg4) { + load_library(); + return cuMemcpyPeer(arg0, arg1, arg2, arg3, arg4); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyPeer_error(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t) { + return on_init_failed(285); +} +static CUresult 
_WRAPLIB_API_CALL cuMemcpyPeerAsync_init(CUdeviceptr arg0, CUcontext arg1, CUdeviceptr arg2, CUcontext arg3, size_t arg4, CUstream arg5) { + load_library(); + return cuMemcpyPeerAsync(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuMemcpyPeerAsync_error(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream) { + return on_init_failed(286); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DPeer_init(const CUDA_MEMCPY3D_PEER *arg0) { + load_library(); + return cuMemcpy3DPeer(arg0); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DPeer_error(const CUDA_MEMCPY3D_PEER *) { + return on_init_failed(287); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DPeerAsync_init(const CUDA_MEMCPY3D_PEER *arg0, CUstream arg1) { + load_library(); + return cuMemcpy3DPeerAsync(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuMemcpy3DPeerAsync_error(const CUDA_MEMCPY3D_PEER *, CUstream) { + return on_init_failed(288); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD8Async_init(CUdeviceptr arg0, unsigned char arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemsetD8Async(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD8Async_error(CUdeviceptr, unsigned char, size_t, CUstream) { + return on_init_failed(289); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16Async_init(CUdeviceptr arg0, unsigned short arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemsetD16Async(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD16Async_error(CUdeviceptr, unsigned short, size_t, CUstream) { + return on_init_failed(290); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32Async_init(CUdeviceptr arg0, unsigned int arg1, size_t arg2, CUstream arg3) { + load_library(); + return cuMemsetD32Async(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD32Async_error(CUdeviceptr, unsigned int, size_t, CUstream) { + return on_init_failed(291); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8Async_init(CUdeviceptr arg0, size_t arg1, unsigned char arg2, size_t arg3, size_t arg4, CUstream arg5) { + load_library(); + return cuMemsetD2D8Async(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D8Async_error(CUdeviceptr, size_t, unsigned char, size_t, size_t, CUstream) { + return on_init_failed(292); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D16Async_init(CUdeviceptr arg0, size_t arg1, unsigned short arg2, size_t arg3, size_t arg4, CUstream arg5) { + load_library(); + return cuMemsetD2D16Async(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D16Async_error(CUdeviceptr, size_t, unsigned short, size_t, size_t, CUstream) { + return on_init_failed(293); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32Async_init(CUdeviceptr arg0, size_t arg1, unsigned int arg2, size_t arg3, size_t arg4, CUstream arg5) { + load_library(); + return cuMemsetD2D32Async(arg0, arg1, arg2, arg3, arg4, arg5); +} +static CUresult _WRAPLIB_API_CALL cuMemsetD2D32Async_error(CUdeviceptr, size_t, unsigned int, size_t, size_t, CUstream) { + return on_init_failed(294); +} +static CUresult _WRAPLIB_API_CALL cuStreamGetPriority_init(CUstream arg0, int *arg1) { + load_library(); + return cuStreamGetPriority(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuStreamGetPriority_error(CUstream, int *) { + return on_init_failed(295); +} +static CUresult _WRAPLIB_API_CALL cuStreamGetFlags_init(CUstream arg0, unsigned int *arg1) { + load_library(); + return cuStreamGetFlags(arg0, arg1); 
+} +static CUresult _WRAPLIB_API_CALL cuStreamGetFlags_error(CUstream, unsigned int *) { + return on_init_failed(296); +} +static CUresult _WRAPLIB_API_CALL cuStreamWaitEvent_init(CUstream arg0, CUevent arg1, unsigned int arg2) { + load_library(); + return cuStreamWaitEvent(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuStreamWaitEvent_error(CUstream, CUevent, unsigned int) { + return on_init_failed(297); +} +static CUresult _WRAPLIB_API_CALL cuStreamAddCallback_init(CUstream arg0, CUstreamCallback arg1, void *arg2, unsigned int arg3) { + load_library(); + return cuStreamAddCallback(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamAddCallback_error(CUstream, CUstreamCallback, void *, unsigned int) { + return on_init_failed(298); +} +static CUresult _WRAPLIB_API_CALL cuStreamAttachMemAsync_init(CUstream arg0, CUdeviceptr arg1, size_t arg2, unsigned int arg3) { + load_library(); + return cuStreamAttachMemAsync(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamAttachMemAsync_error(CUstream, CUdeviceptr, size_t, unsigned int) { + return on_init_failed(299); +} +static CUresult _WRAPLIB_API_CALL cuStreamQuery_init(CUstream arg0) { + load_library(); + return cuStreamQuery(arg0); +} +static CUresult _WRAPLIB_API_CALL cuStreamQuery_error(CUstream) { + return on_init_failed(300); +} +static CUresult _WRAPLIB_API_CALL cuStreamSynchronize_init(CUstream arg0) { + load_library(); + return cuStreamSynchronize(arg0); +} +static CUresult _WRAPLIB_API_CALL cuStreamSynchronize_error(CUstream) { + return on_init_failed(301); +} +static CUresult _WRAPLIB_API_CALL cuEventRecord_init(CUevent arg0, CUstream arg1) { + load_library(); + return cuEventRecord(arg0, arg1); +} +static CUresult _WRAPLIB_API_CALL cuEventRecord_error(CUevent, CUstream) { + return on_init_failed(302); +} +static CUresult _WRAPLIB_API_CALL cuLaunchKernel_init(CUfunction arg0, unsigned int arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4, unsigned int arg5, unsigned int arg6, unsigned int arg7, CUstream arg8, void **arg9, void **arg10) { + load_library(); + return cuLaunchKernel(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10); +} +static CUresult _WRAPLIB_API_CALL cuLaunchKernel_error(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **) { + return on_init_failed(303); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsMapResources_init(unsigned int arg0, CUgraphicsResource *arg1, CUstream arg2) { + load_library(); + return cuGraphicsMapResources(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsMapResources_error(unsigned int, CUgraphicsResource *, CUstream) { + return on_init_failed(304); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsUnmapResources_init(unsigned int arg0, CUgraphicsResource *arg1, CUstream arg2) { + load_library(); + return cuGraphicsUnmapResources(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuGraphicsUnmapResources_error(unsigned int, CUgraphicsResource *, CUstream) { + return on_init_failed(305); +} +static CUresult _WRAPLIB_API_CALL cuMemPrefetchAsync_init(CUdeviceptr arg0, size_t arg1, CUdevice arg2, CUstream arg3) { + load_library(); + return cuMemPrefetchAsync(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuMemPrefetchAsync_error(CUdeviceptr, size_t, CUdevice, CUstream) { + return on_init_failed(306); +} +static CUresult _WRAPLIB_API_CALL cuStreamWriteValue32_init(CUstream arg0, CUdeviceptr arg1, 
cuuint32_t arg2, unsigned int arg3) { + load_library(); + return cuStreamWriteValue32(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamWriteValue32_error(CUstream, CUdeviceptr, cuuint32_t, unsigned int) { + return on_init_failed(307); +} +static CUresult _WRAPLIB_API_CALL cuStreamWaitValue32_init(CUstream arg0, CUdeviceptr arg1, cuuint32_t arg2, unsigned int arg3) { + load_library(); + return cuStreamWaitValue32(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamWaitValue32_error(CUstream, CUdeviceptr, cuuint32_t, unsigned int) { + return on_init_failed(308); +} +static CUresult _WRAPLIB_API_CALL cuStreamBatchMemOp_init(CUstream arg0, unsigned int arg1, CUstreamBatchMemOpParams *arg2, unsigned int arg3) { + load_library(); + return cuStreamBatchMemOp(arg0, arg1, arg2, arg3); +} +static CUresult _WRAPLIB_API_CALL cuStreamBatchMemOp_error(CUstream, unsigned int, CUstreamBatchMemOpParams *, unsigned int) { + return on_init_failed(309); +} +static CUresult _WRAPLIB_API_CALL cuProfilerInitialize_init(const char *arg0, const char *arg1, CUoutput_mode arg2) { + load_library(); + return cuProfilerInitialize(arg0, arg1, arg2); +} +static CUresult _WRAPLIB_API_CALL cuProfilerInitialize_error(const char *, const char *, CUoutput_mode) { + return on_init_failed(310); +} +static CUresult _WRAPLIB_API_CALL cuProfilerStart_init() { + load_library(); + return cuProfilerStart(); +} +static CUresult _WRAPLIB_API_CALL cuProfilerStart_error() { + return on_init_failed(311); +} +static CUresult _WRAPLIB_API_CALL cuProfilerStop_init() { + load_library(); + return cuProfilerStop(); +} +static CUresult _WRAPLIB_API_CALL cuProfilerStop_error() { + return on_init_failed(312); +} +static constexpr size_t NR_FUNC = 313; +static void* g_func_table[NR_FUNC] = {(void*)(&cuGetErrorString_init), + (void*)(&cuGetErrorName_init), + (void*)(&cuInit_init), + (void*)(&cuDriverGetVersion_init), + (void*)(&cuDeviceGet_init), + (void*)(&cuDeviceGetCount_init), + (void*)(&cuDeviceGetName_init), + (void*)(&cuDeviceTotalMem_v2_init), + (void*)(&cuDeviceGetAttribute_init), + (void*)(&cuDeviceGetProperties_init), + (void*)(&cuDeviceComputeCapability_init), + (void*)(&cuDevicePrimaryCtxRetain_init), + (void*)(&cuDevicePrimaryCtxRelease_init), + (void*)(&cuDevicePrimaryCtxSetFlags_init), + (void*)(&cuDevicePrimaryCtxGetState_init), + (void*)(&cuDevicePrimaryCtxReset_init), + (void*)(&cuCtxCreate_v2_init), + (void*)(&cuCtxDestroy_v2_init), + (void*)(&cuCtxPushCurrent_v2_init), + (void*)(&cuCtxPopCurrent_v2_init), + (void*)(&cuCtxSetCurrent_init), + (void*)(&cuCtxGetCurrent_init), + (void*)(&cuCtxGetDevice_init), + (void*)(&cuCtxGetFlags_init), + (void*)(&cuCtxSynchronize_init), + (void*)(&cuCtxSetLimit_init), + (void*)(&cuCtxGetLimit_init), + (void*)(&cuCtxGetCacheConfig_init), + (void*)(&cuCtxSetCacheConfig_init), + (void*)(&cuCtxGetSharedMemConfig_init), + (void*)(&cuCtxSetSharedMemConfig_init), + (void*)(&cuCtxGetApiVersion_init), + (void*)(&cuCtxGetStreamPriorityRange_init), + (void*)(&cuCtxAttach_init), + (void*)(&cuCtxDetach_init), + (void*)(&cuModuleLoad_init), + (void*)(&cuModuleLoadData_init), + (void*)(&cuModuleLoadDataEx_init), + (void*)(&cuModuleLoadFatBinary_init), + (void*)(&cuModuleUnload_init), + (void*)(&cuModuleGetFunction_init), + (void*)(&cuModuleGetGlobal_v2_init), + (void*)(&cuModuleGetTexRef_init), + (void*)(&cuModuleGetSurfRef_init), + (void*)(&cuLinkCreate_v2_init), + (void*)(&cuLinkAddData_v2_init), + (void*)(&cuLinkAddFile_v2_init), + (void*)(&cuLinkComplete_init), + 
(void*)(&cuLinkDestroy_init), + (void*)(&cuMemGetInfo_v2_init), + (void*)(&cuMemAlloc_v2_init), + (void*)(&cuMemAllocPitch_v2_init), + (void*)(&cuMemFree_v2_init), + (void*)(&cuMemGetAddressRange_v2_init), + (void*)(&cuMemAllocHost_v2_init), + (void*)(&cuMemFreeHost_init), + (void*)(&cuMemHostAlloc_init), + (void*)(&cuMemHostGetDevicePointer_v2_init), + (void*)(&cuMemHostGetFlags_init), + (void*)(&cuMemAllocManaged_init), + (void*)(&cuDeviceGetByPCIBusId_init), + (void*)(&cuDeviceGetPCIBusId_init), + (void*)(&cuIpcGetEventHandle_init), + (void*)(&cuIpcOpenEventHandle_init), + (void*)(&cuIpcGetMemHandle_init), + (void*)(&cuIpcOpenMemHandle_init), + (void*)(&cuIpcCloseMemHandle_init), + (void*)(&cuMemHostRegister_v2_init), + (void*)(&cuMemHostUnregister_init), + (void*)(&cuMemcpy_ptds_init), + (void*)(&cuMemcpyPeer_ptds_init), + (void*)(&cuMemcpyHtoD_v2_ptds_init), + (void*)(&cuMemcpyDtoH_v2_ptds_init), + (void*)(&cuMemcpyDtoD_v2_ptds_init), + (void*)(&cuMemcpyDtoA_v2_ptds_init), + (void*)(&cuMemcpyAtoD_v2_ptds_init), + (void*)(&cuMemcpyHtoA_v2_ptds_init), + (void*)(&cuMemcpyAtoH_v2_ptds_init), + (void*)(&cuMemcpyAtoA_v2_ptds_init), + (void*)(&cuMemcpy2D_v2_ptds_init), + (void*)(&cuMemcpy2DUnaligned_v2_ptds_init), + (void*)(&cuMemcpy3D_v2_ptds_init), + (void*)(&cuMemcpy3DPeer_ptds_init), + (void*)(&cuMemcpyAsync_ptsz_init), + (void*)(&cuMemcpyPeerAsync_ptsz_init), + (void*)(&cuMemcpyHtoDAsync_v2_ptsz_init), + (void*)(&cuMemcpyDtoHAsync_v2_ptsz_init), + (void*)(&cuMemcpyDtoDAsync_v2_ptsz_init), + (void*)(&cuMemcpyHtoAAsync_v2_ptsz_init), + (void*)(&cuMemcpyAtoHAsync_v2_ptsz_init), + (void*)(&cuMemcpy2DAsync_v2_ptsz_init), + (void*)(&cuMemcpy3DAsync_v2_ptsz_init), + (void*)(&cuMemcpy3DPeerAsync_ptsz_init), + (void*)(&cuMemsetD8_v2_ptds_init), + (void*)(&cuMemsetD16_v2_ptds_init), + (void*)(&cuMemsetD32_v2_ptds_init), + (void*)(&cuMemsetD2D8_v2_ptds_init), + (void*)(&cuMemsetD2D16_v2_ptds_init), + (void*)(&cuMemsetD2D32_v2_ptds_init), + (void*)(&cuMemsetD8Async_ptsz_init), + (void*)(&cuMemsetD16Async_ptsz_init), + (void*)(&cuMemsetD32Async_ptsz_init), + (void*)(&cuMemsetD2D8Async_ptsz_init), + (void*)(&cuMemsetD2D16Async_ptsz_init), + (void*)(&cuMemsetD2D32Async_ptsz_init), + (void*)(&cuArrayCreate_v2_init), + (void*)(&cuArrayGetDescriptor_v2_init), + (void*)(&cuArrayDestroy_init), + (void*)(&cuArray3DCreate_v2_init), + (void*)(&cuArray3DGetDescriptor_v2_init), + (void*)(&cuMipmappedArrayCreate_init), + (void*)(&cuMipmappedArrayGetLevel_init), + (void*)(&cuMipmappedArrayDestroy_init), + (void*)(&cuPointerGetAttribute_init), + (void*)(&cuMemPrefetchAsync_ptsz_init), + (void*)(&cuMemAdvise_init), + (void*)(&cuMemRangeGetAttribute_init), + (void*)(&cuMemRangeGetAttributes_init), + (void*)(&cuPointerSetAttribute_init), + (void*)(&cuPointerGetAttributes_init), + (void*)(&cuStreamCreate_init), + (void*)(&cuStreamCreateWithPriority_init), + (void*)(&cuStreamGetPriority_ptsz_init), + (void*)(&cuStreamGetFlags_ptsz_init), + (void*)(&cuStreamWaitEvent_ptsz_init), + (void*)(&cuStreamAddCallback_ptsz_init), + (void*)(&cuStreamAttachMemAsync_ptsz_init), + (void*)(&cuStreamQuery_ptsz_init), + (void*)(&cuStreamSynchronize_ptsz_init), + (void*)(&cuStreamDestroy_v2_init), + (void*)(&cuEventCreate_init), + (void*)(&cuEventRecord_ptsz_init), + (void*)(&cuEventQuery_init), + (void*)(&cuEventSynchronize_init), + (void*)(&cuEventDestroy_v2_init), + (void*)(&cuEventElapsedTime_init), + (void*)(&cuStreamWaitValue32_ptsz_init), + (void*)(&cuStreamWriteValue32_ptsz_init), + (void*)(&cuStreamBatchMemOp_ptsz_init), + 
(void*)(&cuFuncGetAttribute_init), + (void*)(&cuFuncSetCacheConfig_init), + (void*)(&cuFuncSetSharedMemConfig_init), + (void*)(&cuLaunchKernel_ptsz_init), + (void*)(&cuFuncSetBlockShape_init), + (void*)(&cuFuncSetSharedSize_init), + (void*)(&cuParamSetSize_init), + (void*)(&cuParamSeti_init), + (void*)(&cuParamSetf_init), + (void*)(&cuParamSetv_init), + (void*)(&cuLaunch_init), + (void*)(&cuLaunchGrid_init), + (void*)(&cuLaunchGridAsync_init), + (void*)(&cuParamSetTexRef_init), + (void*)(&cuOccupancyMaxActiveBlocksPerMultiprocessor_init), + (void*)(&cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_init), + (void*)(&cuOccupancyMaxPotentialBlockSize_init), + (void*)(&cuOccupancyMaxPotentialBlockSizeWithFlags_init), + (void*)(&cuTexRefSetArray_init), + (void*)(&cuTexRefSetMipmappedArray_init), + (void*)(&cuTexRefSetAddress_v2_init), + (void*)(&cuTexRefSetAddress2D_v3_init), + (void*)(&cuTexRefSetFormat_init), + (void*)(&cuTexRefSetAddressMode_init), + (void*)(&cuTexRefSetFilterMode_init), + (void*)(&cuTexRefSetMipmapFilterMode_init), + (void*)(&cuTexRefSetMipmapLevelBias_init), + (void*)(&cuTexRefSetMipmapLevelClamp_init), + (void*)(&cuTexRefSetMaxAnisotropy_init), + (void*)(&cuTexRefSetBorderColor_init), + (void*)(&cuTexRefSetFlags_init), + (void*)(&cuTexRefGetAddress_v2_init), + (void*)(&cuTexRefGetArray_init), + (void*)(&cuTexRefGetMipmappedArray_init), + (void*)(&cuTexRefGetAddressMode_init), + (void*)(&cuTexRefGetFilterMode_init), + (void*)(&cuTexRefGetFormat_init), + (void*)(&cuTexRefGetMipmapFilterMode_init), + (void*)(&cuTexRefGetMipmapLevelBias_init), + (void*)(&cuTexRefGetMipmapLevelClamp_init), + (void*)(&cuTexRefGetMaxAnisotropy_init), + (void*)(&cuTexRefGetBorderColor_init), + (void*)(&cuTexRefGetFlags_init), + (void*)(&cuTexRefCreate_init), + (void*)(&cuTexRefDestroy_init), + (void*)(&cuSurfRefSetArray_init), + (void*)(&cuSurfRefGetArray_init), + (void*)(&cuTexObjectCreate_init), + (void*)(&cuTexObjectDestroy_init), + (void*)(&cuTexObjectGetResourceDesc_init), + (void*)(&cuTexObjectGetTextureDesc_init), + (void*)(&cuTexObjectGetResourceViewDesc_init), + (void*)(&cuSurfObjectCreate_init), + (void*)(&cuSurfObjectDestroy_init), + (void*)(&cuSurfObjectGetResourceDesc_init), + (void*)(&cuDeviceCanAccessPeer_init), + (void*)(&cuDeviceGetP2PAttribute_init), + (void*)(&cuCtxEnablePeerAccess_init), + (void*)(&cuCtxDisablePeerAccess_init), + (void*)(&cuGraphicsUnregisterResource_init), + (void*)(&cuGraphicsSubResourceGetMappedArray_init), + (void*)(&cuGraphicsResourceGetMappedMipmappedArray_init), + (void*)(&cuGraphicsResourceGetMappedPointer_v2_init), + (void*)(&cuGraphicsResourceSetMapFlags_v2_init), + (void*)(&cuGraphicsMapResources_ptsz_init), + (void*)(&cuGraphicsUnmapResources_ptsz_init), + (void*)(&cuGetExportTable_init), + (void*)(&cuMemHostRegister_init), + (void*)(&cuGraphicsResourceSetMapFlags_init), + (void*)(&cuLinkCreate_init), + (void*)(&cuLinkAddData_init), + (void*)(&cuLinkAddFile_init), + (void*)(&cuTexRefSetAddress2D_v2_init), + (void*)(&cuDeviceTotalMem_init), + (void*)(&cuCtxCreate_init), + (void*)(&cuModuleGetGlobal_init), + (void*)(&cuMemGetInfo_init), + (void*)(&cuMemAlloc_init), + (void*)(&cuMemAllocPitch_init), + (void*)(&cuMemFree_init), + (void*)(&cuMemGetAddressRange_init), + (void*)(&cuMemAllocHost_init), + (void*)(&cuMemHostGetDevicePointer_init), + (void*)(&cuMemcpyHtoD_init), + (void*)(&cuMemcpyDtoH_init), + (void*)(&cuMemcpyDtoD_init), + (void*)(&cuMemcpyDtoA_init), + (void*)(&cuMemcpyAtoD_init), + (void*)(&cuMemcpyHtoA_init), + 
(void*)(&cuMemcpyAtoH_init), + (void*)(&cuMemcpyAtoA_init), + (void*)(&cuMemcpyHtoAAsync_init), + (void*)(&cuMemcpyAtoHAsync_init), + (void*)(&cuMemcpy2D_init), + (void*)(&cuMemcpy2DUnaligned_init), + (void*)(&cuMemcpy3D_init), + (void*)(&cuMemcpyHtoDAsync_init), + (void*)(&cuMemcpyDtoHAsync_init), + (void*)(&cuMemcpyDtoDAsync_init), + (void*)(&cuMemcpy2DAsync_init), + (void*)(&cuMemcpy3DAsync_init), + (void*)(&cuMemsetD8_init), + (void*)(&cuMemsetD16_init), + (void*)(&cuMemsetD32_init), + (void*)(&cuMemsetD2D8_init), + (void*)(&cuMemsetD2D16_init), + (void*)(&cuMemsetD2D32_init), + (void*)(&cuArrayCreate_init), + (void*)(&cuArrayGetDescriptor_init), + (void*)(&cuArray3DCreate_init), + (void*)(&cuArray3DGetDescriptor_init), + (void*)(&cuTexRefSetAddress_init), + (void*)(&cuTexRefSetAddress2D_init), + (void*)(&cuTexRefGetAddress_init), + (void*)(&cuGraphicsResourceGetMappedPointer_init), + (void*)(&cuCtxDestroy_init), + (void*)(&cuCtxPopCurrent_init), + (void*)(&cuCtxPushCurrent_init), + (void*)(&cuStreamDestroy_init), + (void*)(&cuEventDestroy_init), + (void*)(&cuMemcpyHtoD_v2_init), + (void*)(&cuMemcpyDtoH_v2_init), + (void*)(&cuMemcpyDtoD_v2_init), + (void*)(&cuMemcpyDtoA_v2_init), + (void*)(&cuMemcpyAtoD_v2_init), + (void*)(&cuMemcpyHtoA_v2_init), + (void*)(&cuMemcpyAtoH_v2_init), + (void*)(&cuMemcpyAtoA_v2_init), + (void*)(&cuMemcpyHtoAAsync_v2_init), + (void*)(&cuMemcpyAtoHAsync_v2_init), + (void*)(&cuMemcpy2D_v2_init), + (void*)(&cuMemcpy2DUnaligned_v2_init), + (void*)(&cuMemcpy3D_v2_init), + (void*)(&cuMemcpyHtoDAsync_v2_init), + (void*)(&cuMemcpyDtoHAsync_v2_init), + (void*)(&cuMemcpyDtoDAsync_v2_init), + (void*)(&cuMemcpy2DAsync_v2_init), + (void*)(&cuMemcpy3DAsync_v2_init), + (void*)(&cuMemsetD8_v2_init), + (void*)(&cuMemsetD16_v2_init), + (void*)(&cuMemsetD32_v2_init), + (void*)(&cuMemsetD2D8_v2_init), + (void*)(&cuMemsetD2D16_v2_init), + (void*)(&cuMemsetD2D32_v2_init), + (void*)(&cuMemcpy_init), + (void*)(&cuMemcpyAsync_init), + (void*)(&cuMemcpyPeer_init), + (void*)(&cuMemcpyPeerAsync_init), + (void*)(&cuMemcpy3DPeer_init), + (void*)(&cuMemcpy3DPeerAsync_init), + (void*)(&cuMemsetD8Async_init), + (void*)(&cuMemsetD16Async_init), + (void*)(&cuMemsetD32Async_init), + (void*)(&cuMemsetD2D8Async_init), + (void*)(&cuMemsetD2D16Async_init), + (void*)(&cuMemsetD2D32Async_init), + (void*)(&cuStreamGetPriority_init), + (void*)(&cuStreamGetFlags_init), + (void*)(&cuStreamWaitEvent_init), + (void*)(&cuStreamAddCallback_init), + (void*)(&cuStreamAttachMemAsync_init), + (void*)(&cuStreamQuery_init), + (void*)(&cuStreamSynchronize_init), + (void*)(&cuEventRecord_init), + (void*)(&cuLaunchKernel_init), + (void*)(&cuGraphicsMapResources_init), + (void*)(&cuGraphicsUnmapResources_init), + (void*)(&cuMemPrefetchAsync_init), + (void*)(&cuStreamWriteValue32_init), + (void*)(&cuStreamWaitValue32_init), + (void*)(&cuStreamBatchMemOp_init), + (void*)(&cuProfilerInitialize_init), + (void*)(&cuProfilerStart_init), + (void*)(&cuProfilerStop_init)}; +static void* g_func_table_error[NR_FUNC] = {(void*)(&cuGetErrorString_error), + (void*)(&cuGetErrorName_error), + (void*)(&cuInit_error), + (void*)(&cuDriverGetVersion_error), + (void*)(&cuDeviceGet_error), + (void*)(&cuDeviceGetCount_error), + (void*)(&cuDeviceGetName_error), + (void*)(&cuDeviceTotalMem_v2_error), + (void*)(&cuDeviceGetAttribute_error), + (void*)(&cuDeviceGetProperties_error), + (void*)(&cuDeviceComputeCapability_error), + (void*)(&cuDevicePrimaryCtxRetain_error), + (void*)(&cuDevicePrimaryCtxRelease_error), + 
(void*)(&cuDevicePrimaryCtxSetFlags_error), + (void*)(&cuDevicePrimaryCtxGetState_error), + (void*)(&cuDevicePrimaryCtxReset_error), + (void*)(&cuCtxCreate_v2_error), + (void*)(&cuCtxDestroy_v2_error), + (void*)(&cuCtxPushCurrent_v2_error), + (void*)(&cuCtxPopCurrent_v2_error), + (void*)(&cuCtxSetCurrent_error), + (void*)(&cuCtxGetCurrent_error), + (void*)(&cuCtxGetDevice_error), + (void*)(&cuCtxGetFlags_error), + (void*)(&cuCtxSynchronize_error), + (void*)(&cuCtxSetLimit_error), + (void*)(&cuCtxGetLimit_error), + (void*)(&cuCtxGetCacheConfig_error), + (void*)(&cuCtxSetCacheConfig_error), + (void*)(&cuCtxGetSharedMemConfig_error), + (void*)(&cuCtxSetSharedMemConfig_error), + (void*)(&cuCtxGetApiVersion_error), + (void*)(&cuCtxGetStreamPriorityRange_error), + (void*)(&cuCtxAttach_error), + (void*)(&cuCtxDetach_error), + (void*)(&cuModuleLoad_error), + (void*)(&cuModuleLoadData_error), + (void*)(&cuModuleLoadDataEx_error), + (void*)(&cuModuleLoadFatBinary_error), + (void*)(&cuModuleUnload_error), + (void*)(&cuModuleGetFunction_error), + (void*)(&cuModuleGetGlobal_v2_error), + (void*)(&cuModuleGetTexRef_error), + (void*)(&cuModuleGetSurfRef_error), + (void*)(&cuLinkCreate_v2_error), + (void*)(&cuLinkAddData_v2_error), + (void*)(&cuLinkAddFile_v2_error), + (void*)(&cuLinkComplete_error), + (void*)(&cuLinkDestroy_error), + (void*)(&cuMemGetInfo_v2_error), + (void*)(&cuMemAlloc_v2_error), + (void*)(&cuMemAllocPitch_v2_error), + (void*)(&cuMemFree_v2_error), + (void*)(&cuMemGetAddressRange_v2_error), + (void*)(&cuMemAllocHost_v2_error), + (void*)(&cuMemFreeHost_error), + (void*)(&cuMemHostAlloc_error), + (void*)(&cuMemHostGetDevicePointer_v2_error), + (void*)(&cuMemHostGetFlags_error), + (void*)(&cuMemAllocManaged_error), + (void*)(&cuDeviceGetByPCIBusId_error), + (void*)(&cuDeviceGetPCIBusId_error), + (void*)(&cuIpcGetEventHandle_error), + (void*)(&cuIpcOpenEventHandle_error), + (void*)(&cuIpcGetMemHandle_error), + (void*)(&cuIpcOpenMemHandle_error), + (void*)(&cuIpcCloseMemHandle_error), + (void*)(&cuMemHostRegister_v2_error), + (void*)(&cuMemHostUnregister_error), + (void*)(&cuMemcpy_ptds_error), + (void*)(&cuMemcpyPeer_ptds_error), + (void*)(&cuMemcpyHtoD_v2_ptds_error), + (void*)(&cuMemcpyDtoH_v2_ptds_error), + (void*)(&cuMemcpyDtoD_v2_ptds_error), + (void*)(&cuMemcpyDtoA_v2_ptds_error), + (void*)(&cuMemcpyAtoD_v2_ptds_error), + (void*)(&cuMemcpyHtoA_v2_ptds_error), + (void*)(&cuMemcpyAtoH_v2_ptds_error), + (void*)(&cuMemcpyAtoA_v2_ptds_error), + (void*)(&cuMemcpy2D_v2_ptds_error), + (void*)(&cuMemcpy2DUnaligned_v2_ptds_error), + (void*)(&cuMemcpy3D_v2_ptds_error), + (void*)(&cuMemcpy3DPeer_ptds_error), + (void*)(&cuMemcpyAsync_ptsz_error), + (void*)(&cuMemcpyPeerAsync_ptsz_error), + (void*)(&cuMemcpyHtoDAsync_v2_ptsz_error), + (void*)(&cuMemcpyDtoHAsync_v2_ptsz_error), + (void*)(&cuMemcpyDtoDAsync_v2_ptsz_error), + (void*)(&cuMemcpyHtoAAsync_v2_ptsz_error), + (void*)(&cuMemcpyAtoHAsync_v2_ptsz_error), + (void*)(&cuMemcpy2DAsync_v2_ptsz_error), + (void*)(&cuMemcpy3DAsync_v2_ptsz_error), + (void*)(&cuMemcpy3DPeerAsync_ptsz_error), + (void*)(&cuMemsetD8_v2_ptds_error), + (void*)(&cuMemsetD16_v2_ptds_error), + (void*)(&cuMemsetD32_v2_ptds_error), + (void*)(&cuMemsetD2D8_v2_ptds_error), + (void*)(&cuMemsetD2D16_v2_ptds_error), + (void*)(&cuMemsetD2D32_v2_ptds_error), + (void*)(&cuMemsetD8Async_ptsz_error), + (void*)(&cuMemsetD16Async_ptsz_error), + (void*)(&cuMemsetD32Async_ptsz_error), + (void*)(&cuMemsetD2D8Async_ptsz_error), + (void*)(&cuMemsetD2D16Async_ptsz_error), + 
(void*)(&cuMemsetD2D32Async_ptsz_error), + (void*)(&cuArrayCreate_v2_error), + (void*)(&cuArrayGetDescriptor_v2_error), + (void*)(&cuArrayDestroy_error), + (void*)(&cuArray3DCreate_v2_error), + (void*)(&cuArray3DGetDescriptor_v2_error), + (void*)(&cuMipmappedArrayCreate_error), + (void*)(&cuMipmappedArrayGetLevel_error), + (void*)(&cuMipmappedArrayDestroy_error), + (void*)(&cuPointerGetAttribute_error), + (void*)(&cuMemPrefetchAsync_ptsz_error), + (void*)(&cuMemAdvise_error), + (void*)(&cuMemRangeGetAttribute_error), + (void*)(&cuMemRangeGetAttributes_error), + (void*)(&cuPointerSetAttribute_error), + (void*)(&cuPointerGetAttributes_error), + (void*)(&cuStreamCreate_error), + (void*)(&cuStreamCreateWithPriority_error), + (void*)(&cuStreamGetPriority_ptsz_error), + (void*)(&cuStreamGetFlags_ptsz_error), + (void*)(&cuStreamWaitEvent_ptsz_error), + (void*)(&cuStreamAddCallback_ptsz_error), + (void*)(&cuStreamAttachMemAsync_ptsz_error), + (void*)(&cuStreamQuery_ptsz_error), + (void*)(&cuStreamSynchronize_ptsz_error), + (void*)(&cuStreamDestroy_v2_error), + (void*)(&cuEventCreate_error), + (void*)(&cuEventRecord_ptsz_error), + (void*)(&cuEventQuery_error), + (void*)(&cuEventSynchronize_error), + (void*)(&cuEventDestroy_v2_error), + (void*)(&cuEventElapsedTime_error), + (void*)(&cuStreamWaitValue32_ptsz_error), + (void*)(&cuStreamWriteValue32_ptsz_error), + (void*)(&cuStreamBatchMemOp_ptsz_error), + (void*)(&cuFuncGetAttribute_error), + (void*)(&cuFuncSetCacheConfig_error), + (void*)(&cuFuncSetSharedMemConfig_error), + (void*)(&cuLaunchKernel_ptsz_error), + (void*)(&cuFuncSetBlockShape_error), + (void*)(&cuFuncSetSharedSize_error), + (void*)(&cuParamSetSize_error), + (void*)(&cuParamSeti_error), + (void*)(&cuParamSetf_error), + (void*)(&cuParamSetv_error), + (void*)(&cuLaunch_error), + (void*)(&cuLaunchGrid_error), + (void*)(&cuLaunchGridAsync_error), + (void*)(&cuParamSetTexRef_error), + (void*)(&cuOccupancyMaxActiveBlocksPerMultiprocessor_error), + (void*)(&cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_error), + (void*)(&cuOccupancyMaxPotentialBlockSize_error), + (void*)(&cuOccupancyMaxPotentialBlockSizeWithFlags_error), + (void*)(&cuTexRefSetArray_error), + (void*)(&cuTexRefSetMipmappedArray_error), + (void*)(&cuTexRefSetAddress_v2_error), + (void*)(&cuTexRefSetAddress2D_v3_error), + (void*)(&cuTexRefSetFormat_error), + (void*)(&cuTexRefSetAddressMode_error), + (void*)(&cuTexRefSetFilterMode_error), + (void*)(&cuTexRefSetMipmapFilterMode_error), + (void*)(&cuTexRefSetMipmapLevelBias_error), + (void*)(&cuTexRefSetMipmapLevelClamp_error), + (void*)(&cuTexRefSetMaxAnisotropy_error), + (void*)(&cuTexRefSetBorderColor_error), + (void*)(&cuTexRefSetFlags_error), + (void*)(&cuTexRefGetAddress_v2_error), + (void*)(&cuTexRefGetArray_error), + (void*)(&cuTexRefGetMipmappedArray_error), + (void*)(&cuTexRefGetAddressMode_error), + (void*)(&cuTexRefGetFilterMode_error), + (void*)(&cuTexRefGetFormat_error), + (void*)(&cuTexRefGetMipmapFilterMode_error), + (void*)(&cuTexRefGetMipmapLevelBias_error), + (void*)(&cuTexRefGetMipmapLevelClamp_error), + (void*)(&cuTexRefGetMaxAnisotropy_error), + (void*)(&cuTexRefGetBorderColor_error), + (void*)(&cuTexRefGetFlags_error), + (void*)(&cuTexRefCreate_error), + (void*)(&cuTexRefDestroy_error), + (void*)(&cuSurfRefSetArray_error), + (void*)(&cuSurfRefGetArray_error), + (void*)(&cuTexObjectCreate_error), + (void*)(&cuTexObjectDestroy_error), + (void*)(&cuTexObjectGetResourceDesc_error), + (void*)(&cuTexObjectGetTextureDesc_error), + 
(void*)(&cuTexObjectGetResourceViewDesc_error), + (void*)(&cuSurfObjectCreate_error), + (void*)(&cuSurfObjectDestroy_error), + (void*)(&cuSurfObjectGetResourceDesc_error), + (void*)(&cuDeviceCanAccessPeer_error), + (void*)(&cuDeviceGetP2PAttribute_error), + (void*)(&cuCtxEnablePeerAccess_error), + (void*)(&cuCtxDisablePeerAccess_error), + (void*)(&cuGraphicsUnregisterResource_error), + (void*)(&cuGraphicsSubResourceGetMappedArray_error), + (void*)(&cuGraphicsResourceGetMappedMipmappedArray_error), + (void*)(&cuGraphicsResourceGetMappedPointer_v2_error), + (void*)(&cuGraphicsResourceSetMapFlags_v2_error), + (void*)(&cuGraphicsMapResources_ptsz_error), + (void*)(&cuGraphicsUnmapResources_ptsz_error), + (void*)(&cuGetExportTable_error), + (void*)(&cuMemHostRegister_error), + (void*)(&cuGraphicsResourceSetMapFlags_error), + (void*)(&cuLinkCreate_error), + (void*)(&cuLinkAddData_error), + (void*)(&cuLinkAddFile_error), + (void*)(&cuTexRefSetAddress2D_v2_error), + (void*)(&cuDeviceTotalMem_error), + (void*)(&cuCtxCreate_error), + (void*)(&cuModuleGetGlobal_error), + (void*)(&cuMemGetInfo_error), + (void*)(&cuMemAlloc_error), + (void*)(&cuMemAllocPitch_error), + (void*)(&cuMemFree_error), + (void*)(&cuMemGetAddressRange_error), + (void*)(&cuMemAllocHost_error), + (void*)(&cuMemHostGetDevicePointer_error), + (void*)(&cuMemcpyHtoD_error), + (void*)(&cuMemcpyDtoH_error), + (void*)(&cuMemcpyDtoD_error), + (void*)(&cuMemcpyDtoA_error), + (void*)(&cuMemcpyAtoD_error), + (void*)(&cuMemcpyHtoA_error), + (void*)(&cuMemcpyAtoH_error), + (void*)(&cuMemcpyAtoA_error), + (void*)(&cuMemcpyHtoAAsync_error), + (void*)(&cuMemcpyAtoHAsync_error), + (void*)(&cuMemcpy2D_error), + (void*)(&cuMemcpy2DUnaligned_error), + (void*)(&cuMemcpy3D_error), + (void*)(&cuMemcpyHtoDAsync_error), + (void*)(&cuMemcpyDtoHAsync_error), + (void*)(&cuMemcpyDtoDAsync_error), + (void*)(&cuMemcpy2DAsync_error), + (void*)(&cuMemcpy3DAsync_error), + (void*)(&cuMemsetD8_error), + (void*)(&cuMemsetD16_error), + (void*)(&cuMemsetD32_error), + (void*)(&cuMemsetD2D8_error), + (void*)(&cuMemsetD2D16_error), + (void*)(&cuMemsetD2D32_error), + (void*)(&cuArrayCreate_error), + (void*)(&cuArrayGetDescriptor_error), + (void*)(&cuArray3DCreate_error), + (void*)(&cuArray3DGetDescriptor_error), + (void*)(&cuTexRefSetAddress_error), + (void*)(&cuTexRefSetAddress2D_error), + (void*)(&cuTexRefGetAddress_error), + (void*)(&cuGraphicsResourceGetMappedPointer_error), + (void*)(&cuCtxDestroy_error), + (void*)(&cuCtxPopCurrent_error), + (void*)(&cuCtxPushCurrent_error), + (void*)(&cuStreamDestroy_error), + (void*)(&cuEventDestroy_error), + (void*)(&cuMemcpyHtoD_v2_error), + (void*)(&cuMemcpyDtoH_v2_error), + (void*)(&cuMemcpyDtoD_v2_error), + (void*)(&cuMemcpyDtoA_v2_error), + (void*)(&cuMemcpyAtoD_v2_error), + (void*)(&cuMemcpyHtoA_v2_error), + (void*)(&cuMemcpyAtoH_v2_error), + (void*)(&cuMemcpyAtoA_v2_error), + (void*)(&cuMemcpyHtoAAsync_v2_error), + (void*)(&cuMemcpyAtoHAsync_v2_error), + (void*)(&cuMemcpy2D_v2_error), + (void*)(&cuMemcpy2DUnaligned_v2_error), + (void*)(&cuMemcpy3D_v2_error), + (void*)(&cuMemcpyHtoDAsync_v2_error), + (void*)(&cuMemcpyDtoHAsync_v2_error), + (void*)(&cuMemcpyDtoDAsync_v2_error), + (void*)(&cuMemcpy2DAsync_v2_error), + (void*)(&cuMemcpy3DAsync_v2_error), + (void*)(&cuMemsetD8_v2_error), + (void*)(&cuMemsetD16_v2_error), + (void*)(&cuMemsetD32_v2_error), + (void*)(&cuMemsetD2D8_v2_error), + (void*)(&cuMemsetD2D16_v2_error), + (void*)(&cuMemsetD2D32_v2_error), + (void*)(&cuMemcpy_error), + (void*)(&cuMemcpyAsync_error), + 
(void*)(&cuMemcpyPeer_error), + (void*)(&cuMemcpyPeerAsync_error), + (void*)(&cuMemcpy3DPeer_error), + (void*)(&cuMemcpy3DPeerAsync_error), + (void*)(&cuMemsetD8Async_error), + (void*)(&cuMemsetD16Async_error), + (void*)(&cuMemsetD32Async_error), + (void*)(&cuMemsetD2D8Async_error), + (void*)(&cuMemsetD2D16Async_error), + (void*)(&cuMemsetD2D32Async_error), + (void*)(&cuStreamGetPriority_error), + (void*)(&cuStreamGetFlags_error), + (void*)(&cuStreamWaitEvent_error), + (void*)(&cuStreamAddCallback_error), + (void*)(&cuStreamAttachMemAsync_error), + (void*)(&cuStreamQuery_error), + (void*)(&cuStreamSynchronize_error), + (void*)(&cuEventRecord_error), + (void*)(&cuLaunchKernel_error), + (void*)(&cuGraphicsMapResources_error), + (void*)(&cuGraphicsUnmapResources_error), + (void*)(&cuMemPrefetchAsync_error), + (void*)(&cuStreamWriteValue32_error), + (void*)(&cuStreamWaitValue32_error), + (void*)(&cuStreamBatchMemOp_error), + (void*)(&cuProfilerInitialize_error), + (void*)(&cuProfilerStart_error), + (void*)(&cuProfilerStop_error)}; +static const char* const g_func_name[NR_FUNC] = {"cuGetErrorString", + "cuGetErrorName", + "cuInit", + "cuDriverGetVersion", + "cuDeviceGet", + "cuDeviceGetCount", + "cuDeviceGetName", + "cuDeviceTotalMem_v2", + "cuDeviceGetAttribute", + "cuDeviceGetProperties", + "cuDeviceComputeCapability", + "cuDevicePrimaryCtxRetain", + "cuDevicePrimaryCtxRelease", + "cuDevicePrimaryCtxSetFlags", + "cuDevicePrimaryCtxGetState", + "cuDevicePrimaryCtxReset", + "cuCtxCreate_v2", + "cuCtxDestroy_v2", + "cuCtxPushCurrent_v2", + "cuCtxPopCurrent_v2", + "cuCtxSetCurrent", + "cuCtxGetCurrent", + "cuCtxGetDevice", + "cuCtxGetFlags", + "cuCtxSynchronize", + "cuCtxSetLimit", + "cuCtxGetLimit", + "cuCtxGetCacheConfig", + "cuCtxSetCacheConfig", + "cuCtxGetSharedMemConfig", + "cuCtxSetSharedMemConfig", + "cuCtxGetApiVersion", + "cuCtxGetStreamPriorityRange", + "cuCtxAttach", + "cuCtxDetach", + "cuModuleLoad", + "cuModuleLoadData", + "cuModuleLoadDataEx", + "cuModuleLoadFatBinary", + "cuModuleUnload", + "cuModuleGetFunction", + "cuModuleGetGlobal_v2", + "cuModuleGetTexRef", + "cuModuleGetSurfRef", + "cuLinkCreate_v2", + "cuLinkAddData_v2", + "cuLinkAddFile_v2", + "cuLinkComplete", + "cuLinkDestroy", + "cuMemGetInfo_v2", + "cuMemAlloc_v2", + "cuMemAllocPitch_v2", + "cuMemFree_v2", + "cuMemGetAddressRange_v2", + "cuMemAllocHost_v2", + "cuMemFreeHost", + "cuMemHostAlloc", + "cuMemHostGetDevicePointer_v2", + "cuMemHostGetFlags", + "cuMemAllocManaged", + "cuDeviceGetByPCIBusId", + "cuDeviceGetPCIBusId", + "cuIpcGetEventHandle", + "cuIpcOpenEventHandle", + "cuIpcGetMemHandle", + "cuIpcOpenMemHandle", + "cuIpcCloseMemHandle", + "cuMemHostRegister_v2", + "cuMemHostUnregister", + "cuMemcpy_ptds", + "cuMemcpyPeer_ptds", + "cuMemcpyHtoD_v2_ptds", + "cuMemcpyDtoH_v2_ptds", + "cuMemcpyDtoD_v2_ptds", + "cuMemcpyDtoA_v2_ptds", + "cuMemcpyAtoD_v2_ptds", + "cuMemcpyHtoA_v2_ptds", + "cuMemcpyAtoH_v2_ptds", + "cuMemcpyAtoA_v2_ptds", + "cuMemcpy2D_v2_ptds", + "cuMemcpy2DUnaligned_v2_ptds", + "cuMemcpy3D_v2_ptds", + "cuMemcpy3DPeer_ptds", + "cuMemcpyAsync_ptsz", + "cuMemcpyPeerAsync_ptsz", + "cuMemcpyHtoDAsync_v2_ptsz", + "cuMemcpyDtoHAsync_v2_ptsz", + "cuMemcpyDtoDAsync_v2_ptsz", + "cuMemcpyHtoAAsync_v2_ptsz", + "cuMemcpyAtoHAsync_v2_ptsz", + "cuMemcpy2DAsync_v2_ptsz", + "cuMemcpy3DAsync_v2_ptsz", + "cuMemcpy3DPeerAsync_ptsz", + "cuMemsetD8_v2_ptds", + "cuMemsetD16_v2_ptds", + "cuMemsetD32_v2_ptds", + "cuMemsetD2D8_v2_ptds", + "cuMemsetD2D16_v2_ptds", + "cuMemsetD2D32_v2_ptds", + "cuMemsetD8Async_ptsz", + 
"cuMemsetD16Async_ptsz", + "cuMemsetD32Async_ptsz", + "cuMemsetD2D8Async_ptsz", + "cuMemsetD2D16Async_ptsz", + "cuMemsetD2D32Async_ptsz", + "cuArrayCreate_v2", + "cuArrayGetDescriptor_v2", + "cuArrayDestroy", + "cuArray3DCreate_v2", + "cuArray3DGetDescriptor_v2", + "cuMipmappedArrayCreate", + "cuMipmappedArrayGetLevel", + "cuMipmappedArrayDestroy", + "cuPointerGetAttribute", + "cuMemPrefetchAsync_ptsz", + "cuMemAdvise", + "cuMemRangeGetAttribute", + "cuMemRangeGetAttributes", + "cuPointerSetAttribute", + "cuPointerGetAttributes", + "cuStreamCreate", + "cuStreamCreateWithPriority", + "cuStreamGetPriority_ptsz", + "cuStreamGetFlags_ptsz", + "cuStreamWaitEvent_ptsz", + "cuStreamAddCallback_ptsz", + "cuStreamAttachMemAsync_ptsz", + "cuStreamQuery_ptsz", + "cuStreamSynchronize_ptsz", + "cuStreamDestroy_v2", + "cuEventCreate", + "cuEventRecord_ptsz", + "cuEventQuery", + "cuEventSynchronize", + "cuEventDestroy_v2", + "cuEventElapsedTime", + "cuStreamWaitValue32_ptsz", + "cuStreamWriteValue32_ptsz", + "cuStreamBatchMemOp_ptsz", + "cuFuncGetAttribute", + "cuFuncSetCacheConfig", + "cuFuncSetSharedMemConfig", + "cuLaunchKernel_ptsz", + "cuFuncSetBlockShape", + "cuFuncSetSharedSize", + "cuParamSetSize", + "cuParamSeti", + "cuParamSetf", + "cuParamSetv", + "cuLaunch", + "cuLaunchGrid", + "cuLaunchGridAsync", + "cuParamSetTexRef", + "cuOccupancyMaxActiveBlocksPerMultiprocessor", + "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + "cuOccupancyMaxPotentialBlockSize", + "cuOccupancyMaxPotentialBlockSizeWithFlags", + "cuTexRefSetArray", + "cuTexRefSetMipmappedArray", + "cuTexRefSetAddress_v2", + "cuTexRefSetAddress2D_v3", + "cuTexRefSetFormat", + "cuTexRefSetAddressMode", + "cuTexRefSetFilterMode", + "cuTexRefSetMipmapFilterMode", + "cuTexRefSetMipmapLevelBias", + "cuTexRefSetMipmapLevelClamp", + "cuTexRefSetMaxAnisotropy", + "cuTexRefSetBorderColor", + "cuTexRefSetFlags", + "cuTexRefGetAddress_v2", + "cuTexRefGetArray", + "cuTexRefGetMipmappedArray", + "cuTexRefGetAddressMode", + "cuTexRefGetFilterMode", + "cuTexRefGetFormat", + "cuTexRefGetMipmapFilterMode", + "cuTexRefGetMipmapLevelBias", + "cuTexRefGetMipmapLevelClamp", + "cuTexRefGetMaxAnisotropy", + "cuTexRefGetBorderColor", + "cuTexRefGetFlags", + "cuTexRefCreate", + "cuTexRefDestroy", + "cuSurfRefSetArray", + "cuSurfRefGetArray", + "cuTexObjectCreate", + "cuTexObjectDestroy", + "cuTexObjectGetResourceDesc", + "cuTexObjectGetTextureDesc", + "cuTexObjectGetResourceViewDesc", + "cuSurfObjectCreate", + "cuSurfObjectDestroy", + "cuSurfObjectGetResourceDesc", + "cuDeviceCanAccessPeer", + "cuDeviceGetP2PAttribute", + "cuCtxEnablePeerAccess", + "cuCtxDisablePeerAccess", + "cuGraphicsUnregisterResource", + "cuGraphicsSubResourceGetMappedArray", + "cuGraphicsResourceGetMappedMipmappedArray", + "cuGraphicsResourceGetMappedPointer_v2", + "cuGraphicsResourceSetMapFlags_v2", + "cuGraphicsMapResources_ptsz", + "cuGraphicsUnmapResources_ptsz", + "cuGetExportTable", + "cuMemHostRegister", + "cuGraphicsResourceSetMapFlags", + "cuLinkCreate", + "cuLinkAddData", + "cuLinkAddFile", + "cuTexRefSetAddress2D_v2", + "cuDeviceTotalMem", + "cuCtxCreate", + "cuModuleGetGlobal", + "cuMemGetInfo", + "cuMemAlloc", + "cuMemAllocPitch", + "cuMemFree", + "cuMemGetAddressRange", + "cuMemAllocHost", + "cuMemHostGetDevicePointer", + "cuMemcpyHtoD", + "cuMemcpyDtoH", + "cuMemcpyDtoD", + "cuMemcpyDtoA", + "cuMemcpyAtoD", + "cuMemcpyHtoA", + "cuMemcpyAtoH", + "cuMemcpyAtoA", + "cuMemcpyHtoAAsync", + "cuMemcpyAtoHAsync", + "cuMemcpy2D", + "cuMemcpy2DUnaligned", + "cuMemcpy3D", + 
"cuMemcpyHtoDAsync", + "cuMemcpyDtoHAsync", + "cuMemcpyDtoDAsync", + "cuMemcpy2DAsync", + "cuMemcpy3DAsync", + "cuMemsetD8", + "cuMemsetD16", + "cuMemsetD32", + "cuMemsetD2D8", + "cuMemsetD2D16", + "cuMemsetD2D32", + "cuArrayCreate", + "cuArrayGetDescriptor", + "cuArray3DCreate", + "cuArray3DGetDescriptor", + "cuTexRefSetAddress", + "cuTexRefSetAddress2D", + "cuTexRefGetAddress", + "cuGraphicsResourceGetMappedPointer", + "cuCtxDestroy", + "cuCtxPopCurrent", + "cuCtxPushCurrent", + "cuStreamDestroy", + "cuEventDestroy", + "cuMemcpyHtoD_v2", + "cuMemcpyDtoH_v2", + "cuMemcpyDtoD_v2", + "cuMemcpyDtoA_v2", + "cuMemcpyAtoD_v2", + "cuMemcpyHtoA_v2", + "cuMemcpyAtoH_v2", + "cuMemcpyAtoA_v2", + "cuMemcpyHtoAAsync_v2", + "cuMemcpyAtoHAsync_v2", + "cuMemcpy2D_v2", + "cuMemcpy2DUnaligned_v2", + "cuMemcpy3D_v2", + "cuMemcpyHtoDAsync_v2", + "cuMemcpyDtoHAsync_v2", + "cuMemcpyDtoDAsync_v2", + "cuMemcpy2DAsync_v2", + "cuMemcpy3DAsync_v2", + "cuMemsetD8_v2", + "cuMemsetD16_v2", + "cuMemsetD32_v2", + "cuMemsetD2D8_v2", + "cuMemsetD2D16_v2", + "cuMemsetD2D32_v2", + "cuMemcpy", + "cuMemcpyAsync", + "cuMemcpyPeer", + "cuMemcpyPeerAsync", + "cuMemcpy3DPeer", + "cuMemcpy3DPeerAsync", + "cuMemsetD8Async", + "cuMemsetD16Async", + "cuMemsetD32Async", + "cuMemsetD2D8Async", + "cuMemsetD2D16Async", + "cuMemsetD2D32Async", + "cuStreamGetPriority", + "cuStreamGetFlags", + "cuStreamWaitEvent", + "cuStreamAddCallback", + "cuStreamAttachMemAsync", + "cuStreamQuery", + "cuStreamSynchronize", + "cuEventRecord", + "cuLaunchKernel", + "cuGraphicsMapResources", + "cuGraphicsUnmapResources", + "cuMemPrefetchAsync", + "cuStreamWriteValue32", + "cuStreamWaitValue32", + "cuStreamBatchMemOp", + "cuProfilerInitialize", + "cuProfilerStart", + "cuProfilerStop"}; + +static void load_library() { + static bool done = false; + static std::mutex mtx; + std::lock_guard lg{mtx}; + + if (done) + return; + + void* handle = get_library_handle(); + for (size_t i = 0; i < NR_FUNC; ++i) { + void* func; + if (!handle) { + func = nullptr; + } else { + func = resolve_library_func(handle, g_func_name[i]); + } + if (!func) { + func = g_func_table_error[i]; + } + __atomic_store_n(g_func_table + i, func, __ATOMIC_RELAXED); + } + done = true; +} + +CUresult _WRAPLIB_API_CALL cuGetErrorString(CUresult arg0, const char **arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUresult, const char **); + ON_ENTRY(cuGetErrorString); + f_ptr_t f = (f_ptr_t)(g_func_table[0]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuGetErrorName(CUresult arg0, const char **arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUresult, const char **); + ON_ENTRY(cuGetErrorName); + f_ptr_t f = (f_ptr_t)(g_func_table[1]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuInit(unsigned int arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int); + ON_ENTRY(cuInit); + f_ptr_t f = (f_ptr_t)(g_func_table[2]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuDriverGetVersion(int *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *); + ON_ENTRY(cuDriverGetVersion); + f_ptr_t f = (f_ptr_t)(g_func_table[3]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuDeviceGet(CUdevice *arg0, int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdevice *, int); + ON_ENTRY(cuDeviceGet); + f_ptr_t f = (f_ptr_t)(g_func_table[4]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuDeviceGetCount(int *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *); + ON_ENTRY(cuDeviceGetCount); + f_ptr_t f = (f_ptr_t)(g_func_table[5]); + 
return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuDeviceGetName(char *arg0, int arg1, CUdevice arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(char *, int, CUdevice); + ON_ENTRY(cuDeviceGetName); + f_ptr_t f = (f_ptr_t)(g_func_table[6]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuDeviceTotalMem_v2(size_t *arg0, CUdevice arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(size_t *, CUdevice); + ON_ENTRY(cuDeviceTotalMem_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[7]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuDeviceGetAttribute(int *arg0, CUdevice_attribute arg1, CUdevice arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, CUdevice_attribute, CUdevice); + ON_ENTRY(cuDeviceGetAttribute); + f_ptr_t f = (f_ptr_t)(g_func_table[8]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuDeviceGetProperties(CUdevprop *arg0, CUdevice arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdevprop *, CUdevice); + ON_ENTRY(cuDeviceGetProperties); + f_ptr_t f = (f_ptr_t)(g_func_table[9]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuDeviceComputeCapability(int *arg0, int *arg1, CUdevice arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, int *, CUdevice); + ON_ENTRY(cuDeviceComputeCapability); + f_ptr_t f = (f_ptr_t)(g_func_table[10]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxRetain(CUcontext *arg0, CUdevice arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext *, CUdevice); + ON_ENTRY(cuDevicePrimaryCtxRetain); + f_ptr_t f = (f_ptr_t)(g_func_table[11]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxRelease(CUdevice arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdevice); + ON_ENTRY(cuDevicePrimaryCtxRelease); + f_ptr_t f = (f_ptr_t)(g_func_table[12]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxSetFlags(CUdevice arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdevice, unsigned int); + ON_ENTRY(cuDevicePrimaryCtxSetFlags); + f_ptr_t f = (f_ptr_t)(g_func_table[13]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxGetState(CUdevice arg0, unsigned int *arg1, int *arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdevice, unsigned int *, int *); + ON_ENTRY(cuDevicePrimaryCtxGetState); + f_ptr_t f = (f_ptr_t)(g_func_table[14]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuDevicePrimaryCtxReset(CUdevice arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdevice); + ON_ENTRY(cuDevicePrimaryCtxReset); + f_ptr_t f = (f_ptr_t)(g_func_table[15]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxCreate_v2(CUcontext *arg0, unsigned int arg1, CUdevice arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext *, unsigned int, CUdevice); + ON_ENTRY(cuCtxCreate_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[16]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuCtxDestroy_v2(CUcontext arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext); + ON_ENTRY(cuCtxDestroy_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[17]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxPushCurrent_v2(CUcontext arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext); + ON_ENTRY(cuCtxPushCurrent_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[18]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxPopCurrent_v2(CUcontext *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext *); + ON_ENTRY(cuCtxPopCurrent_v2); 
+ f_ptr_t f = (f_ptr_t)(g_func_table[19]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxSetCurrent(CUcontext arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext); + ON_ENTRY(cuCtxSetCurrent); + f_ptr_t f = (f_ptr_t)(g_func_table[20]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxGetCurrent(CUcontext *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext *); + ON_ENTRY(cuCtxGetCurrent); + f_ptr_t f = (f_ptr_t)(g_func_table[21]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxGetDevice(CUdevice *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdevice *); + ON_ENTRY(cuCtxGetDevice); + f_ptr_t f = (f_ptr_t)(g_func_table[22]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxGetFlags(unsigned int *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int *); + ON_ENTRY(cuCtxGetFlags); + f_ptr_t f = (f_ptr_t)(g_func_table[23]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxSynchronize() { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(); + ON_ENTRY(cuCtxSynchronize); + f_ptr_t f = (f_ptr_t)(g_func_table[24]); + return f(); +} +CUresult _WRAPLIB_API_CALL cuCtxSetLimit(CUlimit arg0, size_t arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUlimit, size_t); + ON_ENTRY(cuCtxSetLimit); + f_ptr_t f = (f_ptr_t)(g_func_table[25]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuCtxGetLimit(size_t *arg0, CUlimit arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(size_t *, CUlimit); + ON_ENTRY(cuCtxGetLimit); + f_ptr_t f = (f_ptr_t)(g_func_table[26]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuCtxGetCacheConfig(CUfunc_cache *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunc_cache *); + ON_ENTRY(cuCtxGetCacheConfig); + f_ptr_t f = (f_ptr_t)(g_func_table[27]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxSetCacheConfig(CUfunc_cache arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunc_cache); + ON_ENTRY(cuCtxSetCacheConfig); + f_ptr_t f = (f_ptr_t)(g_func_table[28]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxGetSharedMemConfig(CUsharedconfig *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUsharedconfig *); + ON_ENTRY(cuCtxGetSharedMemConfig); + f_ptr_t f = (f_ptr_t)(g_func_table[29]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxSetSharedMemConfig(CUsharedconfig arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUsharedconfig); + ON_ENTRY(cuCtxSetSharedMemConfig); + f_ptr_t f = (f_ptr_t)(g_func_table[30]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxGetApiVersion(CUcontext arg0, unsigned int *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext, unsigned int *); + ON_ENTRY(cuCtxGetApiVersion); + f_ptr_t f = (f_ptr_t)(g_func_table[31]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuCtxGetStreamPriorityRange(int *arg0, int *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, int *); + ON_ENTRY(cuCtxGetStreamPriorityRange); + f_ptr_t f = (f_ptr_t)(g_func_table[32]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuCtxAttach(CUcontext *arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext *, unsigned int); + ON_ENTRY(cuCtxAttach); + f_ptr_t f = (f_ptr_t)(g_func_table[33]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuCtxDetach(CUcontext arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext); + ON_ENTRY(cuCtxDetach); + f_ptr_t f = (f_ptr_t)(g_func_table[34]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL 
cuModuleLoad(CUmodule *arg0, const char *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUmodule *, const char *); + ON_ENTRY(cuModuleLoad); + f_ptr_t f = (f_ptr_t)(g_func_table[35]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuModuleLoadData(CUmodule *arg0, const void *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUmodule *, const void *); + ON_ENTRY(cuModuleLoadData); + f_ptr_t f = (f_ptr_t)(g_func_table[36]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuModuleLoadDataEx(CUmodule *arg0, const void *arg1, unsigned int arg2, CUjit_option *arg3, void **arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUmodule *, const void *, unsigned int, CUjit_option *, void **); + ON_ENTRY(cuModuleLoadDataEx); + f_ptr_t f = (f_ptr_t)(g_func_table[37]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuModuleLoadFatBinary(CUmodule *arg0, const void *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUmodule *, const void *); + ON_ENTRY(cuModuleLoadFatBinary); + f_ptr_t f = (f_ptr_t)(g_func_table[38]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuModuleUnload(CUmodule arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUmodule); + ON_ENTRY(cuModuleUnload); + f_ptr_t f = (f_ptr_t)(g_func_table[39]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuModuleGetFunction(CUfunction *arg0, CUmodule arg1, const char *arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction *, CUmodule, const char *); + ON_ENTRY(cuModuleGetFunction); + f_ptr_t f = (f_ptr_t)(g_func_table[40]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuModuleGetGlobal_v2(CUdeviceptr *arg0, size_t *arg1, CUmodule arg2, const char *arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr *, size_t *, CUmodule, const char *); + ON_ENTRY(cuModuleGetGlobal_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[41]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuModuleGetTexRef(CUtexref *arg0, CUmodule arg1, const char *arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref *, CUmodule, const char *); + ON_ENTRY(cuModuleGetTexRef); + f_ptr_t f = (f_ptr_t)(g_func_table[42]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuModuleGetSurfRef(CUsurfref *arg0, CUmodule arg1, const char *arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUsurfref *, CUmodule, const char *); + ON_ENTRY(cuModuleGetSurfRef); + f_ptr_t f = (f_ptr_t)(g_func_table[43]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuLinkCreate_v2(unsigned int arg0, CUjit_option *arg1, void **arg2, CUlinkState *arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int, CUjit_option *, void **, CUlinkState *); + ON_ENTRY(cuLinkCreate_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[44]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuLinkAddData_v2(CUlinkState arg0, CUjitInputType arg1, void *arg2, size_t arg3, const char *arg4, unsigned int arg5, CUjit_option *arg6, void **arg7) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned int, CUjit_option *, void **); + ON_ENTRY(cuLinkAddData_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[45]); + return f(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7); +} +CUresult _WRAPLIB_API_CALL cuLinkAddFile_v2(CUlinkState arg0, CUjitInputType arg1, const char *arg2, unsigned int arg3, CUjit_option *arg4, void **arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUlinkState, 
CUjitInputType, const char *, unsigned int, CUjit_option *, void **); + ON_ENTRY(cuLinkAddFile_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[46]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuLinkComplete(CUlinkState arg0, void **arg1, size_t *arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUlinkState, void **, size_t *); + ON_ENTRY(cuLinkComplete); + f_ptr_t f = (f_ptr_t)(g_func_table[47]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuLinkDestroy(CUlinkState arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUlinkState); + ON_ENTRY(cuLinkDestroy); + f_ptr_t f = (f_ptr_t)(g_func_table[48]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemGetInfo_v2(size_t *arg0, size_t *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(size_t *, size_t *); + ON_ENTRY(cuMemGetInfo_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[49]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemAlloc_v2(CUdeviceptr *arg0, size_t arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr *, size_t); + ON_ENTRY(cuMemAlloc_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[50]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemAllocPitch_v2(CUdeviceptr *arg0, size_t *arg1, size_t arg2, size_t arg3, unsigned int arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr *, size_t *, size_t, size_t, unsigned int); + ON_ENTRY(cuMemAllocPitch_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[51]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemFree_v2(CUdeviceptr arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr); + ON_ENTRY(cuMemFree_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[52]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemGetAddressRange_v2(CUdeviceptr *arg0, size_t *arg1, CUdeviceptr arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr *, size_t *, CUdeviceptr); + ON_ENTRY(cuMemGetAddressRange_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[53]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemAllocHost_v2(void **arg0, size_t arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void **, size_t); + ON_ENTRY(cuMemAllocHost_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[54]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemFreeHost(void *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *); + ON_ENTRY(cuMemFreeHost); + f_ptr_t f = (f_ptr_t)(g_func_table[55]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemHostAlloc(void **arg0, size_t arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void **, size_t, unsigned int); + ON_ENTRY(cuMemHostAlloc); + f_ptr_t f = (f_ptr_t)(g_func_table[56]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemHostGetDevicePointer_v2(CUdeviceptr *arg0, void *arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr *, void *, unsigned int); + ON_ENTRY(cuMemHostGetDevicePointer_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[57]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemHostGetFlags(unsigned int *arg0, void *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int *, void *); + ON_ENTRY(cuMemHostGetFlags); + f_ptr_t f = (f_ptr_t)(g_func_table[58]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemAllocManaged(CUdeviceptr *arg0, size_t arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr *, size_t, unsigned int); + ON_ENTRY(cuMemAllocManaged); + f_ptr_t f 
= (f_ptr_t)(g_func_table[59]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuDeviceGetByPCIBusId(CUdevice *arg0, const char *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdevice *, const char *); + ON_ENTRY(cuDeviceGetByPCIBusId); + f_ptr_t f = (f_ptr_t)(g_func_table[60]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuDeviceGetPCIBusId(char *arg0, int arg1, CUdevice arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(char *, int, CUdevice); + ON_ENTRY(cuDeviceGetPCIBusId); + f_ptr_t f = (f_ptr_t)(g_func_table[61]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuIpcGetEventHandle(CUipcEventHandle *arg0, CUevent arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUipcEventHandle *, CUevent); + ON_ENTRY(cuIpcGetEventHandle); + f_ptr_t f = (f_ptr_t)(g_func_table[62]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuIpcOpenEventHandle(CUevent *arg0, CUipcEventHandle arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUevent *, CUipcEventHandle); + ON_ENTRY(cuIpcOpenEventHandle); + f_ptr_t f = (f_ptr_t)(g_func_table[63]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuIpcGetMemHandle(CUipcMemHandle *arg0, CUdeviceptr arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUipcMemHandle *, CUdeviceptr); + ON_ENTRY(cuIpcGetMemHandle); + f_ptr_t f = (f_ptr_t)(g_func_table[64]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuIpcOpenMemHandle(CUdeviceptr *arg0, CUipcMemHandle arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr *, CUipcMemHandle, unsigned int); + ON_ENTRY(cuIpcOpenMemHandle); + f_ptr_t f = (f_ptr_t)(g_func_table[65]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuIpcCloseMemHandle(CUdeviceptr arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr); + ON_ENTRY(cuIpcCloseMemHandle); + f_ptr_t f = (f_ptr_t)(g_func_table[66]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemHostRegister_v2(void *arg0, size_t arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, size_t, unsigned int); + ON_ENTRY(cuMemHostRegister_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[67]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemHostUnregister(void *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *); + ON_ENTRY(cuMemHostUnregister); + f_ptr_t f = (f_ptr_t)(g_func_table[68]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpy_ptds(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUdeviceptr, size_t); + ON_ENTRY(cuMemcpy_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[69]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyPeer_ptds(CUdeviceptr arg0, CUcontext arg1, CUdeviceptr arg2, CUcontext arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t); + ON_ENTRY(cuMemcpyPeer_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[70]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoD_v2_ptds(CUdeviceptr arg0, const void *arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, const void *, size_t); + ON_ENTRY(cuMemcpyHtoD_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[71]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoH_v2_ptds(void *arg0, CUdeviceptr arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUdeviceptr, size_t); + 
ON_ENTRY(cuMemcpyDtoH_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[72]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoD_v2_ptds(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUdeviceptr, size_t); + ON_ENTRY(cuMemcpyDtoD_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[73]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoA_v2_ptds(CUarray arg0, size_t arg1, CUdeviceptr arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, size_t, CUdeviceptr, size_t); + ON_ENTRY(cuMemcpyDtoA_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[74]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoD_v2_ptds(CUdeviceptr arg0, CUarray arg1, size_t arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUarray, size_t, size_t); + ON_ENTRY(cuMemcpyAtoD_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[75]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoA_v2_ptds(CUarray arg0, size_t arg1, const void *arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, size_t, const void *, size_t); + ON_ENTRY(cuMemcpyHtoA_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[76]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoH_v2_ptds(void *arg0, CUarray arg1, size_t arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUarray, size_t, size_t); + ON_ENTRY(cuMemcpyAtoH_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[77]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoA_v2_ptds(CUarray arg0, size_t arg1, CUarray arg2, size_t arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, size_t, CUarray, size_t, size_t); + ON_ENTRY(cuMemcpyAtoA_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[78]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpy2D_v2_ptds(const CUDA_MEMCPY2D *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY2D *); + ON_ENTRY(cuMemcpy2D_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[79]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpy2DUnaligned_v2_ptds(const CUDA_MEMCPY2D *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY2D *); + ON_ENTRY(cuMemcpy2DUnaligned_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[80]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3D_v2_ptds(const CUDA_MEMCPY3D *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D *); + ON_ENTRY(cuMemcpy3D_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[81]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3DPeer_ptds(const CUDA_MEMCPY3D_PEER *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D_PEER *); + ON_ENTRY(cuMemcpy3DPeer_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[82]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAsync_ptsz(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + ON_ENTRY(cuMemcpyAsync_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[83]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyPeerAsync_ptsz(CUdeviceptr arg0, CUcontext arg1, CUdeviceptr arg2, CUcontext arg3, size_t arg4, CUstream arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, 
CUstream); + ON_ENTRY(cuMemcpyPeerAsync_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[84]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoDAsync_v2_ptsz(CUdeviceptr arg0, const void *arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, const void *, size_t, CUstream); + ON_ENTRY(cuMemcpyHtoDAsync_v2_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[85]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoHAsync_v2_ptsz(void *arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUdeviceptr, size_t, CUstream); + ON_ENTRY(cuMemcpyDtoHAsync_v2_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[86]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoDAsync_v2_ptsz(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + ON_ENTRY(cuMemcpyDtoDAsync_v2_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[87]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoAAsync_v2_ptsz(CUarray arg0, size_t arg1, const void *arg2, size_t arg3, CUstream arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, size_t, const void *, size_t, CUstream); + ON_ENTRY(cuMemcpyHtoAAsync_v2_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[88]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoHAsync_v2_ptsz(void *arg0, CUarray arg1, size_t arg2, size_t arg3, CUstream arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUarray, size_t, size_t, CUstream); + ON_ENTRY(cuMemcpyAtoHAsync_v2_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[89]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpy2DAsync_v2_ptsz(const CUDA_MEMCPY2D *arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY2D *, CUstream); + ON_ENTRY(cuMemcpy2DAsync_v2_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[90]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3DAsync_v2_ptsz(const CUDA_MEMCPY3D *arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D *, CUstream); + ON_ENTRY(cuMemcpy3DAsync_v2_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[91]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3DPeerAsync_ptsz(const CUDA_MEMCPY3D_PEER *arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D_PEER *, CUstream); + ON_ENTRY(cuMemcpy3DPeerAsync_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[92]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemsetD8_v2_ptds(CUdeviceptr arg0, unsigned char arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned char, size_t); + ON_ENTRY(cuMemsetD8_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[93]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemsetD16_v2_ptds(CUdeviceptr arg0, unsigned short arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned short, size_t); + ON_ENTRY(cuMemsetD16_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[94]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemsetD32_v2_ptds(CUdeviceptr arg0, unsigned int arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned int, size_t); + ON_ENTRY(cuMemsetD32_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[95]); + return 
f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D8_v2_ptds(CUdeviceptr arg0, size_t arg1, unsigned char arg2, size_t arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned char, size_t, size_t); + ON_ENTRY(cuMemsetD2D8_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[96]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D16_v2_ptds(CUdeviceptr arg0, size_t arg1, unsigned short arg2, size_t arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned short, size_t, size_t); + ON_ENTRY(cuMemsetD2D16_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[97]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D32_v2_ptds(CUdeviceptr arg0, size_t arg1, unsigned int arg2, size_t arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned int, size_t, size_t); + ON_ENTRY(cuMemsetD2D32_v2_ptds); + f_ptr_t f = (f_ptr_t)(g_func_table[98]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemsetD8Async_ptsz(CUdeviceptr arg0, unsigned char arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned char, size_t, CUstream); + ON_ENTRY(cuMemsetD8Async_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[99]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemsetD16Async_ptsz(CUdeviceptr arg0, unsigned short arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned short, size_t, CUstream); + ON_ENTRY(cuMemsetD16Async_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[100]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemsetD32Async_ptsz(CUdeviceptr arg0, unsigned int arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned int, size_t, CUstream); + ON_ENTRY(cuMemsetD32Async_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[101]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D8Async_ptsz(CUdeviceptr arg0, size_t arg1, unsigned char arg2, size_t arg3, size_t arg4, CUstream arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned char, size_t, size_t, CUstream); + ON_ENTRY(cuMemsetD2D8Async_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[102]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D16Async_ptsz(CUdeviceptr arg0, size_t arg1, unsigned short arg2, size_t arg3, size_t arg4, CUstream arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned short, size_t, size_t, CUstream); + ON_ENTRY(cuMemsetD2D16Async_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[103]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D32Async_ptsz(CUdeviceptr arg0, size_t arg1, unsigned int arg2, size_t arg3, size_t arg4, CUstream arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned int, size_t, size_t, CUstream); + ON_ENTRY(cuMemsetD2D32Async_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[104]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuArrayCreate_v2(CUarray *arg0, const CUDA_ARRAY_DESCRIPTOR *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *); + ON_ENTRY(cuArrayCreate_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[105]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL 
cuArrayGetDescriptor_v2(CUDA_ARRAY_DESCRIPTOR *arg0, CUarray arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUDA_ARRAY_DESCRIPTOR *, CUarray); + ON_ENTRY(cuArrayGetDescriptor_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[106]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuArrayDestroy(CUarray arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray); + ON_ENTRY(cuArrayDestroy); + f_ptr_t f = (f_ptr_t)(g_func_table[107]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuArray3DCreate_v2(CUarray *arg0, const CUDA_ARRAY3D_DESCRIPTOR *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *); + ON_ENTRY(cuArray3DCreate_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[108]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuArray3DGetDescriptor_v2(CUDA_ARRAY3D_DESCRIPTOR *arg0, CUarray arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray); + ON_ENTRY(cuArray3DGetDescriptor_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[109]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMipmappedArrayCreate(CUmipmappedArray *arg0, const CUDA_ARRAY3D_DESCRIPTOR *arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int); + ON_ENTRY(cuMipmappedArrayCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[110]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMipmappedArrayGetLevel(CUarray *arg0, CUmipmappedArray arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray *, CUmipmappedArray, unsigned int); + ON_ENTRY(cuMipmappedArrayGetLevel); + f_ptr_t f = (f_ptr_t)(g_func_table[111]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMipmappedArrayDestroy(CUmipmappedArray arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUmipmappedArray); + ON_ENTRY(cuMipmappedArrayDestroy); + f_ptr_t f = (f_ptr_t)(g_func_table[112]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuPointerGetAttribute(void *arg0, CUpointer_attribute arg1, CUdeviceptr arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUpointer_attribute, CUdeviceptr); + ON_ENTRY(cuPointerGetAttribute); + f_ptr_t f = (f_ptr_t)(g_func_table[113]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemPrefetchAsync_ptsz(CUdeviceptr arg0, size_t arg1, CUdevice arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, CUdevice, CUstream); + ON_ENTRY(cuMemPrefetchAsync_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[114]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemAdvise(CUdeviceptr arg0, size_t arg1, CUmem_advise arg2, CUdevice arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, CUmem_advise, CUdevice); + ON_ENTRY(cuMemAdvise); + f_ptr_t f = (f_ptr_t)(g_func_table[115]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemRangeGetAttribute(void *arg0, size_t arg1, CUmem_range_attribute arg2, CUdeviceptr arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, size_t, CUmem_range_attribute, CUdeviceptr, size_t); + ON_ENTRY(cuMemRangeGetAttribute); + f_ptr_t f = (f_ptr_t)(g_func_table[116]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemRangeGetAttributes(void **arg0, size_t *arg1, CUmem_range_attribute *arg2, size_t arg3, CUdeviceptr arg4, size_t arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void **, size_t *, CUmem_range_attribute *, 
size_t, CUdeviceptr, size_t); + ON_ENTRY(cuMemRangeGetAttributes); + f_ptr_t f = (f_ptr_t)(g_func_table[117]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuPointerSetAttribute(const void *arg0, CUpointer_attribute arg1, CUdeviceptr arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const void *, CUpointer_attribute, CUdeviceptr); + ON_ENTRY(cuPointerSetAttribute); + f_ptr_t f = (f_ptr_t)(g_func_table[118]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuPointerGetAttributes(unsigned int arg0, CUpointer_attribute *arg1, void **arg2, CUdeviceptr arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr); + ON_ENTRY(cuPointerGetAttributes); + f_ptr_t f = (f_ptr_t)(g_func_table[119]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuStreamCreate(CUstream *arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream *, unsigned int); + ON_ENTRY(cuStreamCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[120]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuStreamCreateWithPriority(CUstream *arg0, unsigned int arg1, int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream *, unsigned int, int); + ON_ENTRY(cuStreamCreateWithPriority); + f_ptr_t f = (f_ptr_t)(g_func_table[121]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuStreamGetPriority_ptsz(CUstream arg0, int *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, int *); + ON_ENTRY(cuStreamGetPriority_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[122]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuStreamGetFlags_ptsz(CUstream arg0, unsigned int *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, unsigned int *); + ON_ENTRY(cuStreamGetFlags_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[123]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuStreamWaitEvent_ptsz(CUstream arg0, CUevent arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUevent, unsigned int); + ON_ENTRY(cuStreamWaitEvent_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[124]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuStreamAddCallback_ptsz(CUstream arg0, CUstreamCallback arg1, void *arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUstreamCallback, void *, unsigned int); + ON_ENTRY(cuStreamAddCallback_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[125]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuStreamAttachMemAsync_ptsz(CUstream arg0, CUdeviceptr arg1, size_t arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUdeviceptr, size_t, unsigned int); + ON_ENTRY(cuStreamAttachMemAsync_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[126]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuStreamQuery_ptsz(CUstream arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream); + ON_ENTRY(cuStreamQuery_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[127]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuStreamSynchronize_ptsz(CUstream arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream); + ON_ENTRY(cuStreamSynchronize_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[128]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuStreamDestroy_v2(CUstream arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream); + ON_ENTRY(cuStreamDestroy_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[129]); + 
return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuEventCreate(CUevent *arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUevent *, unsigned int); + ON_ENTRY(cuEventCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[130]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuEventRecord_ptsz(CUevent arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUevent, CUstream); + ON_ENTRY(cuEventRecord_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[131]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuEventQuery(CUevent arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUevent); + ON_ENTRY(cuEventQuery); + f_ptr_t f = (f_ptr_t)(g_func_table[132]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuEventSynchronize(CUevent arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUevent); + ON_ENTRY(cuEventSynchronize); + f_ptr_t f = (f_ptr_t)(g_func_table[133]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuEventDestroy_v2(CUevent arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUevent); + ON_ENTRY(cuEventDestroy_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[134]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuEventElapsedTime(float *arg0, CUevent arg1, CUevent arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(float *, CUevent, CUevent); + ON_ENTRY(cuEventElapsedTime); + f_ptr_t f = (f_ptr_t)(g_func_table[135]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuStreamWaitValue32_ptsz(CUstream arg0, CUdeviceptr arg1, cuuint32_t arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + ON_ENTRY(cuStreamWaitValue32_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[136]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuStreamWriteValue32_ptsz(CUstream arg0, CUdeviceptr arg1, cuuint32_t arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + ON_ENTRY(cuStreamWriteValue32_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[137]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuStreamBatchMemOp_ptsz(CUstream arg0, unsigned int arg1, CUstreamBatchMemOpParams *arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, unsigned int, CUstreamBatchMemOpParams *, unsigned int); + ON_ENTRY(cuStreamBatchMemOp_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[138]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuFuncGetAttribute(int *arg0, CUfunction_attribute arg1, CUfunction arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, CUfunction_attribute, CUfunction); + ON_ENTRY(cuFuncGetAttribute); + f_ptr_t f = (f_ptr_t)(g_func_table[139]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuFuncSetCacheConfig(CUfunction arg0, CUfunc_cache arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, CUfunc_cache); + ON_ENTRY(cuFuncSetCacheConfig); + f_ptr_t f = (f_ptr_t)(g_func_table[140]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuFuncSetSharedMemConfig(CUfunction arg0, CUsharedconfig arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, CUsharedconfig); + ON_ENTRY(cuFuncSetSharedMemConfig); + f_ptr_t f = (f_ptr_t)(g_func_table[141]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuLaunchKernel_ptsz(CUfunction arg0, unsigned int arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4, unsigned int arg5, unsigned int arg6, unsigned int arg7, CUstream 
arg8, void **arg9, void **arg10) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **); + ON_ENTRY(cuLaunchKernel_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[142]); + return f(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10); +} +CUresult _WRAPLIB_API_CALL cuFuncSetBlockShape(CUfunction arg0, int arg1, int arg2, int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, int, int, int); + ON_ENTRY(cuFuncSetBlockShape); + f_ptr_t f = (f_ptr_t)(g_func_table[143]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuFuncSetSharedSize(CUfunction arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, unsigned int); + ON_ENTRY(cuFuncSetSharedSize); + f_ptr_t f = (f_ptr_t)(g_func_table[144]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuParamSetSize(CUfunction arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, unsigned int); + ON_ENTRY(cuParamSetSize); + f_ptr_t f = (f_ptr_t)(g_func_table[145]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuParamSeti(CUfunction arg0, int arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, int, unsigned int); + ON_ENTRY(cuParamSeti); + f_ptr_t f = (f_ptr_t)(g_func_table[146]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuParamSetf(CUfunction arg0, int arg1, float arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, int, float); + ON_ENTRY(cuParamSetf); + f_ptr_t f = (f_ptr_t)(g_func_table[147]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuParamSetv(CUfunction arg0, int arg1, void *arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, int, void *, unsigned int); + ON_ENTRY(cuParamSetv); + f_ptr_t f = (f_ptr_t)(g_func_table[148]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuLaunch(CUfunction arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction); + ON_ENTRY(cuLaunch); + f_ptr_t f = (f_ptr_t)(g_func_table[149]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuLaunchGrid(CUfunction arg0, int arg1, int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, int, int); + ON_ENTRY(cuLaunchGrid); + f_ptr_t f = (f_ptr_t)(g_func_table[150]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuLaunchGridAsync(CUfunction arg0, int arg1, int arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, int, int, CUstream); + ON_ENTRY(cuLaunchGridAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[151]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuParamSetTexRef(CUfunction arg0, int arg1, CUtexref arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, int, CUtexref); + ON_ENTRY(cuParamSetTexRef); + f_ptr_t f = (f_ptr_t)(g_func_table[152]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuOccupancyMaxActiveBlocksPerMultiprocessor(int *arg0, CUfunction arg1, int arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, CUfunction, int, size_t); + ON_ENTRY(cuOccupancyMaxActiveBlocksPerMultiprocessor); + f_ptr_t f = (f_ptr_t)(g_func_table[153]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *arg0, CUfunction arg1, int arg2, size_t arg3, unsigned int arg4) { + typedef CUresult 
(_WRAPLIB_API_CALL *f_ptr_t)(int *, CUfunction, int, size_t, unsigned int); + ON_ENTRY(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags); + f_ptr_t f = (f_ptr_t)(g_func_table[154]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuOccupancyMaxPotentialBlockSize(int *arg0, int *arg1, CUfunction arg2, CUoccupancyB2DSize arg3, size_t arg4, int arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int); + ON_ENTRY(cuOccupancyMaxPotentialBlockSize); + f_ptr_t f = (f_ptr_t)(g_func_table[155]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuOccupancyMaxPotentialBlockSizeWithFlags(int *arg0, int *arg1, CUfunction arg2, CUoccupancyB2DSize arg3, size_t arg4, int arg5, unsigned int arg6) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int); + ON_ENTRY(cuOccupancyMaxPotentialBlockSizeWithFlags); + f_ptr_t f = (f_ptr_t)(g_func_table[156]); + return f(arg0, arg1, arg2, arg3, arg4, arg5, arg6); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetArray(CUtexref arg0, CUarray arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, CUarray, unsigned int); + ON_ENTRY(cuTexRefSetArray); + f_ptr_t f = (f_ptr_t)(g_func_table[157]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetMipmappedArray(CUtexref arg0, CUmipmappedArray arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, CUmipmappedArray, unsigned int); + ON_ENTRY(cuTexRefSetMipmappedArray); + f_ptr_t f = (f_ptr_t)(g_func_table[158]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetAddress_v2(size_t *arg0, CUtexref arg1, CUdeviceptr arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(size_t *, CUtexref, CUdeviceptr, size_t); + ON_ENTRY(cuTexRefSetAddress_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[159]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetAddress2D_v3(CUtexref arg0, const CUDA_ARRAY_DESCRIPTOR *arg1, CUdeviceptr arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *, CUdeviceptr, size_t); + ON_ENTRY(cuTexRefSetAddress2D_v3); + f_ptr_t f = (f_ptr_t)(g_func_table[160]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetFormat(CUtexref arg0, CUarray_format arg1, int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, CUarray_format, int); + ON_ENTRY(cuTexRefSetFormat); + f_ptr_t f = (f_ptr_t)(g_func_table[161]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetAddressMode(CUtexref arg0, int arg1, CUaddress_mode arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, int, CUaddress_mode); + ON_ENTRY(cuTexRefSetAddressMode); + f_ptr_t f = (f_ptr_t)(g_func_table[162]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetFilterMode(CUtexref arg0, CUfilter_mode arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, CUfilter_mode); + ON_ENTRY(cuTexRefSetFilterMode); + f_ptr_t f = (f_ptr_t)(g_func_table[163]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetMipmapFilterMode(CUtexref arg0, CUfilter_mode arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, CUfilter_mode); + ON_ENTRY(cuTexRefSetMipmapFilterMode); + f_ptr_t f = (f_ptr_t)(g_func_table[164]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL 
cuTexRefSetMipmapLevelBias(CUtexref arg0, float arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, float); + ON_ENTRY(cuTexRefSetMipmapLevelBias); + f_ptr_t f = (f_ptr_t)(g_func_table[165]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetMipmapLevelClamp(CUtexref arg0, float arg1, float arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, float, float); + ON_ENTRY(cuTexRefSetMipmapLevelClamp); + f_ptr_t f = (f_ptr_t)(g_func_table[166]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetMaxAnisotropy(CUtexref arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, unsigned int); + ON_ENTRY(cuTexRefSetMaxAnisotropy); + f_ptr_t f = (f_ptr_t)(g_func_table[167]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetBorderColor(CUtexref arg0, float *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, float *); + ON_ENTRY(cuTexRefSetBorderColor); + f_ptr_t f = (f_ptr_t)(g_func_table[168]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetFlags(CUtexref arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, unsigned int); + ON_ENTRY(cuTexRefSetFlags); + f_ptr_t f = (f_ptr_t)(g_func_table[169]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetAddress_v2(CUdeviceptr *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr *, CUtexref); + ON_ENTRY(cuTexRefGetAddress_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[170]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetArray(CUarray *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray *, CUtexref); + ON_ENTRY(cuTexRefGetArray); + f_ptr_t f = (f_ptr_t)(g_func_table[171]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetMipmappedArray(CUmipmappedArray *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUmipmappedArray *, CUtexref); + ON_ENTRY(cuTexRefGetMipmappedArray); + f_ptr_t f = (f_ptr_t)(g_func_table[172]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetAddressMode(CUaddress_mode *arg0, CUtexref arg1, int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUaddress_mode *, CUtexref, int); + ON_ENTRY(cuTexRefGetAddressMode); + f_ptr_t f = (f_ptr_t)(g_func_table[173]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetFilterMode(CUfilter_mode *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfilter_mode *, CUtexref); + ON_ENTRY(cuTexRefGetFilterMode); + f_ptr_t f = (f_ptr_t)(g_func_table[174]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetFormat(CUarray_format *arg0, int *arg1, CUtexref arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray_format *, int *, CUtexref); + ON_ENTRY(cuTexRefGetFormat); + f_ptr_t f = (f_ptr_t)(g_func_table[175]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetMipmapFilterMode(CUfilter_mode *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfilter_mode *, CUtexref); + ON_ENTRY(cuTexRefGetMipmapFilterMode); + f_ptr_t f = (f_ptr_t)(g_func_table[176]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetMipmapLevelBias(float *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(float *, CUtexref); + ON_ENTRY(cuTexRefGetMipmapLevelBias); + f_ptr_t f = (f_ptr_t)(g_func_table[177]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL 
cuTexRefGetMipmapLevelClamp(float *arg0, float *arg1, CUtexref arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(float *, float *, CUtexref); + ON_ENTRY(cuTexRefGetMipmapLevelClamp); + f_ptr_t f = (f_ptr_t)(g_func_table[178]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetMaxAnisotropy(int *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, CUtexref); + ON_ENTRY(cuTexRefGetMaxAnisotropy); + f_ptr_t f = (f_ptr_t)(g_func_table[179]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetBorderColor(float *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(float *, CUtexref); + ON_ENTRY(cuTexRefGetBorderColor); + f_ptr_t f = (f_ptr_t)(g_func_table[180]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetFlags(unsigned int *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int *, CUtexref); + ON_ENTRY(cuTexRefGetFlags); + f_ptr_t f = (f_ptr_t)(g_func_table[181]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefCreate(CUtexref *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref *); + ON_ENTRY(cuTexRefCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[182]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuTexRefDestroy(CUtexref arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref); + ON_ENTRY(cuTexRefDestroy); + f_ptr_t f = (f_ptr_t)(g_func_table[183]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuSurfRefSetArray(CUsurfref arg0, CUarray arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUsurfref, CUarray, unsigned int); + ON_ENTRY(cuSurfRefSetArray); + f_ptr_t f = (f_ptr_t)(g_func_table[184]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuSurfRefGetArray(CUarray *arg0, CUsurfref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray *, CUsurfref); + ON_ENTRY(cuSurfRefGetArray); + f_ptr_t f = (f_ptr_t)(g_func_table[185]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexObjectCreate(CUtexObject *arg0, const CUDA_RESOURCE_DESC *arg1, const CUDA_TEXTURE_DESC *arg2, const CUDA_RESOURCE_VIEW_DESC *arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexObject *, const CUDA_RESOURCE_DESC *, const CUDA_TEXTURE_DESC *, const CUDA_RESOURCE_VIEW_DESC *); + ON_ENTRY(cuTexObjectCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[186]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuTexObjectDestroy(CUtexObject arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexObject); + ON_ENTRY(cuTexObjectDestroy); + f_ptr_t f = (f_ptr_t)(g_func_table[187]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *arg0, CUtexObject arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUDA_RESOURCE_DESC *, CUtexObject); + ON_ENTRY(cuTexObjectGetResourceDesc); + f_ptr_t f = (f_ptr_t)(g_func_table[188]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *arg0, CUtexObject arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUDA_TEXTURE_DESC *, CUtexObject); + ON_ENTRY(cuTexObjectGetTextureDesc); + f_ptr_t f = (f_ptr_t)(g_func_table[189]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *arg0, CUtexObject arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject); + ON_ENTRY(cuTexObjectGetResourceViewDesc); + f_ptr_t f = (f_ptr_t)(g_func_table[190]); + return 
f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuSurfObjectCreate(CUsurfObject *arg0, const CUDA_RESOURCE_DESC *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUsurfObject *, const CUDA_RESOURCE_DESC *); + ON_ENTRY(cuSurfObjectCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[191]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuSurfObjectDestroy(CUsurfObject arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUsurfObject); + ON_ENTRY(cuSurfObjectDestroy); + f_ptr_t f = (f_ptr_t)(g_func_table[192]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *arg0, CUsurfObject arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUDA_RESOURCE_DESC *, CUsurfObject); + ON_ENTRY(cuSurfObjectGetResourceDesc); + f_ptr_t f = (f_ptr_t)(g_func_table[193]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuDeviceCanAccessPeer(int *arg0, CUdevice arg1, CUdevice arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, CUdevice, CUdevice); + ON_ENTRY(cuDeviceCanAccessPeer); + f_ptr_t f = (f_ptr_t)(g_func_table[194]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuDeviceGetP2PAttribute(int *arg0, CUdevice_P2PAttribute arg1, CUdevice arg2, CUdevice arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice); + ON_ENTRY(cuDeviceGetP2PAttribute); + f_ptr_t f = (f_ptr_t)(g_func_table[195]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuCtxEnablePeerAccess(CUcontext arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext, unsigned int); + ON_ENTRY(cuCtxEnablePeerAccess); + f_ptr_t f = (f_ptr_t)(g_func_table[196]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuCtxDisablePeerAccess(CUcontext arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext); + ON_ENTRY(cuCtxDisablePeerAccess); + f_ptr_t f = (f_ptr_t)(g_func_table[197]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuGraphicsUnregisterResource(CUgraphicsResource arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUgraphicsResource); + ON_ENTRY(cuGraphicsUnregisterResource); + f_ptr_t f = (f_ptr_t)(g_func_table[198]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuGraphicsSubResourceGetMappedArray(CUarray *arg0, CUgraphicsResource arg1, unsigned int arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray *, CUgraphicsResource, unsigned int, unsigned int); + ON_ENTRY(cuGraphicsSubResourceGetMappedArray); + f_ptr_t f = (f_ptr_t)(g_func_table[199]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *arg0, CUgraphicsResource arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUmipmappedArray *, CUgraphicsResource); + ON_ENTRY(cuGraphicsResourceGetMappedMipmappedArray); + f_ptr_t f = (f_ptr_t)(g_func_table[200]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuGraphicsResourceGetMappedPointer_v2(CUdeviceptr *arg0, size_t *arg1, CUgraphicsResource arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr *, size_t *, CUgraphicsResource); + ON_ENTRY(cuGraphicsResourceGetMappedPointer_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[201]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuGraphicsResourceSetMapFlags_v2(CUgraphicsResource arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUgraphicsResource, unsigned int); + ON_ENTRY(cuGraphicsResourceSetMapFlags_v2); + f_ptr_t f = 
(f_ptr_t)(g_func_table[202]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuGraphicsMapResources_ptsz(unsigned int arg0, CUgraphicsResource *arg1, CUstream arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int, CUgraphicsResource *, CUstream); + ON_ENTRY(cuGraphicsMapResources_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[203]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuGraphicsUnmapResources_ptsz(unsigned int arg0, CUgraphicsResource *arg1, CUstream arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int, CUgraphicsResource *, CUstream); + ON_ENTRY(cuGraphicsUnmapResources_ptsz); + f_ptr_t f = (f_ptr_t)(g_func_table[204]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuGetExportTable(const void **arg0, const CUuuid *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const void **, const CUuuid *); + ON_ENTRY(cuGetExportTable); + f_ptr_t f = (f_ptr_t)(g_func_table[205]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemHostRegister(void *arg0, size_t arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, size_t, unsigned int); + ON_ENTRY(cuMemHostRegister); + f_ptr_t f = (f_ptr_t)(g_func_table[206]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuGraphicsResourceSetMapFlags(CUgraphicsResource arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUgraphicsResource, unsigned int); + ON_ENTRY(cuGraphicsResourceSetMapFlags); + f_ptr_t f = (f_ptr_t)(g_func_table[207]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuLinkCreate(unsigned int arg0, CUjit_option *arg1, void **arg2, CUlinkState *arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int, CUjit_option *, void **, CUlinkState *); + ON_ENTRY(cuLinkCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[208]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuLinkAddData(CUlinkState arg0, CUjitInputType arg1, void *arg2, size_t arg3, const char *arg4, unsigned int arg5, CUjit_option *arg6, void **arg7) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned int, CUjit_option *, void **); + ON_ENTRY(cuLinkAddData); + f_ptr_t f = (f_ptr_t)(g_func_table[209]); + return f(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7); +} +CUresult _WRAPLIB_API_CALL cuLinkAddFile(CUlinkState arg0, CUjitInputType arg1, const char *arg2, unsigned int arg3, CUjit_option *arg4, void **arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUlinkState, CUjitInputType, const char *, unsigned int, CUjit_option *, void **); + ON_ENTRY(cuLinkAddFile); + f_ptr_t f = (f_ptr_t)(g_func_table[210]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetAddress2D_v2(CUtexref arg0, const CUDA_ARRAY_DESCRIPTOR *arg1, CUdeviceptr arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *, CUdeviceptr, size_t); + ON_ENTRY(cuTexRefSetAddress2D_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[211]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuDeviceTotalMem(unsigned int *arg0, CUdevice arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int *, CUdevice); + ON_ENTRY(cuDeviceTotalMem); + f_ptr_t f = (f_ptr_t)(g_func_table[212]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuCtxCreate(CUcontext *arg0, unsigned int arg1, CUdevice arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext *, unsigned int, 
CUdevice); + ON_ENTRY(cuCtxCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[213]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuModuleGetGlobal(CUdeviceptr_v1 *arg0, unsigned int *arg1, CUmodule arg2, const char *arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1 *, unsigned int *, CUmodule, const char *); + ON_ENTRY(cuModuleGetGlobal); + f_ptr_t f = (f_ptr_t)(g_func_table[214]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemGetInfo(unsigned int *arg0, unsigned int *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int *, unsigned int *); + ON_ENTRY(cuMemGetInfo); + f_ptr_t f = (f_ptr_t)(g_func_table[215]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemAlloc(CUdeviceptr_v1 *arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1 *, unsigned int); + ON_ENTRY(cuMemAlloc); + f_ptr_t f = (f_ptr_t)(g_func_table[216]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemAllocPitch(CUdeviceptr_v1 *arg0, unsigned int *arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1 *, unsigned int *, unsigned int, unsigned int, unsigned int); + ON_ENTRY(cuMemAllocPitch); + f_ptr_t f = (f_ptr_t)(g_func_table[217]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemFree(CUdeviceptr_v1 arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1); + ON_ENTRY(cuMemFree); + f_ptr_t f = (f_ptr_t)(g_func_table[218]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemGetAddressRange(CUdeviceptr_v1 *arg0, unsigned int *arg1, CUdeviceptr_v1 arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1 *, unsigned int *, CUdeviceptr_v1); + ON_ENTRY(cuMemGetAddressRange); + f_ptr_t f = (f_ptr_t)(g_func_table[219]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemAllocHost(void **arg0, unsigned int arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void **, unsigned int); + ON_ENTRY(cuMemAllocHost); + f_ptr_t f = (f_ptr_t)(g_func_table[220]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemHostGetDevicePointer(CUdeviceptr_v1 *arg0, void *arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1 *, void *, unsigned int); + ON_ENTRY(cuMemHostGetDevicePointer); + f_ptr_t f = (f_ptr_t)(g_func_table[221]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoD(CUdeviceptr_v1 arg0, const void *arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, const void *, unsigned int); + ON_ENTRY(cuMemcpyHtoD); + f_ptr_t f = (f_ptr_t)(g_func_table[222]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoH(void *arg0, CUdeviceptr_v1 arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUdeviceptr_v1, unsigned int); + ON_ENTRY(cuMemcpyDtoH); + f_ptr_t f = (f_ptr_t)(g_func_table[223]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoD(CUdeviceptr_v1 arg0, CUdeviceptr_v1 arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, CUdeviceptr_v1, unsigned int); + ON_ENTRY(cuMemcpyDtoD); + f_ptr_t f = (f_ptr_t)(g_func_table[224]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoA(CUarray arg0, unsigned int arg1, CUdeviceptr_v1 arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, unsigned int, CUdeviceptr_v1, 
unsigned int); + ON_ENTRY(cuMemcpyDtoA); + f_ptr_t f = (f_ptr_t)(g_func_table[225]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoD(CUdeviceptr_v1 arg0, CUarray arg1, unsigned int arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, CUarray, unsigned int, unsigned int); + ON_ENTRY(cuMemcpyAtoD); + f_ptr_t f = (f_ptr_t)(g_func_table[226]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoA(CUarray arg0, unsigned int arg1, const void *arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, unsigned int, const void *, unsigned int); + ON_ENTRY(cuMemcpyHtoA); + f_ptr_t f = (f_ptr_t)(g_func_table[227]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoH(void *arg0, CUarray arg1, unsigned int arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUarray, unsigned int, unsigned int); + ON_ENTRY(cuMemcpyAtoH); + f_ptr_t f = (f_ptr_t)(g_func_table[228]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoA(CUarray arg0, unsigned int arg1, CUarray arg2, unsigned int arg3, unsigned int arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, unsigned int, CUarray, unsigned int, unsigned int); + ON_ENTRY(cuMemcpyAtoA); + f_ptr_t f = (f_ptr_t)(g_func_table[229]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoAAsync(CUarray arg0, unsigned int arg1, const void *arg2, unsigned int arg3, CUstream arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, unsigned int, const void *, unsigned int, CUstream); + ON_ENTRY(cuMemcpyHtoAAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[230]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoHAsync(void *arg0, CUarray arg1, unsigned int arg2, unsigned int arg3, CUstream arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUarray, unsigned int, unsigned int, CUstream); + ON_ENTRY(cuMemcpyAtoHAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[231]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpy2D(const CUDA_MEMCPY2D_v1 *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY2D_v1 *); + ON_ENTRY(cuMemcpy2D); + f_ptr_t f = (f_ptr_t)(g_func_table[232]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY2D_v1 *); + ON_ENTRY(cuMemcpy2DUnaligned); + f_ptr_t f = (f_ptr_t)(g_func_table[233]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3D(const CUDA_MEMCPY3D_v1 *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D_v1 *); + ON_ENTRY(cuMemcpy3D); + f_ptr_t f = (f_ptr_t)(g_func_table[234]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoDAsync(CUdeviceptr_v1 arg0, const void *arg1, unsigned int arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, const void *, unsigned int, CUstream); + ON_ENTRY(cuMemcpyHtoDAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[235]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoHAsync(void *arg0, CUdeviceptr_v1 arg1, unsigned int arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUdeviceptr_v1, unsigned int, CUstream); + ON_ENTRY(cuMemcpyDtoHAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[236]); + return f(arg0, arg1, arg2, arg3); +} +CUresult 
_WRAPLIB_API_CALL cuMemcpyDtoDAsync(CUdeviceptr_v1 arg0, CUdeviceptr_v1 arg1, unsigned int arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, CUdeviceptr_v1, unsigned int, CUstream); + ON_ENTRY(cuMemcpyDtoDAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[237]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY2D_v1 *, CUstream); + ON_ENTRY(cuMemcpy2DAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[238]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D_v1 *, CUstream); + ON_ENTRY(cuMemcpy3DAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[239]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemsetD8(CUdeviceptr_v1 arg0, unsigned char arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, unsigned char, unsigned int); + ON_ENTRY(cuMemsetD8); + f_ptr_t f = (f_ptr_t)(g_func_table[240]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemsetD16(CUdeviceptr_v1 arg0, unsigned short arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, unsigned short, unsigned int); + ON_ENTRY(cuMemsetD16); + f_ptr_t f = (f_ptr_t)(g_func_table[241]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemsetD32(CUdeviceptr_v1 arg0, unsigned int arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, unsigned int, unsigned int); + ON_ENTRY(cuMemsetD32); + f_ptr_t f = (f_ptr_t)(g_func_table[242]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D8(CUdeviceptr_v1 arg0, unsigned int arg1, unsigned char arg2, unsigned int arg3, unsigned int arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, unsigned int, unsigned char, unsigned int, unsigned int); + ON_ENTRY(cuMemsetD2D8); + f_ptr_t f = (f_ptr_t)(g_func_table[243]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D16(CUdeviceptr_v1 arg0, unsigned int arg1, unsigned short arg2, unsigned int arg3, unsigned int arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, unsigned int, unsigned short, unsigned int, unsigned int); + ON_ENTRY(cuMemsetD2D16); + f_ptr_t f = (f_ptr_t)(g_func_table[244]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D32(CUdeviceptr_v1 arg0, unsigned int arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1, unsigned int, unsigned int, unsigned int, unsigned int); + ON_ENTRY(cuMemsetD2D32); + f_ptr_t f = (f_ptr_t)(g_func_table[245]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuArrayCreate(CUarray *arg0, const CUDA_ARRAY_DESCRIPTOR_v1 *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray *, const CUDA_ARRAY_DESCRIPTOR_v1 *); + ON_ENTRY(cuArrayCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[246]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *arg0, CUarray arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUDA_ARRAY_DESCRIPTOR_v1 *, CUarray); + ON_ENTRY(cuArrayGetDescriptor); + f_ptr_t f = (f_ptr_t)(g_func_table[247]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuArray3DCreate(CUarray *arg0, 
const CUDA_ARRAY3D_DESCRIPTOR_v1 *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR_v1 *); + ON_ENTRY(cuArray3DCreate); + f_ptr_t f = (f_ptr_t)(g_func_table[248]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *arg0, CUarray arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUDA_ARRAY3D_DESCRIPTOR_v1 *, CUarray); + ON_ENTRY(cuArray3DGetDescriptor); + f_ptr_t f = (f_ptr_t)(g_func_table[249]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetAddress(unsigned int *arg0, CUtexref arg1, CUdeviceptr_v1 arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int *, CUtexref, CUdeviceptr_v1, unsigned int); + ON_ENTRY(cuTexRefSetAddress); + f_ptr_t f = (f_ptr_t)(g_func_table[250]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuTexRefSetAddress2D(CUtexref arg0, const CUDA_ARRAY_DESCRIPTOR_v1 *arg1, CUdeviceptr_v1 arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUtexref, const CUDA_ARRAY_DESCRIPTOR_v1 *, CUdeviceptr_v1, unsigned int); + ON_ENTRY(cuTexRefSetAddress2D); + f_ptr_t f = (f_ptr_t)(g_func_table[251]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuTexRefGetAddress(CUdeviceptr_v1 *arg0, CUtexref arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1 *, CUtexref); + ON_ENTRY(cuTexRefGetAddress); + f_ptr_t f = (f_ptr_t)(g_func_table[252]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *arg0, unsigned int *arg1, CUgraphicsResource arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr_v1 *, unsigned int *, CUgraphicsResource); + ON_ENTRY(cuGraphicsResourceGetMappedPointer); + f_ptr_t f = (f_ptr_t)(g_func_table[253]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuCtxDestroy(CUcontext arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext); + ON_ENTRY(cuCtxDestroy); + f_ptr_t f = (f_ptr_t)(g_func_table[254]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxPopCurrent(CUcontext *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext *); + ON_ENTRY(cuCtxPopCurrent); + f_ptr_t f = (f_ptr_t)(g_func_table[255]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuCtxPushCurrent(CUcontext arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUcontext); + ON_ENTRY(cuCtxPushCurrent); + f_ptr_t f = (f_ptr_t)(g_func_table[256]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuStreamDestroy(CUstream arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream); + ON_ENTRY(cuStreamDestroy); + f_ptr_t f = (f_ptr_t)(g_func_table[257]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuEventDestroy(CUevent arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUevent); + ON_ENTRY(cuEventDestroy); + f_ptr_t f = (f_ptr_t)(g_func_table[258]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoD_v2(CUdeviceptr arg0, const void *arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, const void *, size_t); + ON_ENTRY(cuMemcpyHtoD_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[259]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoH_v2(void *arg0, CUdeviceptr arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUdeviceptr, size_t); + ON_ENTRY(cuMemcpyDtoH_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[260]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL 
cuMemcpyDtoD_v2(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUdeviceptr, size_t); + ON_ENTRY(cuMemcpyDtoD_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[261]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoA_v2(CUarray arg0, size_t arg1, CUdeviceptr arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, size_t, CUdeviceptr, size_t); + ON_ENTRY(cuMemcpyDtoA_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[262]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoD_v2(CUdeviceptr arg0, CUarray arg1, size_t arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUarray, size_t, size_t); + ON_ENTRY(cuMemcpyAtoD_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[263]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoA_v2(CUarray arg0, size_t arg1, const void *arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, size_t, const void *, size_t); + ON_ENTRY(cuMemcpyHtoA_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[264]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoH_v2(void *arg0, CUarray arg1, size_t arg2, size_t arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUarray, size_t, size_t); + ON_ENTRY(cuMemcpyAtoH_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[265]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoA_v2(CUarray arg0, size_t arg1, CUarray arg2, size_t arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, size_t, CUarray, size_t, size_t); + ON_ENTRY(cuMemcpyAtoA_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[266]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoAAsync_v2(CUarray arg0, size_t arg1, const void *arg2, size_t arg3, CUstream arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUarray, size_t, const void *, size_t, CUstream); + ON_ENTRY(cuMemcpyHtoAAsync_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[267]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAtoHAsync_v2(void *arg0, CUarray arg1, size_t arg2, size_t arg3, CUstream arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUarray, size_t, size_t, CUstream); + ON_ENTRY(cuMemcpyAtoHAsync_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[268]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpy2D_v2(const CUDA_MEMCPY2D *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY2D *); + ON_ENTRY(cuMemcpy2D_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[269]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY2D *); + ON_ENTRY(cuMemcpy2DUnaligned_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[270]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3D_v2(const CUDA_MEMCPY3D *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D *); + ON_ENTRY(cuMemcpy3D_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[271]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpyHtoDAsync_v2(CUdeviceptr arg0, const void *arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, const void *, size_t, CUstream); + ON_ENTRY(cuMemcpyHtoDAsync_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[272]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL 
cuMemcpyDtoHAsync_v2(void *arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(void *, CUdeviceptr, size_t, CUstream); + ON_ENTRY(cuMemcpyDtoHAsync_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[273]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyDtoDAsync_v2(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + ON_ENTRY(cuMemcpyDtoDAsync_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[274]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY2D *, CUstream); + ON_ENTRY(cuMemcpy2DAsync_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[275]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D *, CUstream); + ON_ENTRY(cuMemcpy3DAsync_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[276]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemsetD8_v2(CUdeviceptr arg0, unsigned char arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned char, size_t); + ON_ENTRY(cuMemsetD8_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[277]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemsetD16_v2(CUdeviceptr arg0, unsigned short arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned short, size_t); + ON_ENTRY(cuMemsetD16_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[278]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemsetD32_v2(CUdeviceptr arg0, unsigned int arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned int, size_t); + ON_ENTRY(cuMemsetD32_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[279]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D8_v2(CUdeviceptr arg0, size_t arg1, unsigned char arg2, size_t arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned char, size_t, size_t); + ON_ENTRY(cuMemsetD2D8_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[280]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D16_v2(CUdeviceptr arg0, size_t arg1, unsigned short arg2, size_t arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned short, size_t, size_t); + ON_ENTRY(cuMemsetD2D16_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[281]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D32_v2(CUdeviceptr arg0, size_t arg1, unsigned int arg2, size_t arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned int, size_t, size_t); + ON_ENTRY(cuMemsetD2D32_v2); + f_ptr_t f = (f_ptr_t)(g_func_table[282]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpy(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUdeviceptr, size_t); + ON_ENTRY(cuMemcpy); + f_ptr_t f = (f_ptr_t)(g_func_table[283]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemcpyAsync(CUdeviceptr arg0, CUdeviceptr arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + ON_ENTRY(cuMemcpyAsync); + 
f_ptr_t f = (f_ptr_t)(g_func_table[284]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemcpyPeer(CUdeviceptr arg0, CUcontext arg1, CUdeviceptr arg2, CUcontext arg3, size_t arg4) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t); + ON_ENTRY(cuMemcpyPeer); + f_ptr_t f = (f_ptr_t)(g_func_table[285]); + return f(arg0, arg1, arg2, arg3, arg4); +} +CUresult _WRAPLIB_API_CALL cuMemcpyPeerAsync(CUdeviceptr arg0, CUcontext arg1, CUdeviceptr arg2, CUcontext arg3, size_t arg4, CUstream arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream); + ON_ENTRY(cuMemcpyPeerAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[286]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D_PEER *); + ON_ENTRY(cuMemcpy3DPeer); + f_ptr_t f = (f_ptr_t)(g_func_table[287]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const CUDA_MEMCPY3D_PEER *, CUstream); + ON_ENTRY(cuMemcpy3DPeerAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[288]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuMemsetD8Async(CUdeviceptr arg0, unsigned char arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned char, size_t, CUstream); + ON_ENTRY(cuMemsetD8Async); + f_ptr_t f = (f_ptr_t)(g_func_table[289]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemsetD16Async(CUdeviceptr arg0, unsigned short arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned short, size_t, CUstream); + ON_ENTRY(cuMemsetD16Async); + f_ptr_t f = (f_ptr_t)(g_func_table[290]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemsetD32Async(CUdeviceptr arg0, unsigned int arg1, size_t arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, unsigned int, size_t, CUstream); + ON_ENTRY(cuMemsetD32Async); + f_ptr_t f = (f_ptr_t)(g_func_table[291]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D8Async(CUdeviceptr arg0, size_t arg1, unsigned char arg2, size_t arg3, size_t arg4, CUstream arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned char, size_t, size_t, CUstream); + ON_ENTRY(cuMemsetD2D8Async); + f_ptr_t f = (f_ptr_t)(g_func_table[292]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D16Async(CUdeviceptr arg0, size_t arg1, unsigned short arg2, size_t arg3, size_t arg4, CUstream arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned short, size_t, size_t, CUstream); + ON_ENTRY(cuMemsetD2D16Async); + f_ptr_t f = (f_ptr_t)(g_func_table[293]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuMemsetD2D32Async(CUdeviceptr arg0, size_t arg1, unsigned int arg2, size_t arg3, size_t arg4, CUstream arg5) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, unsigned int, size_t, size_t, CUstream); + ON_ENTRY(cuMemsetD2D32Async); + f_ptr_t f = (f_ptr_t)(g_func_table[294]); + return f(arg0, arg1, arg2, arg3, arg4, arg5); +} +CUresult _WRAPLIB_API_CALL cuStreamGetPriority(CUstream arg0, int *arg1) { + typedef CUresult (_WRAPLIB_API_CALL 
*f_ptr_t)(CUstream, int *); + ON_ENTRY(cuStreamGetPriority); + f_ptr_t f = (f_ptr_t)(g_func_table[295]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuStreamGetFlags(CUstream arg0, unsigned int *arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, unsigned int *); + ON_ENTRY(cuStreamGetFlags); + f_ptr_t f = (f_ptr_t)(g_func_table[296]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuStreamWaitEvent(CUstream arg0, CUevent arg1, unsigned int arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUevent, unsigned int); + ON_ENTRY(cuStreamWaitEvent); + f_ptr_t f = (f_ptr_t)(g_func_table[297]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuStreamAddCallback(CUstream arg0, CUstreamCallback arg1, void *arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUstreamCallback, void *, unsigned int); + ON_ENTRY(cuStreamAddCallback); + f_ptr_t f = (f_ptr_t)(g_func_table[298]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuStreamAttachMemAsync(CUstream arg0, CUdeviceptr arg1, size_t arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUdeviceptr, size_t, unsigned int); + ON_ENTRY(cuStreamAttachMemAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[299]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuStreamQuery(CUstream arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream); + ON_ENTRY(cuStreamQuery); + f_ptr_t f = (f_ptr_t)(g_func_table[300]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuStreamSynchronize(CUstream arg0) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream); + ON_ENTRY(cuStreamSynchronize); + f_ptr_t f = (f_ptr_t)(g_func_table[301]); + return f(arg0); +} +CUresult _WRAPLIB_API_CALL cuEventRecord(CUevent arg0, CUstream arg1) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUevent, CUstream); + ON_ENTRY(cuEventRecord); + f_ptr_t f = (f_ptr_t)(g_func_table[302]); + return f(arg0, arg1); +} +CUresult _WRAPLIB_API_CALL cuLaunchKernel(CUfunction arg0, unsigned int arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4, unsigned int arg5, unsigned int arg6, unsigned int arg7, CUstream arg8, void **arg9, void **arg10) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **); + ON_ENTRY(cuLaunchKernel); + f_ptr_t f = (f_ptr_t)(g_func_table[303]); + return f(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10); +} +CUresult _WRAPLIB_API_CALL cuGraphicsMapResources(unsigned int arg0, CUgraphicsResource *arg1, CUstream arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int, CUgraphicsResource *, CUstream); + ON_ENTRY(cuGraphicsMapResources); + f_ptr_t f = (f_ptr_t)(g_func_table[304]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuGraphicsUnmapResources(unsigned int arg0, CUgraphicsResource *arg1, CUstream arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(unsigned int, CUgraphicsResource *, CUstream); + ON_ENTRY(cuGraphicsUnmapResources); + f_ptr_t f = (f_ptr_t)(g_func_table[305]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuMemPrefetchAsync(CUdeviceptr arg0, size_t arg1, CUdevice arg2, CUstream arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUdeviceptr, size_t, CUdevice, CUstream); + ON_ENTRY(cuMemPrefetchAsync); + f_ptr_t f = (f_ptr_t)(g_func_table[306]); + return f(arg0, arg1, arg2, arg3); +} 
+CUresult _WRAPLIB_API_CALL cuStreamWriteValue32(CUstream arg0, CUdeviceptr arg1, cuuint32_t arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + ON_ENTRY(cuStreamWriteValue32); + f_ptr_t f = (f_ptr_t)(g_func_table[307]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuStreamWaitValue32(CUstream arg0, CUdeviceptr arg1, cuuint32_t arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + ON_ENTRY(cuStreamWaitValue32); + f_ptr_t f = (f_ptr_t)(g_func_table[308]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuStreamBatchMemOp(CUstream arg0, unsigned int arg1, CUstreamBatchMemOpParams *arg2, unsigned int arg3) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(CUstream, unsigned int, CUstreamBatchMemOpParams *, unsigned int); + ON_ENTRY(cuStreamBatchMemOp); + f_ptr_t f = (f_ptr_t)(g_func_table[309]); + return f(arg0, arg1, arg2, arg3); +} +CUresult _WRAPLIB_API_CALL cuProfilerInitialize(const char *arg0, const char *arg1, CUoutput_mode arg2) { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(const char *, const char *, CUoutput_mode); + ON_ENTRY(cuProfilerInitialize); + f_ptr_t f = (f_ptr_t)(g_func_table[310]); + return f(arg0, arg1, arg2); +} +CUresult _WRAPLIB_API_CALL cuProfilerStart() { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(); + ON_ENTRY(cuProfilerStart); + f_ptr_t f = (f_ptr_t)(g_func_table[311]); + return f(); +} +CUresult _WRAPLIB_API_CALL cuProfilerStop() { + typedef CUresult (_WRAPLIB_API_CALL *f_ptr_t)(); + ON_ENTRY(cuProfilerStop); + f_ptr_t f = (f_ptr_t)(g_func_table[312]); + return f(); +} diff --git a/dnn/cuda-stub/src/libcuda.cpp b/dnn/cuda-stub/src/libcuda.cpp new file mode 100644 index 00000000..cf55fac7 --- /dev/null +++ b/dnn/cuda-stub/src/libcuda.cpp @@ -0,0 +1,140 @@ +/* + * LIBCUDA_PATH: candidate paths to libcuda.so; multiple paths are + * splitted by colons + **/ + +#pragma GCC visibility push(default) + +#include +#define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v) + +extern "C" { +#include +} +#include + +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +static const char* default_so_paths[] = { + "/usr/local/nvidia/lib64/libcuda.so", + "/usr/lib/x86_64-linux-gnu/libcuda.so", + "libcuda.so", +}; + +#if defined(_WIN32) +#include +#include +#define F_OK 0 +#define RTLD_LAZY 0 +// On the windows platform we use a lib_filename without a full path so +// the win-api "LoadLibrary" would uses a standard search strategy to +// find the lib module. As we cannot access to the lib_filename without a +// full path, we should not use "access(a, b)" to verify it. 
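LIBCUDA_PATH, documented at the top of this file, holds colon-separated candidate paths that get_library_handle() below tries with dlopen() before falling back to default_so_paths. A standalone sketch of that colon-splitting step (split_candidates is an illustrative name, not the stub's own helper):

    #include <cstdlib>
    #include <cstdio>
    #include <string>
    #include <vector>

    // split a colon-separated path list such as a LIBCUDA_PATH value
    static std::vector<std::string> split_candidates(const std::string& value) {
        std::vector<std::string> out;
        size_t begin = 0;
        while (begin <= value.size()) {
            size_t end = value.find(':', begin);
            if (end == std::string::npos) end = value.size();
            if (end > begin) out.push_back(value.substr(begin, end - begin));
            begin = end + 1;
        }
        return out;
    }

    int main() {
        // e.g. LIBCUDA_PATH=/usr/local/nvidia/lib64/libcuda.so:/usr/lib/libcuda.so ./app
        if (const char* env = std::getenv("LIBCUDA_PATH")) {
            for (const auto& p : split_candidates(env))
                std::printf("candidate: %s\n", p.c_str());
        }
        return 0;
    }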
+#define access(a, b) false + +static void* dlopen(const char* file, int) { + return static_cast(LoadLibrary(file)); +} + +static void* dlerror() { + const char* errmsg = "dlerror not aviable in windows"; + return const_cast(errmsg); +} + +static void* dlsym(void* handle, const char* name) { + FARPROC symbol = GetProcAddress((HMODULE)handle, name); + return reinterpret_cast(symbol); +} + +#else +#include +#include +#endif + +static void log_failed_load(int func_idx); +namespace { +template +T on_init_failed(int func_idx); +template <> +CUresult on_init_failed(int func_idx) { + log_failed_load(func_idx); + return CUDA_ERROR_UNKNOWN; +} +} + +#define _WRAPLIB_API_CALL CUDAAPI +#define _WRAPLIB_CALLBACK CUDA_CB +#include "./libcuda-wrap.h" +#undef _WRAPLIB_CALLBACK +#undef _WRAPLIB_API_CALL + +static bool open_shared_lib(const char* path, void*& handle) { + if (!access(path, F_OK)) { + handle = dlopen(path, RTLD_LAZY); + if (handle) + return true; + LOGE("cuda lib found but can not be opened: %s err=%s", path, + dlerror()); + } + return false; +} + +static void* get_library_handle() { + const char* path = nullptr; + auto str_cptr = getenv("LIBCUDA_PATH"); + std::string str; + void* handle = nullptr; + + if (str_cptr) { + str = str_cptr; + char* p = &str[0]; + const char* begin = p; + while (*p) { + if (*p == ':') { + *p = 0; + if (open_shared_lib(begin, handle)) { + path = begin; + break; + } + begin = p + 1; + } + ++p; + } + if (open_shared_lib(begin, handle)) { + path = begin; + } + } + + if (!path) { + for (size_t i = 0; i < (sizeof(default_so_paths) / sizeof(char*)); + i++) { + if (open_shared_lib(default_so_paths[i], handle)) { + path = default_so_paths[i]; + break; + } + } + } + + if (!path) { + LOGE("can not find cuda"); + return nullptr; + } + return handle; +} + +static void log_failed_load(int func_idx) { + LOGE("failed to load cuda func: %s", g_func_name[func_idx]); +} + +static void* resolve_library_func(void* handle, const char* func) { + if (!handle) { + LOGE("handle should not be nullptr!"); + return nullptr; + } + auto ret = dlsym(handle, func); + if (!ret) { + LOGE("failed to load cuda func: %s", func); + } + return ret; +} + diff --git a/dnn/include/megcore.h b/dnn/include/megcore.h new file mode 100644 index 00000000..fead54e4 --- /dev/null +++ b/dnn/include/megcore.h @@ -0,0 +1,137 @@ +/** + * \file dnn/include/megcore.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/thin/function.h" +#include "megcore_cdefs.h" +#include +#include + +#include "megdnn/internal/visibility_prologue.h" + +namespace megcore { +/*! + * \brief a callback to dispatch computing task on desired CPU thread + * + * This is analogous to cuda streams. The default dispatcher on CPU executes in + * the caller thread immediately. + */ +class CPUDispatcher { + public: + using Task = megdnn::thin_function; + using MultiThreadingTask = megdnn::thin_function; + virtual ~CPUDispatcher() noexcept; + + /*! + * \brief dispatch a task on the computing thread + * \param task the task that would be moved away + */ + virtual void dispatch(Task&& task) = 0; + + /*! 
+ * \brief dispatch a multithreading task on the computing thread + * \param task the task would be moved away + * \param parallelism the parallelism of the task. + */ + virtual void dispatch(MultiThreadingTask&& task, + size_t parallelism) = 0; + + /*! + * \brief synchronize the calling thread with the computing thread + */ + virtual void sync() = 0; + + /*! + * \brief the computing thread number. + */ + virtual size_t nr_threads() = 0; +}; +} // namespace megcore + +using MegcoreCPUDispatcher = megcore::CPUDispatcher; + +/** + * \brief Layer 1: device handle + */ +struct megcoreDeviceContext; +typedef struct megcoreDeviceContext *megcoreDeviceHandle_t; + +megcoreStatus_t megcoreCreateDeviceHandle( + megcoreDeviceHandle_t *handle, + megcorePlatform_t platform, + int deviceID = -1, + unsigned int flags = 0); +megcoreStatus_t megcoreDestroyDeviceHandle( + megcoreDeviceHandle_t handle); + +megcoreStatus_t megcoreGetPlatform(megcoreDeviceHandle_t handle, + megcorePlatform_t *platform); +megcoreStatus_t megcoreGetDeviceID(megcoreDeviceHandle_t handle, + int *deviceID); +megcoreStatus_t megcoreGetMemAlignment(megcoreDeviceHandle_t handle, + size_t *memAlignmentInBytes); +megcoreStatus_t megcoreGetDeviceFlags( + megcoreDeviceHandle_t handle, + unsigned int *flags); + +megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle); +megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle, + void **devPtr, size_t sizeInBytes); +megcoreStatus_t megcoreFree(megcoreDeviceHandle_t handle, + void *devPtr); + +/** + * \brief Layer 2: computing handle + */ +struct megcoreComputingContext; +typedef struct megcoreComputingContext *megcoreComputingHandle_t; + +megcoreStatus_t megcoreCreateComputingHandle( + megcoreComputingHandle_t *compHandle, + megcoreDeviceHandle_t devHandle, + unsigned int flags = 0); + +megcoreStatus_t megcoreCreateComputingHandleWithCPUDispatcher( + megcoreComputingHandle_t *compHandle, + megcoreDeviceHandle_t devHandle, + const std::shared_ptr& dispatcher, + unsigned int flags = 0); + +megcoreStatus_t megcoreDestroyComputingHandle( + megcoreComputingHandle_t handle); + +megcoreStatus_t megcoreGetDeviceHandle( + megcoreComputingHandle_t compHandle, + megcoreDeviceHandle_t *devHandle); +megcoreStatus_t megcoreGetComputingFlags( + megcoreComputingHandle_t handle, + unsigned int *flags); + +MegcoreCPUDispatcher* megcoreGetCPUDispatcher(megcoreComputingHandle_t handle); + +megcoreStatus_t megcoreMemcpy( + megcoreComputingHandle_t handle, + void *dst, const void *src, size_t sizeInBytes, + megcoreMemcpyKind_t kind); +megcoreStatus_t megcoreMemset( + megcoreComputingHandle_t handle, + void *dst, int value, size_t sizeInBytes); +megcoreStatus_t megcoreSynchronize(megcoreComputingHandle_t handle); + +/** + * \brief Miscellaneous + */ +const char *megcoreGetErrorName(megcoreStatus_t status); + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megcore_cdefs.h b/dnn/include/megcore_cdefs.h new file mode 100644 index 00000000..eede205b --- /dev/null +++ b/dnn/include/megcore_cdefs.h @@ -0,0 +1,72 @@ +/** + * \file dnn/include/megcore_cdefs.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
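megcore.h above is organized as two layers: a device handle created for a platform, and a computing handle created on top of it that carries memcpy/memset/synchronize. A hedged sketch of the usual call order on the CPU platform, using only the declarations above (whether the CPU backend is available depends on the build, so treat this as an illustration of the API shape rather than a tested program):

    #include <cstring>
    #include "megcore.h"

    int main() {
        // per the note in megcore_cdefs.h, errors surface as C++ exceptions and
        // the C API returns megcoreSuccess, so return values are not checked here
        megcoreDeviceHandle_t dev;
        megcoreComputingHandle_t comp;
        megcoreCreateDeviceHandle(&dev, megcorePlatformCPU);   // layer 1: device
        megcoreCreateComputingHandle(&comp, dev);              // layer 2: computing

        void* buf = nullptr;
        megcoreMalloc(dev, &buf, 256);

        char src[256];
        std::memset(src, 0x5a, sizeof(src));
        megcoreMemcpy(comp, buf, src, sizeof(src), megcoreMemcpyHostToDevice);
        megcoreSynchronize(comp);

        megcoreFree(dev, buf);
        megcoreDestroyComputingHandle(comp);
        megcoreDestroyDeviceHandle(dev);
        return 0;
    }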
+ */ + +#pragma once + +#include + +/** + * \brief MegCore platform types + */ +typedef enum { + megcorePlatformCPU = 1, + megcorePlatformCUDA = 4, +} megcorePlatform_t; + +/** + * \brief MegCore return codes + * + * Note: since MegCore has been merged into MegDNN and uses C++ API with + * exception, this return status only serves for backward compatibility and all + * API would return megcoreSuccess + */ +typedef enum { + megcoreSuccess = 0, + megcoreErrorMemoryAllocation = 1, + megcoreErrorInvalidArgument = 2, + megcoreErrorInvalidDeviceHandle = 3, + megcoreErrorInvalidComputingHandle = 4, + megcoreErrorInternalError = 5, +} megcoreStatus_t; + + +/** + * \brief Memcpy kind + */ +typedef enum { + megcoreMemcpyHostToDevice = 1, + megcoreMemcpyDeviceToHost = 2, + megcoreMemcpyDeviceToDevice = 3, +} megcoreMemcpyKind_t; + +namespace megcore { +/*! + * \brief error reporting from asynchronous execution devices + * + * This is currently used by CUDA kernels. It is used to report errors that + * depend on input data. + */ +struct AsyncErrorInfo { + //! number of errors occurred; only detailed information of the first error + //! would be recorded + uint32_t nr_error; + + //! tracker set by set_error_tracker() + void* tracker_ptr; + + //! human readable message; it can contain %d which would be replaced by + //! msg_args + char msg[228]; + int msg_args[4]; +}; +} // namespace megcore + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megcore_cuda.h b/dnn/include/megcore_cuda.h new file mode 100644 index 00000000..cf465df7 --- /dev/null +++ b/dnn/include/megcore_cuda.h @@ -0,0 +1,60 @@ +/** + * \file dnn/include/megcore_cuda.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "./megcore.h" + +#include + +#include "megdnn/internal/visibility_prologue.h" + +namespace megcore { +struct CudaContext { + cudaStream_t stream = nullptr; + + //! 
device pointer to buffer for error reporting from kernels + AsyncErrorInfo* error_info = nullptr; + + CudaContext() = default; + + CudaContext(cudaStream_t s, AsyncErrorInfo* e) : stream{s}, error_info{e} {} +}; + +megcoreStatus_t createComputingHandleWithCUDAContext( + megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle, + unsigned int flags, const CudaContext& ctx); + +megcoreStatus_t getCUDAContext(megcoreComputingHandle_t handle, + CudaContext* ctx); + +} // namespace megcore + +static inline megcoreStatus_t megcoreCreateComputingHandleWithCUDAStream( + megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle, + unsigned int flags, cudaStream_t stream) { + megcore::CudaContext ctx; + ctx.stream = stream; + return megcore::createComputingHandleWithCUDAContext(compHandle, devHandle, + flags, ctx); +} + +static inline megcoreStatus_t megcoreGetCUDAStream( + megcoreComputingHandle_t handle, cudaStream_t* stream) { + megcore::CudaContext ctx; + auto ret = megcore::getCUDAContext(handle, &ctx); + *stream = ctx.stream; + return ret; +} + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn.h b/dnn/include/megdnn.h new file mode 100644 index 00000000..e35dc520 --- /dev/null +++ b/dnn/include/megdnn.h @@ -0,0 +1,16 @@ +/** + * \file dnn/include/megdnn.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/version.h" +#include "megdnn/oprs.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/arch.h b/dnn/include/megdnn/arch.h new file mode 100644 index 00000000..bc912d64 --- /dev/null +++ b/dnn/include/megdnn/arch.h @@ -0,0 +1,136 @@ +/** + * \file dnn/include/megdnn/arch.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +// include general build configurations +#include "megdnn/config/config.h" + +#if defined(__GNUC__) || defined(__clang__) + #if !defined (__clang__) + // gcc specific + #define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) + #if GCC_VERSION < 40800 + #error "GCC version should be at least 4.8.0." 
+ #endif // GCC_VERSION < 40800 + #endif // !defined(__clang__) + + #ifndef megdnn_trap + #define megdnn_trap() __builtin_trap() + #endif + + #define megdnn_likely(v) __builtin_expect(bool(v), 1) + #define megdnn_unlikely(v) __builtin_expect(bool(v), 0) + + #define MEGDNN_DEPRECATED __attribute__((deprecated)) + #define MEGDNN_PACKED __attribute__((packed)) + #define MEGDNN_CONSTEXPR constexpr + #define MEGDNN_NOEXCEPT noexcept + #define MEGDNN_STATIC_ASSERT static_assert + #define MEGDNN_FINAL final + #define MEGDNN_NORETURN __attribute__((noreturn)) + #define MEGDNN_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) + #define MEGDNN_ATTRIBUTE_TARGET(simd) __attribute__((target(simd))) + #if defined(__clang_major__) && (__clang_major__ >= 7) + #define MEGDNN_LAMBDA_ATTRIBUTE_TARGET(simd) __attribute__((target(simd))) + #else + #define MEGDNN_LAMBDA_ATTRIBUTE_TARGET(simd) [[gnu::target(simd)]] + #endif + #define MEGDNN_NOINLINE __attribute__((noinline)) + + #define megdnn_isatty(x) isatty(x) +#elif defined(__INTEL_COMPILER) || defined(_MSC_VER) + +#ifndef megdnn_trap +#define megdnn_trap() __debugbreak() +#endif + +#define megdnn_likely(v) (bool(v)) +#define megdnn_unlikely(v) (bool(v)) + +#define MEGDNN_DEPRECATED +#define MEGDNN_PACKED +#define MEGDNN_CONSTEXPR constexpr +#define MEGDNN_NOEXCEPT noexcept +#define MEGDNN_STATIC_ASSERT static_assert +#define MEGDNN_FINAL final + +#if defined(_MSC_VER) + #define MEGDNN_NORETURN __declspec(noreturn) + #define MEGDNN_NOINLINE __declspec(noinline) +#else + #define MEGDNN_NORETURN + #define MEGDNN_FORCE_NOINLINE +#endif // _MSC_VER + +#define MEGDNN_WARN_UNUSED_RESULT + +#define megdnn_isatty(x) _isatty(x) + +#else + #error "unknown compiler" +#endif // __GNUC__ + +// __cpp_exceptions and __cpp_rtti is referred from +// https://isocpp.org/std/standing-documentssd-6-sg10-feature-test-recommendations +// gcc < 5 does not define __cpp_exceptions but __EXCEPTIONS, +// similar for __GXX_RTTI +// _CPPUNWIND and _CPPRTTI is used by MSVC, see +// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macrosview=vs-2019 +#ifndef MEGDNN_ENABLE_EXCEPTIONS + #if __cpp_exceptions || __EXCEPTIONS || \ + (defined(_MSC_VER) && defined(_CPPUNWIND)) + #define MEGDNN_ENABLE_EXCEPTIONS 1 + #else + #define MEGDNN_ENABLE_EXCEPTIONS 0 + #endif +#endif +#ifndef MEGDNN_ENABLE_RTTI + #if __cpp_rtti || __GXX_RTTI || (defined(_MSC_VER) && defined(_CPPRTTI)) + #define MEGDNN_ENABLE_RTTI 1 + #else + #define MEGDNN_ENABLE_RTTI 0 + #endif +#endif + +#ifdef __CUDACC__ + #define MEGDNN_CC_CUDA 1 + #undef MEGDNN_CONSTEXPR + #define MEGDNN_CONSTEXPR const + +#if defined(__CUDACC_VER_MAJOR__) +#if __CUDACC_VER_MAJOR__ >= 9 + #undef MEGDNN_STATIC_ASSERT + #define MEGDNN_STATIC_ASSERT(cond, msg) static_assert(cond, msg); +#else + #undef MEGDNN_STATIC_ASSERT + #define MEGDNN_STATIC_ASSERT(cond, msg) +#endif +#endif + + #define nullptr NULL + #undef MEGDNN_FINAL + #define MEGDNN_FINAL +#elif defined(__HIPCC__) + #define MEGDNN_CC_CUDA 1 +#else + #define MEGDNN_CC_HOST 1 +#endif // __CUDACC__ + +// MEGDNN_HOST and MEGDNN_DEVICE +#if MEGDNN_CC_CUDA + #define MEGDNN_HOST __host__ + #define MEGDNN_DEVICE __device__ +#else + #define MEGDNN_HOST + #define MEGDNN_DEVICE +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/basic_types.h b/dnn/include/megdnn/basic_types.h new file mode 100644 index 00000000..6c8c8cf1 --- /dev/null +++ b/dnn/include/megdnn/basic_types.h @@ -0,0 +1,513 @@ +/** + * \file dnn/include/megdnn/basic_types.h + * MegEngine is Licensed under 
the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/arch.h" +#include "megdnn/dtype.h" +#include "megdnn/internal/defs.h" + +#if MEGDNN_CC_HOST +#include +#include +#include +#include +#include "megdnn/thin/small_vector.h" +#endif // MEGDNN_CC_HOST + +#include "megdnn/internal/visibility_prologue.h" + +namespace megdnn { + +class ErrorHandler { +#if MEGDNN_CC_HOST + static ErrorHandler* sm_inst; + static ErrorHandler* inst(); + +protected: + MEGDNN_NORETURN virtual void do_on_megdnn_error(const std::string& msg) = 0; + + MEGDNN_NORETURN virtual void do_on_tensor_reshape_error( + const std::string& msg) { + on_megdnn_error(msg); + } + + ~ErrorHandler() = default; + +#endif +public: + //! called on general megdnn error + MEGDNN_NORETURN static void on_megdnn_error(const char* msg); + + //! called on tensor reshape error + MEGDNN_NORETURN static void on_tensor_reshape_error(const char* msg); + +#if MEGDNN_CC_HOST + MEGDNN_NORETURN static void on_megdnn_error(const std::string& msg); + MEGDNN_NORETURN static void on_tensor_reshape_error(const std::string& msg); + + /*! + * \brief set the global error handler instance + * + * This method is not thread-safe. The caller is responsible to ensure the + * ErrorHandler is a global object with enough life span. + * + * \return original error handler + */ + static void set_handler(ErrorHandler* handler); + +#endif // MEGDNN_CC_HOST +}; + +#if MEGDNN_CC_HOST +enum class LogLevel { DEBUG, INFO, WARN, ERROR }; + +typedef void (*LogHandler)(LogLevel level, const char* file, const char* func, + int line, const char* fmt, va_list ap); + +/*! + * \brief set the callback to receive all log messages + * + * Note: the log handler can be NULL (which is also the default value). In this + * case, no log message would be recorded. + * + * \return original log handler + */ +LogHandler set_log_handler(LogHandler handler); +#endif + +/** + * \brief Describing the tensor shape. + * + * Uninitialized shape: ndim == 0; total_nr_elems() is also defined to be 0 + * + * Empty shape: ndim > 0 && shape[i] == 0 for 0 <= i < ndim; it is always + * considered non-contiguous. + */ +struct TensorShape { + static MEGDNN_CONSTEXPR size_t MAX_NDIM = MEGDNN_MAX_NDIM; + +#if MEGDNN_CC_HOST + size_t shape[MAX_NDIM], ndim = 0; +#else + size_t shape[MAX_NDIM], ndim; +#endif + +#if MEGDNN_CC_HOST + TensorShape() = default; + TensorShape(const TensorShape& rhs) = default; + TensorShape(const SmallVector& init_shape); + TensorShape(std::initializer_list init_shape); + std::string to_string() const; +#endif + + //! total number of elements + size_t total_nr_elems() const; + + //! check whether two shapes are equal + bool eq_shape(const TensorShape& rhs) const; + + //! check whether the shape can be treated as a scalar + bool is_scalar() const { return ndim == 1 && shape[0] == 1; } + + //! check whether ndim != 0 and at least one shape is 0 + bool is_empty() const; + + //! access single element, without boundary check + size_t& operator[](size_t i) { return shape[i]; } + size_t operator[](size_t i) const { return shape[i]; } +}; + +class Handle; +/** + * \brief Describing the tensor shape with its actual layout in memory and dtype + * + * x(i, j, ...) 
is stored at offset + * stride[0]*i + stride[1]*j + ..., in number of elements; physical offset needs + * to be multiplied by dtype size. + */ +struct TensorLayout : public TensorShape { + /*! + * \brief Describes min and max offsets of tensor elements with respect to + * its first element, so all tensor elements are guaranteed to be in + * the range [elem[0]+low, elem[0]+high). + */ + struct Span { + ptrdiff_t low_elem, low_byte; + size_t high_elem, high_byte; + + Span(ptrdiff_t low_elem, ptrdiff_t low_byte, size_t high_elem, + size_t high_byte) + : low_elem(low_elem), + low_byte(low_byte), + high_elem(high_elem), + high_byte(high_byte) {} + size_t dist_elem() const { return high_elem - low_elem; } + + size_t dist_byte() const { return high_byte - low_byte; } + }; + + /*! + * \brief Describing the requirements for tensor layouts + * + * Some runtime (e.g. opencl) may have alignment requirements for special + * memory types (e.g. image in texture memory). Format objects can be used + * to impose such constraints on methods related to tensor strides. + * + * Note that ImplBase is defined in tensor_format.h + */ + class Format { + public: + class ImplBase; + +#if MEGDNN_CC_HOST + Format(); + + const ImplBase* impl() const { return m_impl; } + + enum class Type; + + //! get impl type; defined in tensor_format.h + inline Type type() const; + + //! convert to the implementation class; exception would be raised if + //! type mismatches + template + const Impl& as_impl() const { + static_assert(std::is_base_of::value, "bad type"); + if (type() != Impl::TYPE) { + on_bad_cvt(Impl::TYPE); + } + return *static_cast(m_impl); + } + + //! get human-readable string description of this format + std::string to_string() const; + + std::string serialize() const; + static Format deserialize(const std::string& bin, const Handle* handle); + + //! whether this is the default tensor format + bool is_default() const; + + bool operator==(Format rhs) const { return m_impl == rhs.m_impl; } + bool operator!=(Format rhs) const { return m_impl != rhs.m_impl; } +#endif + + private: + const ImplBase* m_impl; + +#if MEGDNN_CC_HOST + Format(ImplBase* impl) : m_impl{impl} {} + MEGDNN_NORETURN void on_bad_cvt(Type dst_type) const; +#endif + }; + + ptrdiff_t stride[MAX_NDIM]; + DType dtype; + Format format; + +#if MEGDNN_CC_HOST + TensorLayout(); + + TensorLayout(const TensorLayout& layout) = default; + + //! create empty layout with given dtype + explicit TensorLayout(DType dtype_); + + TensorLayout(DType dtype_, Format format); + + //! create layout with given shape and contiguous stride. + TensorLayout(const TensorShape& shape, DType dtype); + + TensorLayout(const TensorShape& shape, DType dtype, Format format); + + //! creating layout with user-specified shape and stride. + TensorLayout(const TensorShape& shape, const std::vector& stride, + DType dtype); + + TensorLayout(const TensorShape& shape, const std::vector& stride, + DType dtype, Format format); + + /* =================== inplace modifiers =================== */ + + /*! + * \brief init stride to be contiguous + * + * Use current shape and format + * + * \return total number of elements + */ + size_t init_contiguous_stride(); + + /*! + * \brief init stride to be contiguous by first assigning shape + * + * Use current format. + */ + size_t init_contiguous_stride(const TensorShape& shape); + + size_t init_contiguous_stride(const TensorShape& shape, Format format); + + /*! + * \brief inplace version of remove_axis + */ + void remove_axis_inplace(size_t idx); + + /*! 
+ * \brief add an axis before given *axis* with given shape and stride + * + * Other shapes and strides would not be changed. + */ + void add_axis_inplace(size_t axis, size_t shape, ptrdiff_t stride); + + /*! + * \brief add an axis before given *axis*, with shape 1 and contiguous + * stride + */ + void add_axis_cont_inplace(size_t axis) { + add_axis_inplace(axis, 1, stride[axis] * shape[axis]); + } + + /* =================== generate new layout =================== */ + + /** + * \brief Returns the layout with permuted dimensions. + * + * example: + * (2, 0, 1) -> AxBxC to CxAxB + */ + TensorLayout dimshuffle(const std::vector& dims) const + MEGDNN_WARN_UNUSED_RESULT; + + /** + * \brief Remove an axis from the layout by moving later shape/stride + * elements earlier. No extra check is performed. + */ + TensorLayout remove_axis(size_t idx) const MEGDNN_WARN_UNUSED_RESULT; + + /** + * \brief Returns a different view. + * + * \throw TensorReshapeError if no stride exists for target shape. + */ + TensorLayout reshape(const TensorShape& shape) const + MEGDNN_WARN_UNUSED_RESULT; + + /*! + * \brief try to reshape to another view; return whether these two shapes + * are compatible + * \return true iff there exists target stride so this layout can be + * converted to target shape and the elements can match. + */ + bool try_reshape(TensorLayout& output, + const TensorShape& shape) const MEGDNN_WARN_UNUSED_RESULT; + + /*! + * \brief Broadcast on dims with shape == 1 to match target *shape*. + * \throw TensorReshapeError if could not be satisfied + */ + TensorLayout broadcast(const TensorShape& shape) const + MEGDNN_WARN_UNUSED_RESULT; + + /*! + * \brief Collapse consecutive axes with contiguous layout together + * + * This transforms the tensor into a canonized form. For empty tensors or + * scalar, the result would always be a one-dimensional empty or scalar, + * with stride being 1. + */ + TensorLayout collapse_contiguous() const MEGDNN_WARN_UNUSED_RESULT; + + /* =================== properties =================== */ + + std::string to_string() const; +#endif // MEGDNN_CC_HOST + + /*! + * \brief check whether the is contiguous under its format definition + * + * See is_contiguous_spec() in Format impl classes for more detail. When the + * format is default, this is equivalent to is_physical_contiguous(). + * + * Note that empty tensors (i.e. with 0 shapes) are not considered as + * contiguous. + */ + bool is_contiguous() const; + + //! check whether it is physically contiguous disregarding format + bool is_physical_contiguous() const; + + /*! + * \brief check whether the layout is monotonous + * + * A tensor is monotonous if abs(stride[i]) >= abs(stride[i+1])*shape[i+1] + */ + bool is_abs_monotonous_allow_brdcst() const; + + /*! + * \brief check whether the layout is contiguous, allowing broadcasting + * + * This checks whether the underlying storage is contiguous, where + * broadcasting is also considered to be so. + */ + bool is_contiguous_allow_brdcst() const; + + /*! + * \brief if this function returns true, then no two elements can occupy the + * same memory slot + * + * Note that this test is a sufficient but not necessary condition for the + * layout being non-overlapping: when this function returns false, it is + * still possible that actually no two elements share the same memory + * location. + */ + bool is_non_overlapping_strong() const; + + bool eq_layout(const TensorLayout& rhs) const; + + //! 
get lowest and highest offset reachable from this layout + Span span() const; +}; + +/** + * \brief A simple encapsulation class for n-dimensional tensor. + */ +struct TensorND { + void* raw_ptr; + TensorLayout layout; + + TensorND() : raw_ptr(NULL) {} + + TensorND(void* raw_ptr_, const TensorLayout& layout_) + : raw_ptr(raw_ptr_), layout(layout_) {} + + //! get typed pointer; type check is performed + template + T* ptr() const { + layout.dtype.assert_is_ctype(); + return static_cast(raw_ptr); + } + + //! get typed pointer of compatible type + template + T* compatible_ptr() const { + layout.dtype.assert_is_compatible_ctype(); + return reinterpret_cast(raw_ptr); + } +}; + +#if MEGDNN_CC_HOST +using TensorFormat = TensorLayout::Format; +using TensorShapeArray = SmallVector; +using TensorNDArray = SmallVector; +using TensorLayoutArray = SmallVector; +using TensorLayoutPtrArray = SmallVector; +using TensorFormatArray = SmallVector; +#endif + +/** + * \brief A struct representing workspace. + * + * It differs from TensorND in that workspace does not have a "layout" concept. + */ +struct Workspace { + dt_byte* raw_ptr; + size_t size; + + Workspace() : raw_ptr(NULL), size(0) {} + + Workspace(dt_byte* raw_ptr_, size_t size_) + : raw_ptr(raw_ptr_), size(size_) {} + + template + T* ptr(size_t offset_in_bytes = 0) const { + return static_cast(static_cast(raw_ptr + offset_in_bytes)); + } +}; + +#if MEGDNN_CC_HOST + +/*! + * \brief manage output and workspace memory for dynamic output oprs + */ +class DynOutMallocPolicy { +protected: + ~DynOutMallocPolicy() = default; + +public: + /*! + * \brief allocate an output var + * \param id output index, starting from 0 + * \param dtype requested output data type + * \param shape requested output shape + * \param user_data extra user data passed in DynOutMallocPolicyCall + */ + virtual TensorND alloc_output(size_t id, DType dtype, + const TensorShape& shape, + void* user_data) = 0; + + /*! + * \brief allocate workspace memory + * \param sz requested workspace in bytes + */ + virtual void* alloc_workspace(size_t sz, void* user_data) = 0; + + /*! + * \brief free workspace memory + * + * Every operator should guarantee that alloc_workspace() and + * free_workspace() calls are matched + */ + virtual void free_workspace(void* ptr, void* user_data) = 0; +}; + +/*! + * \brief bind a DynOutMallocPolicy with arbitrary user data + */ +struct DynOutMallocPolicyCall { + DynOutMallocPolicy* policy; + void* user_data; + + DynOutMallocPolicyCall(DynOutMallocPolicy* p = nullptr, void* ud = nullptr) + : policy{p}, user_data{ud} {} + + TensorND alloc_output(size_t id, DType dtype, const TensorShape& shape) { + return policy->alloc_output(id, dtype, shape, user_data); + } + + /*! 
+ * \brief allocate workspace with return type conversion + * \tparam elem element type for size calculation + * \param nr_elem number of elements; allocated size is sizeof(elem) * + * nr_elem + */ + template + T* alloc_workspace(size_t nr_elem) { + using real_elem = + typename std::conditional::value, + uint8_t, elem>::type; + return static_cast(policy->alloc_workspace( + nr_elem * sizeof(real_elem), user_data)); + } + + void free_workspace(void* ptr) { + return policy->free_workspace(ptr, user_data); + } +}; + +#endif // MEGDNN_CC_HOST + +} // namespace megdnn + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/config/config.h b/dnn/include/megdnn/config/config.h new file mode 100644 index 00000000..5f144f2a --- /dev/null +++ b/dnn/include/megdnn/config/config.h @@ -0,0 +1,31 @@ +/** + * \file dnn/include/megdnn/config/config.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#if !defined(__CUDACC__) + +// Try to detect if no architecture flags defined. +#if !defined(MEGDNN_NAIVE) && !defined(MEGDNN_X86) && \ + !defined(MEGDNN_X86_64) && !defined(MEGDNN_X86_32) && \ + !defined(MEGDNN_64_BIT) && !defined(MEGDNN_MIPS) && \ + !defined(MEGDNN_ARMV7) && !defined(MEGDNN_AARCH64) +#if defined(__x86_64__) || defined(_M_X64) +#define MEGDNN_X86 1 +#define MEGDNN_X86_64 1 +#define MEGDNN_64_BIT 1 +#elif defined(__i386) || defined(_M_IX86) +#define MEGDNN_X86 1 +#define MEGDNN_X86_32 1 +#endif +#endif + +#endif // !defined(__CUDACC__) + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/cuda.h b/dnn/include/megdnn/cuda.h new file mode 100644 index 00000000..afed2cfd --- /dev/null +++ b/dnn/include/megdnn/cuda.h @@ -0,0 +1,27 @@ +/** + * \file dnn/include/megdnn/cuda.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/basic_types.h" + +#include +#include + +#include "megdnn/internal/visibility_prologue.h" +namespace megdnn { + +std::unique_ptr make_cuda_handle_with_stream(cudaStream_t stream, + int device_id = -1); +cudaStream_t get_cuda_stream(Handle *handle); + +} // namespace megdnn +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/dtype.h b/dnn/include/megdnn/dtype.h new file mode 100644 index 00000000..aae14fec --- /dev/null +++ b/dnn/include/megdnn/dtype.h @@ -0,0 +1,965 @@ +/** + * \file dnn/include/megdnn/dtype.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
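TensorLayout above addresses element x(i, j, ...) at offset stride[0]*i + stride[1]*j + ..., counted in elements rather than bytes, and broadcast() can repeat data along a dimension by giving it a zero stride. A small standalone illustration of that convention (it does not use the megdnn classes themselves):

    #include <cstddef>
    #include <cstdio>

    // offset of element (i, j) under the stride convention documented for
    // TensorLayout: offset = stride[0]*i + stride[1]*j (in elements, not bytes)
    static std::ptrdiff_t offset2d(std::ptrdiff_t s0, std::ptrdiff_t s1,
                                   std::size_t i, std::size_t j) {
        return s0 * (std::ptrdiff_t)i + s1 * (std::ptrdiff_t)j;
    }

    int main() {
        // contiguous 2x3 tensor: strides (3, 1); element (1, 2) lives at offset 5
        std::printf("contiguous (1,2) -> %td\n", offset2d(3, 1, 1, 2));
        // broadcasting a 1x3 row to 4x3 sets stride[0] = 0, so every row aliases
        // the same three elements; element (3, 2) still maps to offset 2
        std::printf("broadcast  (3,2) -> %td\n", offset2d(0, 1, 3, 2));
        return 0;
    }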
+ */ + +#pragma once +#include "megdnn/arch.h" + +#include +#include +#include +#include + +#ifdef MEGDNN_CC_HOST +#include +#include +#endif + +#include "megdnn/internal/visibility_prologue.h" + +#if MEGDNN_DISABLE_FLOAT16 +#define MEGDNN_INC_FLOAT16(_x) +#define MEGDNN_FLOAT16_SELECT(_x, _y) _y +#else +#include "megdnn/dtype/half.hpp" +#define MEGDNN_INC_FLOAT16(_x) _x +#define MEGDNN_FLOAT16_SELECT(_x, _y) _x +#endif + +namespace megdnn { + +/*! + * \brief iterate through each dtype name + */ +#define MEGDNN_FOREACH_DTYPE_NAME(cb) \ + cb(Float32) \ + cb(Uint8) \ + cb(Int8) \ + cb(Int16) \ + cb(Int32) \ + cb(IntB1) \ + cb(IntB2) \ + cb(IntB4) \ + cb(Byte) \ + MEGDNN_INC_FLOAT16(cb(Float16)) \ + cb(UintB4) \ + +/*! + * \brief iterate through each full byte dtype + */ +#define MEGDNN_FOREACH_FULL_BYTE_DTYPE(cb) \ + cb(Float32) \ + cb(Uint8) \ + cb(Int8) \ + cb(Int16) \ + cb(Int32) \ + cb(Byte) \ + MEGDNN_INC_FLOAT16(cb(Float16)) \ + +/*! + * \brief iterate through each fractional byte dtype + */ +#define MEGDNN_FOREACH_LOWBIT_DTYPE(cb) \ + cb(IntB, 1)\ + cb(IntB, 2)\ + cb(IntB, 4)\ + cb(UintB, 4)\ + +// This is used to make enum definition possible. +#define MEGDNN_FOREACH_PARAMETERIZED_DTYPE_FIRST(cb) \ + cb(Quantized8Asymm) + +#define MEGDNN_FOREACH_PARAMETERIZED_DTYPE_OTHERS(cb) \ + cb(QuantizedS32) \ + cb(QuantizedS8) \ + cb(Quantized4Asymm) \ + cb(QuantizedS4) \ + cb(QuantizedS16) + +#define MEGDNN_FOREACH_PARAMETERIZED_DTYPE_2(cb_first, cb_others) \ + MEGDNN_FOREACH_PARAMETERIZED_DTYPE_FIRST(cb_first) \ + MEGDNN_FOREACH_PARAMETERIZED_DTYPE_OTHERS(cb_others) + +/*! + * \brief iterate through each parameterized dtype + */ +#define MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) \ + MEGDNN_FOREACH_PARAMETERIZED_DTYPE_FIRST(cb) \ + MEGDNN_FOREACH_PARAMETERIZED_DTYPE_OTHERS(cb) + +/*! + * \brief iterate through each dtype object that can be involved in float + * numeric computing + */ +#define MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) \ + cb(::megdnn::dtype::Float32) \ + MEGDNN_INC_FLOAT16(cb(::megdnn::dtype::Float16)) \ + +/*! + * \brief iterate through each dtype object that can be involved in integer + * numeric computing + */ +#define MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) \ + cb(::megdnn::dtype::Int32) \ + cb(::megdnn::dtype::Int16) \ + cb(::megdnn::dtype::Int8) \ + cb(::megdnn::dtype::Uint8) \ + +/*! + * \brief iterate through each dtype object that can be involved in numeric + * computing (i.e. dtypes except Byte) + */ +#define MEGDNN_FOREACH_COMPUTING_DTYPE(cb) \ + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) \ + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) + +//! In order to avoid an unnecessary increase in binary size, we just +//! use QuantizedS16 dtype in winograd_filter_preprocess now. So I didn't add +//! this data type here. +#define MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) \ + cb(::megdnn::dtype::Quantized8Asymm) \ + cb(::megdnn::dtype::QuantizedS32) \ + cb(::megdnn::dtype::QuantizedS8) \ + +#define MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb) \ + cb(::megdnn::dtype::Quantized4Asymm) \ + cb(::megdnn::dtype::QuantizedS4) + +/*! + * \brief a POD representation of a single byte + * + * Byte is used as storage of unspecific raw data, and should not be involved in + * any computing. + */ +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-private-field" +#endif +class dt_byte { + unsigned char _; + + public: + + //! convert to given type + template + T* as() { + return reinterpret_cast(this); + } + + //! 
convert to given type + template + const T* as() const { + return reinterpret_cast(this); + } +} MEGDNN_PACKED; + +#define DEFINE_LOWBIT(_name, b) \ + class dt_##_name##b {\ + unsigned char _;\ + } MEGDNN_PACKED; +MEGDNN_FOREACH_LOWBIT_DTYPE(DEFINE_LOWBIT) +#undef DEFINE_LOWBIT + +class dt_quint8 { + uint8_t _; + + public: + //! Convert to normal uint8_t + MEGDNN_DEVICE uint8_t as_uint8() const { + return _; + } + + MEGDNN_HOST MEGDNN_DEVICE explicit dt_quint8(uint8_t val):_(val) {} +#ifdef MEGDNN_CC_HOST + explicit operator uint8_t() { return _; } +#endif + bool operator<(const dt_quint8& b) const { return _ < b._; } + bool operator>(const dt_quint8& b) const { return _ > b._; } +} MEGDNN_PACKED; + +class dt_qint32 { + int32_t _; + + public: + //! Convert to normal uint32_t + MEGDNN_DEVICE int32_t as_int32() const { + return _; + } + + MEGDNN_HOST MEGDNN_DEVICE explicit dt_qint32(int32_t val):_(val) {} +#ifdef MEGDNN_CC_HOST + explicit operator int32_t() { return _; } +#endif + dt_qint32 operator*(const dt_qint32& b) const { + return dt_qint32(_ * b._); + } + dt_qint32 operator+(const dt_qint32& b) const { + return dt_qint32(_ + b._); + } + dt_qint32 operator-(const dt_qint32& b) const { + return dt_qint32(_ - b._); + } +#ifdef MEGDNN_CC_HOST + dt_qint32 operator/(int b) const { + return dt_qint32(std::round(_ / static_cast(b))); + } + dt_qint32 operator/(const dt_qint32& b) const { + return dt_qint32(std::round(_ / static_cast(b._))); + } +#endif + dt_qint32 operator+=(const dt_qint32& b) { + _ += b._; + return *this; + } + bool operator<(const dt_qint32& b) const { return _ < b._; } + bool operator>(const dt_qint32& b) const { return _ > b._; } +} MEGDNN_PACKED; + +class dt_qint8 { + int8_t _; + + public: + MEGDNN_DEVICE int8_t as_int8() const { + return _; + } + + MEGDNN_HOST MEGDNN_DEVICE explicit dt_qint8(int8_t val):_(val) {} +#ifdef MEGDNN_CC_HOST + explicit operator int8_t() { return _; } +#endif + bool operator<(const dt_qint8& b) const { return _ < b._; } + bool operator>(const dt_qint8& b) const { return _ > b._; } +} MEGDNN_PACKED; + +class dt_qint16 { + int16_t _; + + public: + //! Convert to normal int16_t + MEGDNN_DEVICE int16_t as_int16() const { + return _; + } + + MEGDNN_HOST MEGDNN_DEVICE explicit dt_qint16(int16_t val):_(val) {} +#ifdef MEGDNN_CC_HOST + explicit operator int16_t() { return _; } +#endif + dt_qint16 operator*(const dt_qint16& b) const { + return dt_qint16(_ * b._); + } + dt_qint16 operator+(const dt_qint16& b) const { + return dt_qint16(_ + b._); + } + dt_qint16 operator-(const dt_qint16& b) const { + return dt_qint16(_ - b._); + } +#ifdef MEGDNN_CC_HOST + dt_qint16 operator/(int b) const { + return dt_qint16(std::round(_ / static_cast(b))); + } + dt_qint16 operator/(const dt_qint16& b) const { + return dt_qint16(std::round(_ / static_cast(b._))); + } +#endif + dt_qint16 operator+=(const dt_qint16& b) { + _ += b._; + return *this; + } + bool operator<(const dt_qint16& b) const { return _ < b._; } + bool operator>(const dt_qint16& b) const { return _ > b._; } +} MEGDNN_PACKED; + +template +class dt_qulowbit { + uint8_t _; + public: + //! 
Convert to normal uint8_t + MEGDNN_DEVICE uint8_t as_uint8() const { + return _; + } + + MEGDNN_HOST MEGDNN_DEVICE explicit dt_qulowbit(uint8_t val):_(val) {} +#ifdef MEGDNN_CC_HOST + explicit operator uint8_t() { return _; } +#endif + bool operator<(const dt_qulowbit& b) const { return _ < b._; } + bool operator>(const dt_qulowbit& b) const { return _ > b._; } + + dt_qulowbit& operator=(const uint8_t val) { + _ = val; + return *this; + } +}; +using dt_quint4 = dt_qulowbit<4>; + +template +class dt_qlowbit { + int8_t _; + + public: + //! Convert to normal int8_t + MEGDNN_DEVICE int8_t as_int8() const { + return _; + } + + MEGDNN_HOST MEGDNN_DEVICE explicit dt_qlowbit(int8_t val):_(val) {} +#ifdef MEGDNN_CC_HOST + explicit operator int8_t() { return _; } +#endif + bool operator<(const dt_qlowbit& b) const { return _ < b._; } + bool operator>(const dt_qlowbit& b) const { return _ > b._; } + + dt_qlowbit& operator=(const int8_t val) { + _ = val; + return *this; + } +}; +using dt_qint4 = dt_qlowbit<4>; + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +MEGDNN_STATIC_ASSERT(sizeof(dt_byte) == 1, "bad dt_byte size"); +MEGDNN_STATIC_ASSERT(sizeof(dt_quint8) == 1, "bad dt_quint8 size"); +MEGDNN_STATIC_ASSERT(sizeof(dt_qint16) == 2, "bad dt_qint16 size"); +MEGDNN_STATIC_ASSERT(sizeof(dt_qint32) == 4, "bad dt_qint32 size"); +typedef float dt_float32; +typedef int32_t dt_int32; +typedef int16_t dt_int16; +typedef int8_t dt_int8; +typedef uint8_t dt_uint8; +MEGDNN_INC_FLOAT16(typedef half_float::half dt_float16;) + +#define MEGDNN_PARAMETERIZED_DTYPE_ENUM_BASE 100000 +#if MEGDNN_CC_HOST + //! enumeration of dtypes; useful for hash or being used in switch-case + enum class DTypeEnum: uint32_t { +#else + struct DTypeEnum { + enum Ev { +#endif + Float32, + Uint8, + Int8, + Int16, + Int32, + IntB1, + IntB2, + IntB4, + Byte, +#if !MEGDNN_DISABLE_FLOAT16 + Float16, +#endif + UintB4 = 10, + + #define FST(_name) _name = MEGDNN_PARAMETERIZED_DTYPE_ENUM_BASE, + #define D(_name) _name, + MEGDNN_FOREACH_PARAMETERIZED_DTYPE_2(FST, D) + #undef D + #undef FST +#if !MEGDNN_CC_HOST + }; + uint32_t ev; + DTypeEnum(): ev(0) {} + DTypeEnum(uint32_t e): ev(e) {} +#endif + }; + +#if MEGDNN_CC_HOST + //! dtype numeric category fo + enum class DTypeCategory: int { + OTHER, FLOAT, INT, LOWBIT, QUANTIZED + }; + //! dtype signedness + enum class DTypeSignedness: int { + OTHER, UNSIGNED, SIGNED + }; +#else + struct DTypeCategory { + enum Ev { + OTHER, FLOAT, INT, LOWBIT, QUANTIZED + }; + int ev; + }; + struct DTypeSignedness { + enum Ev { + OTHER, UNSIGNED, SIGNED + }; + int ev; + }; +#endif + +/*! + * \brief information about a data type that can be accessed at compile time + * \tparam DTypeImpl either an implementation class (e.g. dtype::Int32), or a + * plain c type (e.g. int or dt_int32) + */ +template +struct DTypeTrait; + +// This can be specialized to define custom param structures for each +// parameterized DType, it should implement `std::size_t hash()` and +// `bool operator==(rhs).` +template +struct DTypeParamImpl; + +template +using DTypeParam = DTypeParamImpl::ctype>; + +/*! 
+ * \brief Information about a data type that can be accessed at runtime + */ +class DType { + private: + MEGDNN_NORETURN void on_request_lowbit_size() const; + // HACK: This is required in ParameterizedDType::downcast_from + public: + MEGDNN_NORETURN void on_assert_is_failed(const char *rname) const; + protected: + struct Trait { + const char *const name; + const uint16_t size_log; //!< log2 of sizeof(dt) for non-lowbit + const uint16_t low_bit; //!< 0 for non-lowbit; otherwise num bits + DTypeEnum enumv; + DTypeCategory category; + DTypeSignedness signedness; + const bool has_param; + }; + Trait *m_trait; + + explicit DType(Trait *t): + m_trait(t) + {} + + public: + DType(): + m_trait(nullptr) + {} + + bool valid() const { + return m_trait != nullptr; + } + + /*! + * \brief name of this data type + */ + const char *name() const { + return m_trait ? m_trait->name : "invalid"; + } + + /*! + * \brief size of elem_num this data type, if fraction form return ceil + */ + size_t size(size_t elem_num) const { + if (m_trait->low_bit != 0) + return static_cast( (m_trait->low_bit*elem_num + 7)/8 ); + return elem_num << m_trait->size_log; + } + + /*! + * \brief max number of elements within representation + * + * The total size of the tensor (in bytes) should not exceed size_t range. + */ + size_t max_elements() const { + if (m_trait->low_bit != 0) + return std::numeric_limits::max(); + + return std::numeric_limits::max() >> m_trait->size_log; + } + + bool is_low_bit() const { + return m_trait->low_bit != 0; + } + + /*! + * \brief size of this data type, in bytes + */ + size_t size() const { + if (m_trait->low_bit == 0) + return 1 << m_trait->size_log; + on_request_lowbit_size(); + } + + //! size() in log2 + size_t size_log() const { + if (m_trait->low_bit == 0) + return m_trait->size_log; + on_request_lowbit_size(); + } + + //! assert this dtype is given type; throw exception on failure + void assert_is(const DType &rhs) const { + if (m_trait != rhs.m_trait) + on_assert_is_failed(rhs.name()); + } + + template + inline void assert_is_ctype() const; + + template + inline void assert_is_compatible_ctype() const; + + //! get corresponding enum value for this dtype + DTypeEnum enumv() const { + return m_trait->enumv; + } + + //! get category of this data type + DTypeCategory category() const { + return m_trait->category; + } + + //! get signedness of this data type + DTypeSignedness signedness() const { + return m_trait->signedness; + } + + bool has_param() const { + return m_trait->has_param; + } + + bool operator == (const DType &rhs) const { + return m_trait == rhs.m_trait; + } + + bool operator != (const DType &rhs) const { + return m_trait != rhs.m_trait; + } + + //! get dtype object from enum + static DType from_enum(DTypeEnum ev); + + //! get a handle of the dtype that could be used for equivalence check + const void* handle() const { + return m_trait; + } + + template + T as() const { + return T::downcast_from(*this); + } + + template + const DTypeParam& param() const { + return as::dtype>().param(); + } +}; + +#ifdef MEGDNN_CC_HOST + +/*! + * \brief class template for parameterized DTypes + * + * You should not change this template in order to add new parameterized + * DType, instead you should add new entry to + * MEGDNN_FOREACH_PARAMETERIZED_DTYPE_OTHERS, follow the compile error, then add + * new specialization of DTypeParam at the end of this file. 
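DType::size(elem_num) above packs low-bit dtypes into whole bytes (rounding up) and shifts full-byte dtypes by size_log. A standalone restatement of that rule with a few sample values (storage_bytes is an illustrative helper name, not megdnn API):

    #include <cstddef>
    #include <cstdio>

    // storage size rule mirrored from DType::size(elem_num):
    //   low-bit dtypes: ceil(low_bit * elem_num / 8) bytes
    //   full-byte dtypes: elem_num << size_log, where size_log = log2(sizeof(ctype))
    static std::size_t storage_bytes(std::size_t elem_num, unsigned low_bit,
                                     unsigned size_log) {
        if (low_bit != 0)
            return (low_bit * elem_num + 7) / 8;
        return elem_num << size_log;
    }

    int main() {
        std::printf("%zu\n", storage_bytes(10, 0, 2));  // 10 x Float32 -> 40 bytes
        std::printf("%zu\n", storage_bytes(10, 4, 0));  // 10 x IntB4   -> 5 bytes
        std::printf("%zu\n", storage_bytes(3, 1, 0));   // 3 x IntB1    -> 1 byte
        return 0;
    }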
+ */ +template +class ParameterizedDType MEGDNN_FINAL : public DType { + using SelfType = ParameterizedDType; + + struct Trait : DType::Trait { + DTypeParam param; + + Trait(const DType::Trait& static_trait, + const DTypeParam& param) + : DType::Trait(static_trait), param(param) {} + }; + + // static part of the trait + static DType::Trait sm_trait; + + static Trait* make_from_param(const DTypeParam& param); + explicit ParameterizedDType(DType dtype) : DType(dtype) {} + +public: + template + explicit ParameterizedDType(Args&&... args) + : DType(make_from_param({std::forward(args)...})) {} + +/** + * static member \c sm_trait is been used, the compiler wil trigger + * warnings if it hasn't an explicit instantiation declaration with include dir + * using \c -I; while build by bazel, include dir is traited as system headers, + * using \c -isystem, and the warnings is supressed. + * + * Here we just supressed the warning, as it will explicit instantiation in + * \c dtype.cpp. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic ignored "-Wundefined-var-template" + static SelfType downcast_from(DType dtype) { + if (dtype.enumv() != type_enum) { + dtype.on_assert_is_failed(sm_trait.name); + } + return ParameterizedDType(dtype); + } +#pragma GCC diagnostic pop + + const DTypeParam& param() { + return static_cast(m_trait)->param; + } +}; + +#endif // MEGDNN_CC_HOST + +//! dtype implementation classes +namespace dtype { + +#define IMPL(_name) \ + class _name MEGDNN_FINAL: public DType { \ + static Trait sm_trait; \ + public: \ + _name(): DType(&sm_trait) {} \ + }; + +MEGDNN_FOREACH_DTYPE_NAME(IMPL) +#undef IMPL + +#ifdef MEGDNN_CC_HOST +#define cb(_name) using _name = ParameterizedDType; +#else +#define cb(_name) \ + class _name MEGDNN_FINAL : public DType {}; +#endif +MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) +#undef cb + +//! 
log function used in DTypeTrait +template struct log { + static MEGDNN_CONSTEXPR size_t value = log<(n>>1)>::value + 1; +#if MEGDNN_CC_HOST + MEGDNN_STATIC_ASSERT( (n&(n-1)) == 0, "only full power number can have log"); +#endif +}; +template<> struct log<1> {static MEGDNN_CONSTEXPR size_t value = 0;}; + +} // namespace dtype + +// begin define DTypeTrait impls { + +#if MEGDNN_CC_HOST +#define MEGDNN_DEF_DT_BASIC_FIELDS(_name, _ctype, _cat, _sign, _bits, \ + _has_param) \ + static MEGDNN_CONSTEXPR const char *name = #_name; \ + using ctype = _ctype; \ + using dtype = ::megdnn::dtype::_name; \ + static MEGDNN_CONSTEXPR DTypeCategory category = DTypeCategory::_cat; \ + static MEGDNN_CONSTEXPR DTypeSignedness \ + signedness = DTypeSignedness::_sign; \ + static MEGDNN_CONSTEXPR uint16_t size_log = \ + ::megdnn::dtype::log::value; \ + static MEGDNN_CONSTEXPR DTypeEnum enumv = DTypeEnum::_name;\ + static MEGDNN_CONSTEXPR uint16_t low_bit = _bits;\ + static MEGDNN_CONSTEXPR bool has_param = _has_param +#else +#define MEGDNN_DEF_DT_BASIC_FIELDS(_name, _ctype, _cat, _sign, _bits, \ + _has_param) \ + typedef _ctype ctype; \ + typedef ::megdnn::dtype::_name dtype; \ + static const uint16_t size_log = \ + ::megdnn::dtype::log::value; \ + static MEGDNN_CONSTEXPR int enumv = DTypeEnum::_name;\ + static MEGDNN_CONSTEXPR uint16_t low_bit = _bits +#endif // MEGDNN_CC_HOST + +#define MEGDNN_DEF_DT(_name, _ctype, _cat, _sign, _minval, _maxval) \ + template <> \ + struct DTypeTrait { \ + MEGDNN_DEF_DT_BASIC_FIELDS(_name, _ctype, _cat, _sign, 0, false); \ + MEGDNN_HOST MEGDNN_DEVICE static ctype min() { \ + return _minval; \ + } \ + MEGDNN_HOST MEGDNN_DEVICE static ctype max() { \ + return _maxval; \ + } \ + } + +MEGDNN_DEF_DT(Float32, dt_float32, FLOAT, SIGNED, -FLT_MAX, FLT_MAX); +MEGDNN_DEF_DT(Int32, dt_int32, INT, SIGNED, INT32_MIN, INT32_MAX); +MEGDNN_DEF_DT(Int16, dt_int16, INT, SIGNED, INT16_MIN, INT16_MAX); +MEGDNN_DEF_DT(Int8, dt_int8, INT, SIGNED, INT8_MIN, INT8_MAX); +MEGDNN_DEF_DT(Uint8, dt_uint8, INT, UNSIGNED, 0, UINT8_MAX); +MEGDNN_INC_FLOAT16(MEGDNN_DEF_DT(Float16, dt_float16, FLOAT, SIGNED, + std::numeric_limits::lowest(), + std::numeric_limits::max())); + +template <> +struct DTypeTrait { + MEGDNN_DEF_DT_BASIC_FIELDS(Byte, dt_byte, OTHER, OTHER, 0, false); +}; + +#define MEGDNN_DEF_FRACTION_DT(_name, b)\ + template <> \ + struct DTypeTrait {\ + MEGDNN_DEF_DT_BASIC_FIELDS(_name##b, dt_##_name##b, LOWBIT, OTHER, b, \ + false); \ + }; +MEGDNN_FOREACH_LOWBIT_DTYPE(MEGDNN_DEF_FRACTION_DT) +#undef MEGDNN_DEF_FRACTION_DT + +#define MEGDNN_DEF_PARAMETERIZED_DT(_name, _ctype, _itype, _cat, _sign, \ + _minval, _maxval, _bits) \ + template <> \ + struct DTypeTrait { \ + MEGDNN_DEF_DT_BASIC_FIELDS(_name, _ctype, _cat, _sign, _bits, true); \ + MEGDNN_HOST MEGDNN_DEVICE static _itype min() { \ + return static_cast<_itype>(_minval); \ + } \ + MEGDNN_HOST MEGDNN_DEVICE static _itype max() { \ + return static_cast<_itype>(_maxval); \ + } \ + }; + +MEGDNN_DEF_PARAMETERIZED_DT(Quantized4Asymm, dt_quint4, uint8_t, QUANTIZED, + SIGNED, 0, 15, 4); +MEGDNN_DEF_PARAMETERIZED_DT(QuantizedS4, dt_qint4, int8_t, QUANTIZED, + SIGNED, -8, 7, 4); +MEGDNN_DEF_PARAMETERIZED_DT(Quantized8Asymm, dt_quint8, dt_quint8, QUANTIZED, + SIGNED, 0, 255, 0); +MEGDNN_DEF_PARAMETERIZED_DT(QuantizedS8, dt_qint8, dt_qint8, QUANTIZED, SIGNED, + INT8_MIN, INT8_MAX, 0); +MEGDNN_DEF_PARAMETERIZED_DT(QuantizedS16, dt_qint16, dt_qint16, QUANTIZED, + SIGNED, INT16_MIN, INT16_MAX, 0); +MEGDNN_DEF_PARAMETERIZED_DT(QuantizedS32, dt_qint32, dt_qint32, 
QUANTIZED, + SIGNED, INT32_MIN, INT32_MAX, 0); +#undef MEGDNN_DEF_PARAMETERIZED_DT + +#undef MEGDNN_DEF_DT +#undef MEGDNN_DEF_DT_BASIC_FIELDS +// end define DTypeTrait impls } + + +// alias DTypeTrait for ctypes +#define IMPL(_obj) \ +template <> \ +struct DTypeTrait::ctype>: \ +public DTypeTrait { }; + +MEGDNN_FOREACH_DTYPE_NAME(IMPL) +MEGDNN_FOREACH_PARAMETERIZED_DTYPE(IMPL) +#undef IMPL + + +template +inline void DType::assert_is_ctype() const { + return assert_is(typename DTypeTrait::dtype()); +} + +#ifdef MEGDNN_CC_HOST + +#define INST(_dt) \ + template <> \ + inline void DType::assert_is_ctype::ctype>() \ + const { \ + if (enumv() != DTypeTrait::enumv) { \ + on_assert_is_failed(DTypeTrait::name); \ + } \ + } +MEGDNN_FOREACH_PARAMETERIZED_DTYPE(INST) +#undef INST + + +template +inline void DType::assert_is_compatible_ctype() const { + if (enumv() != DTypeTrait::enumv) { + on_assert_is_failed(DTypeTrait::name); + } +} + +#define INST(_dt, _dtype) \ + template <> \ + inline void \ + DType::assert_is_compatible_ctype::ctype>() const { \ + if (enumv() != DTypeTrait::enumv && \ + enumv() != DTypeTrait::enumv) { \ + on_assert_is_failed(DTypeTrait::name); \ + } \ + } + +INST(Int8, QuantizedS8) +INST(Uint8, Quantized8Asymm) +INST(Int16, QuantizedS16) +INST(Int32, QuantizedS32) +#undef INST + +#else + +#define INST(_dt) \ + template <> \ + inline void DType::assert_is_ctype::ctype>() \ + const { \ + if (enumv().ev != DTypeTrait::enumv) { \ + on_assert_is_failed(dtype::_dt().name()); \ + } \ + } +MEGDNN_FOREACH_PARAMETERIZED_DTYPE(INST) +#undef INST + +#endif // MEGDNN_CC_HOST + + +// begin Specialization of DTypeParamImpl for each parameterzied DType { +template <> +struct DTypeParamImpl { + float scale; + uint8_t zero_point; + + DTypeParamImpl() = default; + DTypeParamImpl(float scale, uint8_t zero_point); + +#ifdef MEGDNN_CC_HOST + std::size_t hash() const; +#endif + bool operator==(const DTypeParam& rhs) const; + + MEGDNN_DEVICE dt_quint8 quantize(float in) const { + float v = in / scale; + v = roundf(v); + v = v + zero_point; + v = fmin(fmax(0.f, v), 255.f); + return static_cast(v); + } + MEGDNN_DEVICE float dequantize(dt_quint8 in) const { + return (in.as_uint8() - zero_point) * scale; + } +}; + +template <> +struct DTypeParamImpl { + float scale; + + DTypeParamImpl() = default; + DTypeParamImpl(float scale); +#ifdef MEGDNN_CC_HOST + std::size_t hash() const; +#endif + bool operator==(const DTypeParam& rhs) const; + MEGDNN_DEVICE dt_qint8 quantize(float in) const { + float v = in / scale; + //! roundf(nan) -> nan + v = roundf(v); + //! \warning As fmax(nan, a) = a, this should match the process + //! in function saturate(), otherwise may cause precision error. + v = fmin(fmax(-128.f, v), 127.f); + return static_cast(v); + } + MEGDNN_DEVICE float dequantize(dt_qint8 in) const { + return in.as_int8() * scale; + } +}; + +template <> +struct DTypeParamImpl { + float scale; + + DTypeParamImpl() = default; + DTypeParamImpl(float scale); +#ifdef MEGDNN_CC_HOST + std::size_t hash() const; +#endif // MEGDNN_CC_HOST + bool operator==(const DTypeParam& rhs) const; + MEGDNN_DEVICE dt_qint16 quantize(float in) const { + float v = in / scale; + v = roundf(v); + //! \warning As fmax(nan, a) = a, this should match the process + //! in function saturate(), otherwise may cause precision error. 
+ v = fmin(fmax(-32768.f, v), 32767.f); + return static_cast(v); + } + MEGDNN_DEVICE float dequantize(dt_qint16 in) const { + return in.as_int16() * scale; + } +}; + +template <> +struct DTypeParamImpl { + float scale; + + DTypeParamImpl() = default; + DTypeParamImpl(float scale); +#ifdef MEGDNN_CC_HOST + std::size_t hash() const; +#endif // MEGDNN_CC_HOST + bool operator==(const DTypeParam& rhs) const; + MEGDNN_DEVICE dt_qint32 quantize(float in) const { + float v = in / scale; + v = roundf(v); + /*! \note: the maximal signed integer that can be correctly represented + * as a single precision floating point number is 2147483520 + */ + v = fmin(fmax(-2147483648.f, v), 2147483520.f); + return static_cast(v); + } + MEGDNN_DEVICE float dequantize(dt_qint32 in) const { + return in.as_int32() * scale; + } +}; + +template <> +struct DTypeParamImpl { + float scale; + uint8_t zero_point; + + DTypeParamImpl() = default; + DTypeParamImpl(float scale, uint8_t zero_point); +#ifdef MEGDNN_CC_HOST + std::size_t hash() const; +#endif + bool operator==(const DTypeParam& rhs) const; + MEGDNN_DEVICE dt_quint4 quantize(float in) const { + float v = in / scale; + v = roundf(v); + v = v + zero_point; + v = fmin(fmax(0.f, v), 15.f); + return static_cast(v); + } + MEGDNN_DEVICE float dequantize(uint8_t in) const { + return (in - zero_point) * scale; + } + MEGDNN_DEVICE float dequantize(dt_quint4 in) const { + return (in.as_uint8() - zero_point) * scale; + } +}; + +template <> +struct DTypeParamImpl { + float scale; + + DTypeParamImpl() = default; + DTypeParamImpl(float scale); +#ifdef MEGDNN_CC_HOST + std::size_t hash() const; +#endif + bool operator==(const DTypeParam& rhs) const; + MEGDNN_DEVICE dt_qint4 quantize(float in) const { + float v = in / scale; + v = roundf(v); + v = fmin(fmax(-8.f, v), 7.f); + return static_cast(v); + } + MEGDNN_DEVICE float dequantize(int8_t in) const { + return in * scale; + } + MEGDNN_DEVICE float dequantize(dt_qint4 in) const { + return in.as_int8() * scale; + } +}; + +// end Specialization of DTypeParamImpl for each parameterzied DType } + +} // namespace megdnn + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/include/megdnn/dtype/half.hpp b/dnn/include/megdnn/dtype/half.hpp new file mode 100644 index 00000000..1621d7bc --- /dev/null +++ b/dnn/include/megdnn/dtype/half.hpp @@ -0,0 +1,3156 @@ +/** + * half - IEEE 754-based half-precision floating point library. + * + * Copyright (c) 2012-2013 Christian Rau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Version 1.11.0 + * \file + * Main header file for half precision functionality. + * + * -------------------------------------------------------------------------- + * \file dnn/include/megdnn/dtype/half.hpp + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * -------------------------------------------------------------------------- + */ + +#ifndef HALF_HALF_HPP +#define HALF_HALF_HPP +#include "megdnn/arch.h" +#if defined(__CUDACC__) && !defined(__HIPCC__) +#define CUDA_NO_HALF +#include +#endif +#if defined(__HIPCC__) && !defined(__CUDACC__) +#define HIP_NO_HALF +#define __CUDA_ARCH__ __HIP_DEVICE_COMPILE__ +#define __CUDACC_VER_MAJOR__ 9 +#include +#endif + +/// Combined gcc version number. +#define HALF_GNUC_VERSION (__GNUC__*100+__GNUC_MINOR__) + +//check C++11 language features +#if defined(__clang__) //clang + #if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) + #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 + #endif + #if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR) + #define HALF_ENABLE_CPP11_CONSTEXPR 1 + #endif + #if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT) + #define HALF_ENABLE_CPP11_NOEXCEPT 1 + #endif + #if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS) + #define HALF_ENABLE_CPP11_USER_LITERALS 1 + #endif + #if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG) + #define HALF_ENABLE_CPP11_LONG_LONG 1 + #endif +/*#elif defined(__INTEL_COMPILER) //Intel C++ + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) ???????? + #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) ???????? + #define HALF_ENABLE_CPP11_CONSTEXPR 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) ???????? + #define HALF_ENABLE_CPP11_NOEXCEPT 1 + #endif + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG) ???????? 
+ #define HALF_ENABLE_CPP11_LONG_LONG 1 + #endif*/ +#elif defined(__GNUC__) //gcc + #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L + #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) + #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 + #endif + #if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) + #define HALF_ENABLE_CPP11_CONSTEXPR 1 + #endif + #if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) + #define HALF_ENABLE_CPP11_NOEXCEPT 1 + #endif + #if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) + #define HALF_ENABLE_CPP11_USER_LITERALS 1 + #endif + #if !defined(HALF_ENABLE_CPP11_LONG_LONG) + #define HALF_ENABLE_CPP11_LONG_LONG 1 + #endif + #endif +#elif defined(_MSC_VER) //Visual C++ + #if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) + #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 + #endif + #if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG) + #define HALF_ENABLE_CPP11_LONG_LONG 1 + #endif + #define HALF_POP_WARNINGS 1 + #pragma warning(push) + //! 4521 and 4522 is multiple copy/assigment operator specified + #pragma warning(disable : 4099 4127 4146 4521 4522) //struct vs class, constant in if, negative unsigned +#endif + +//check C++11 library features +#include +#if defined(_LIBCPP_VERSION) //libc++ + #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 + #ifndef HALF_ENABLE_CPP11_TYPE_TRAITS + #define HALF_ENABLE_CPP11_TYPE_TRAITS 1 + #endif + #ifndef HALF_ENABLE_CPP11_CSTDINT + #define HALF_ENABLE_CPP11_CSTDINT 1 + #endif + #ifndef HALF_ENABLE_CPP11_CMATH + #define HALF_ENABLE_CPP11_CMATH 1 + #endif + #ifndef HALF_ENABLE_CPP11_HASH + #define HALF_ENABLE_CPP11_HASH 1 + #endif + #endif +#elif defined(__GLIBCXX__) //libstdc++ + #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 + #ifdef __clang__ + #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) + #define HALF_ENABLE_CPP11_TYPE_TRAITS 1 + #endif + #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT) + #define HALF_ENABLE_CPP11_CSTDINT 1 + #endif + #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH) + #define HALF_ENABLE_CPP11_CMATH 1 + #endif + #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH) + #define HALF_ENABLE_CPP11_HASH 1 + #endif + #else + #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT) + #define HALF_ENABLE_CPP11_CSTDINT 1 + #endif + #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH) + #define HALF_ENABLE_CPP11_CMATH 1 + #endif + #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH) + #define HALF_ENABLE_CPP11_HASH 1 + #endif + #endif + #endif +#elif defined(_CPPLIB_VER) //Dinkumware/Visual C++ + #if _CPPLIB_VER >= 520 + #ifndef HALF_ENABLE_CPP11_TYPE_TRAITS + #define HALF_ENABLE_CPP11_TYPE_TRAITS 1 + #endif + #ifndef HALF_ENABLE_CPP11_CSTDINT + #define HALF_ENABLE_CPP11_CSTDINT 1 + #endif + #ifndef HALF_ENABLE_CPP11_HASH + #define HALF_ENABLE_CPP11_HASH 1 + #endif + #endif + #if _CPPLIB_VER >= 610 + #ifndef HALF_ENABLE_CPP11_CMATH + #define HALF_ENABLE_CPP11_CMATH 1 + #endif + #endif +#endif +#undef HALF_GNUC_VERSION + +//support constexpr +#if HALF_ENABLE_CPP11_CONSTEXPR + #define HALF_CONSTEXPR constexpr + #define HALF_CONSTEXPR_CONST constexpr +#else + #define HALF_CONSTEXPR + #define HALF_CONSTEXPR_CONST const +#endif + +//support noexcept +#if HALF_ENABLE_CPP11_NOEXCEPT + #define HALF_NOEXCEPT noexcept + #define HALF_NOTHROW noexcept +#else + #define HALF_NOEXCEPT + #define 
HALF_NOTHROW throw() +#endif + +#include +#include +#include +#include +#include +#if HALF_ENABLE_CPP11_TYPE_TRAITS + #include +#endif +#if HALF_ENABLE_CPP11_CSTDINT + #include +#endif +#if HALF_ENABLE_CPP11_HASH + #include +#endif + + +/// Default rounding mode. +/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as well as +/// for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including half.hpp) to one +/// of the standard rounding modes using their respective constants or the equivalent values of `float_round_style`: +/// +/// `float_round_style` | value | rounding +/// ---------------------------------|-------|------------------------- +/// `round_indeterminate` | -1 | fastest (default) +/// `round_toward_zero` | 0 | toward zero +/// `round_to_nearest` | 1 | to nearest +/// `round_toward_infinity` | 2 | toward positive infinity +/// `round_toward_neg_infinity` | 3 | toward negative infinity +/// +/// By default this is set to `-1` (`round_indeterminate`), which uses truncation (round toward zero, but with overflows +/// set to infinity) and is the fastest rounding mode possible. It can even be set to `numeric_limits::round_style` +/// to synchronize the rounding mode with that of the underlying single-precision implementation. +#ifndef HALF_ROUND_STYLE + #define HALF_ROUND_STYLE 1 // = to nearest +#endif + +/// Tie-breaking behaviour for round to nearest. +/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value. By default this is +/// defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way cases (and +/// thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more IEEE-conformant +/// behaviour is needed. +#ifndef HALF_ROUND_TIES_TO_EVEN + #define HALF_ROUND_TIES_TO_EVEN 0 // ties away from zero +#endif + +/// Value signaling overflow. +/// In correspondence with `HUGE_VAL[F|L]` from `` this symbol expands to a positive value signaling the overflow of an +/// operation, in particular it just evaluates to positive infinity. +#define HUGE_VALH numeric_limits::infinity() + +/// Fast half-precision fma function. +/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate +/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all +/// arithmetic operations, this is in fact always the case. +#define FP_FAST_FMAH 1 + +#ifndef FP_ILOGB0 + #define FP_ILOGB0 INT_MIN +#endif +#ifndef FP_ILOGBNAN + #define FP_ILOGBNAN INT_MAX +#endif +#ifndef FP_SUBNORMAL + #define FP_SUBNORMAL 0 +#endif +#ifndef FP_ZERO + #define FP_ZERO 1 +#endif +#ifndef FP_NAN + #define FP_NAN 2 +#endif +#ifndef FP_INFINITE + #define FP_INFINITE 3 +#endif +#ifndef FP_NORMAL + #define FP_NORMAL 4 +#endif + + +/// Main namespace for half precision functionality. +/// This namespace contains all the functionality provided by the library. 
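/// A minimal usage sketch of the rounding configuration documented above (illustrative
/// only; it is not a build requirement of MegEngine). Both macros must be defined by the
/// includer *before* this header is pulled in, using the `std::float_round_style` values
/// from the table above:
/// \code
/// #define HALF_ROUND_STYLE 1          // float -> half conversions round to nearest
/// #define HALF_ROUND_TIES_TO_EVEN 1   // resolve halfway cases to even (IEEE behaviour)
/// #include "megdnn/dtype/half.hpp"
/// \endcode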
+namespace half_float +{ + class half; +#ifdef MEGDNN_CC_CUDA + typedef __half cuhalf; + inline MEGDNN_DEVICE cuhalf uint162cuhalf(unsigned short x) + { +#if __CUDACC_VER_MAJOR__ >= 9 + return __ushort_as_half(x); +#else + cuhalf res; + res.x = x; + return res; +#endif + } + inline MEGDNN_DEVICE unsigned short cuhalf2uint16(cuhalf x) + { +#if __CUDACC_VER_MAJOR__ >= 9 + return __half_as_ushort(x); +#else + return x.x; +#endif + } +#endif + + /// \internal + /// \brief Implementation details. + namespace detail + { + #if HALF_ENABLE_CPP11_TYPE_TRAITS + /// Conditional type. + template struct conditional : std::conditional {}; + + /// Helper for tag dispatching. + template struct bool_type : std::integral_constant {}; + using std::true_type; + using std::false_type; + + /// Type traits for floating point types. + template struct is_float : std::is_floating_point {}; + #else + /// Conditional type. + template struct conditional { typedef T type; }; + template struct conditional { typedef F type; }; + + /// Helper for tag dispatching. + template struct bool_type {}; + typedef bool_type true_type; + typedef bool_type false_type; + + /// Type traits for floating point types. + template struct is_float : false_type {}; + template struct is_float : is_float {}; + template struct is_float : is_float {}; + template struct is_float : is_float {}; + template<> struct is_float : true_type {}; + template<> struct is_float : true_type {}; + template<> struct is_float : true_type {}; + #endif + + #if HALF_ENABLE_CPP11_CSTDINT + /// Unsigned integer of (at least) 16 bits width. + typedef uint_least16_t uint16; + + /// Unsigned integer of (at least) 32 bits width. + typedef uint_least32_t uint32; + + /// Fastest signed integer capable of holding all values of type uint16. + typedef int_fast32_t int17; + #else + /// Unsigned integer of (at least) 16 bits width. + typedef unsigned short uint16; + + /// Unsigned integer of (at least) 32 bits width. + typedef conditional::digits>=32,unsigned int,unsigned long>::type uint32; + + /// Fastest signed integer capable of holding all values of type uint16. + typedef conditional::digits>=16,int,long>::type int17; + #endif + + /// Tag type for binary_t() construction. + struct binary_t {}; + + + /// Temporary half-precision expression. + /// This class represents a half-precision expression which just stores a single-precision value internally. + struct expr + { + /// Conversion constructor. + /// \param f single-precision value to convert + MEGDNN_HOST MEGDNN_DEVICE explicit HALF_CONSTEXPR expr(float f) : value_(f) {} + + /// Conversion to single-precision. + /// \return single precision value representing expression value + MEGDNN_HOST MEGDNN_DEVICE HALF_CONSTEXPR operator float() const { return value_; } + + private: + /// Internal expression value stored in single-precision. + float value_; + }; + + /// SFINAE helper for generic half-precision functions. + /// This class template has to be specialized for each valid combination of argument types to provide a corresponding + /// `type` member equivalent to \a T. 
+ /// \tparam T type to return + template struct enable {}; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + + /// Return type for specialized generic 2-argument half-precision functions. + /// This class template has to be specialized for each valid combination of argument types to provide a corresponding + /// `type` member denoting the appropriate return type. + /// \tparam T first argument type + /// \tparam U first argument type + template struct result : enable {}; + template<> struct result { typedef half type; }; + + /// \name Classification helpers + /// \{ + + /// Check for infinity. + /// \tparam T argument type (builtin floating point type) + /// \param arg value to query + /// \retval true if infinity + /// \retval false else + template MEGDNN_HOST MEGDNN_DEVICE bool builtin_isinf(T arg) + { + #if defined(__CUDA_ARCH__) + return ::isinf(arg); + #elif HALF_ENABLE_CPP11_CMATH + return ::std::isinf(arg); + #elif defined(_MSC_VER) + return !_finite(static_cast(arg)) && !_isnan(static_cast(arg)); + #else + return arg == std::numeric_limits::infinity() || arg == -std::numeric_limits::infinity(); + #endif + } + + /// Check for NaN. + /// \tparam T argument type (builtin floating point type) + /// \param arg value to query + /// \retval true if not a number + /// \retval false else + template MEGDNN_HOST MEGDNN_DEVICE bool builtin_isnan(T arg) + { + #if defined(__CUDA_ARCH__) + return ::isnan(arg); + #elif HALF_ENABLE_CPP11_CMATH + return std::isnan(arg); + #elif defined(_MSC_VER) + return _isnan(static_cast(arg)) != 0; + #else + return arg != arg; + #endif + } + + /// Check sign. + /// \tparam T argument type (builtin floating point type) + /// \param arg value to query + /// \retval true if signbit set + /// \retval false else + template MEGDNN_HOST MEGDNN_DEVICE bool builtin_signbit(T arg) + { + #if defined(__CUDA_ARCH__) + return ::signbit(arg); + #elif HALF_ENABLE_CPP11_CMATH + return std::signbit(arg); + #else + return arg < T() || (arg == T() && T(1)/arg < T()); + #endif + } + + /// \} + /// \name Conversion + /// \{ + + /// Convert IEEE single-precision to half-precision. + /// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). 
+ /// \tparam R rounding mode to use, `round_indeterminate` for fastest rounding + /// \param value single-precision value + /// \return binary_t() representation of half-precision value + template MEGDNN_HOST MEGDNN_DEVICE uint16 float2half_impl(float value, true_type) + { +#if defined(__CUDA_ARCH__) +#if __CUDACC_VER_MAJOR__ >= 9 +#if defined(__HIPCC__) && !defined(__CUDACC__) + return static_cast<__half_raw>(__float2half(value)).x; +#else + return __half_as_ushort(__float2half(value)); +#endif +#else + return __float2half(value).x; +#endif +#else + #if HALF_ENABLE_CPP11_STATIC_ASSERT + static_assert(std::numeric_limits::is_iec559, "float to half conversion needs IEEE 754 conformant 'float' type"); + static_assert(sizeof(uint32)==sizeof(float), "float to half conversion needs unsigned integer type of exactly the size of a 'float'"); + #endif + static const uint16 base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, + 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, + 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, + 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, + 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 }; + static const unsigned char shift_table[512] = { + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 }; + uint32 bits;// = *reinterpret_cast(&value); //violating strict aliasing! + memcpy(&bits, &value, sizeof(float)); + uint16 hbits = base_table[bits>>23] + static_cast((bits&0x7FFFFF)>>shift_table[bits>>23]); + if(R == std::round_to_nearest) + hbits += (((bits&0x7FFFFF)>>(shift_table[bits>>23]-1))|(((bits>>23)&0xFF)==102)) & ((hbits&0x7C00)!=0x7C00) + #if HALF_ROUND_TIES_TO_EVEN + & (((((static_cast(1)<<(shift_table[bits>>23]-1))-1)&bits)!=0)|hbits) + #endif + ; + else if(R == std::round_toward_zero) + hbits -= ((hbits&0x7FFF)==0x7C00) & ~shift_table[bits>>23]; + else if(R == std::round_toward_infinity) + hbits += ((((bits&0x7FFFFF&((static_cast(1)<<(shift_table[bits>>23]))-1))!=0)|(((bits>>23)<=102)& + ((bits>>23)!=0)))&(hbits<0x7C00)) - ((hbits==0xFC00)&((bits>>23)!=511)); + else if(R == std::round_toward_neg_infinity) + hbits += ((((bits&0x7FFFFF&((static_cast(1)<<(shift_table[bits>>23]))-1))!=0)|(((bits>>23)<=358)& + ((bits>>23)!=256)))&(hbits<0xFC00)&(hbits>>15)) - ((hbits==0x7C00)&((bits>>23)!=255)); + return hbits; +#endif + } + + /// Convert non-IEEE single-precision to half-precision. + /// \param value single-precision value + /// \return binary_t() representation of half-precision value + template MEGDNN_HOST uint16 float2half_impl(float value, false_type) + { + uint16 hbits = builtin_signbit(value) << 15; + if(value == 0.0f) + return hbits; + if(builtin_isnan(value)) + return hbits | 0x7FFF; + if(builtin_isinf(value)) + return hbits | 0x7C00; + int exp; + frexp(value, &exp); + if(exp > 16) + { + if(R == std::round_toward_zero) + return hbits | 0x7BFF; + else if(R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits>>15); + else if(R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits>>15); + return hbits | 0x7C00; + } + if(exp < -13) + value = ldexp(value, 24); + else + { + value = ldexp(value, 11-exp); + hbits |= ((exp+14)<<10); + } + int ival = static_cast(value); + hbits |= static_cast(abs(ival)&0x3FF); + if(R == std::round_to_nearest) + { + float diff = std::abs(value-static_cast(ival)); + #if HALF_ROUND_TIES_TO_EVEN + hbits += (diff>0.5f) | ((diff==0.5f)&hbits); + #else + hbits += diff >= 0.5f; + #endif + } + else if(R == std::round_toward_infinity) + hbits += value > static_cast(ival); + else if(R == std::round_toward_neg_infinity) + hbits += value < static_cast(ival); + return hbits; + } + + /// Convert single-precision to half-precision. + /// \param value single-precision value + /// \return binary_t() representation of half-precision value + template MEGDNN_HOST MEGDNN_DEVICE uint16 float2half(float value) + { +#if defined(__CUDA_ARCH__) + return float2half_impl(value, true_type()); +#else + return float2half_impl(value, bool_type::is_iec559&&sizeof(uint32)==sizeof(float)>()); +#endif + } + + /// Convert integer to half-precision floating point. 
+ /// \tparam R rounding mode to use, `round_indeterminate` for fastest rounding + /// \tparam S `true` if value negative, `false` else + /// \tparam T type to convert (builtin integer type) + /// \param value non-negative integral value + /// \return binary_t() representation of half-precision value + template MEGDNN_HOST MEGDNN_DEVICE uint16 int2half_impl(T value) + { + if(S) + value = -value; + uint16 bits = S << 15; + if(value > 65504) + { + if(R == std::round_toward_infinity) + bits |= 0x7C00 - S; + else if(R == std::round_toward_neg_infinity) + bits |= 0x7BFF + S; + else + bits |= 0x7BFF + (R!=std::round_toward_zero); + } + else if(value) + { + unsigned int m = value, exp = 25; + for(; m<0x400; m<<=1,--exp) ; + for(; m>0x7FF; m>>=1,++exp) ; + bits |= (exp<<10) | (m&0x3FF); + if(exp > 25) + { + if(R == std::round_to_nearest) + bits += (value>>(exp-26)) & 1 + #if HALF_ROUND_TIES_TO_EVEN + & (((((1<<(exp-26))-1)&value)!=0)|bits) + #endif + ; + else if(R == std::round_toward_infinity) + bits += ((value&((1<<(exp-25))-1))!=0) & !S; + else if(R == std::round_toward_neg_infinity) + bits += ((value&((1<<(exp-25))-1))!=0) & S; + } + } + return bits; + } + + /// Convert integer to half-precision floating point. + /// \tparam R rounding mode to use, `round_indeterminate` for fastest rounding + /// \tparam T type to convert (builtin integer type) + /// \param value integral value + /// \return binary_t() representation of half-precision value + template MEGDNN_HOST MEGDNN_DEVICE uint16 int2half(T value) + { + return (value<0) ? int2half_impl(value) : int2half_impl(value); + } + + /// Convert half-precision to IEEE single-precision. + /// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). + /// \param value binary_t() representation of half-precision value + /// \return single-precision value + MEGDNN_HOST MEGDNN_DEVICE inline float half2float_impl(uint16 value, true_type) + { +#if __CUDA_ARCH__ +#if __CUDACC_VER_MAJOR__ >= 9 +#if defined(__HIPCC__) && !defined(__CUDACC__) + __half_raw r; + r.x = value; + return __half2float(r); +#else + return __half2float(__ushort_as_half(value)); +#endif +#else + return __half2float(value); +#endif +#else + #if HALF_ENABLE_CPP11_STATIC_ASSERT + static_assert(std::numeric_limits::is_iec559, "half to float conversion needs IEEE 754 conformant 'float' type"); + static_assert(sizeof(uint32)==sizeof(float), "half to float conversion needs unsigned integer type of exactly the size of a 'float'"); + #endif + static const uint32 mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, + 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, + 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, + 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, + 
0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, + 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, + 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, + 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, + 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, + 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, + 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, + 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, + 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, + 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, + 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 
0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, + 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, + 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, + 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, + 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, + 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, + 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, + 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, + 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, + 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, + 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, + 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 
0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, + 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, + 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, + 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, + 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, + 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, + 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, + 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, + 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, + 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, + 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 
0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, + 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, + 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, + 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, + 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, + 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, + 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, + 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, + 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, + 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, + 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 
0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, + 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, + 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, + 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, + 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, + 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, + 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, + 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, + 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, + 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, + 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, + 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 
0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, + 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, + 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, + 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, + 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, + 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, + 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, + 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, + 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, + 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, + 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 
0x3865C000, 0x3865E000, + 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, + 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, + 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, + 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, + 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, + 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, + 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, + 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 }; + static const uint32 exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, + 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, + 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 }; + static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 
1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; + uint32 bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10]; +// uint32 bits = mantissa_table[(((value&0x7C00)!=0)<<10)+(value&0x3FF)] + exponent_table[value>>10]; +// return *reinterpret_cast(&bits); //violating strict aliasing! + float out; + memcpy(&out, &bits, sizeof(float)); + return out; +#endif + } + + /// Convert half-precision to non-IEEE single-precision. + /// \param value binary_t() representation of half-precision value + /// \return single-precision value + MEGDNN_HOST MEGDNN_DEVICE inline float half2float_impl(uint16 value, false_type) + { +#ifdef __CUDA_ARCH__ +#if __CUDACC_VER_MAJOR__ >= 9 +#if defined(__HIPCC__) && !defined(__CUDACC__) + __half_raw r; + r.x = value; + return __half2float(r); +#else + return __half2float(__ushort_as_half(value)); +#endif +#else + return __half2float(value); +#endif +#else + float out; + int abs = value & 0x7FFF; + if(abs > 0x7C00) + out = std::numeric_limits::has_quiet_NaN ? std::numeric_limits::quiet_NaN() : 0.0f; + else if(abs == 0x7C00) + out = std::numeric_limits::has_infinity ? + std::numeric_limits::infinity() : + std::numeric_limits::max(); + else if(abs > 0x3FF) + out = ldexpf(static_cast((value&0x3FF)|0x400), (abs>>10)-25); + else + out = ldexpf(static_cast(abs), -24); + return (value&0x8000) ? -out : out; +#endif + } + + /// Convert half-precision to single-precision. + /// \param value binary_t() representation of half-precision value + /// \return single-precision value + MEGDNN_HOST MEGDNN_DEVICE inline float half2float(uint16 value) + { +#ifdef __CUDA_ARCH__ + return half2float_impl(value, true_type()); +#else + return half2float_impl(value, bool_type::is_iec559&&sizeof(uint32)==sizeof(float)>()); +#endif + } + + /// Convert half-precision floating point to integer. + /// \tparam R rounding mode to use, `round_indeterminate` for fastest rounding + /// \tparam E `true` for round to even, `false` for round away from zero + /// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits) + /// \param value binary_t() representation of half-precision value + /// \return integral value + template MEGDNN_HOST MEGDNN_DEVICE T half2int_impl(uint16 value) + { +#if defined(__CUDA_ARCH__) + return T(__half2float(uint162cuhalf(value))); +#else + unsigned int e = value & 0x7FFF; + if(e >= 0x7C00) + return (value&0x8000) ? std::numeric_limits::min() : std::numeric_limits::max(); + if(e < 0x3800) + { + if(R == std::round_toward_infinity) + return T(~(value>>15)&(e!=0)); + else if(R == std::round_toward_neg_infinity) + return -T(value>0x8000); + return T(); + } + int17 m = (value&0x3FF) | 0x400; + e >>= 10; + if(e < 25) + { + if(R == std::round_indeterminate || R == std::round_toward_zero) + m >>= 25 - e; + else + { + if(R == std::round_to_nearest) + m += (1<<(24-e)) - (~(m>>(25-e))&E); + else if(R == std::round_toward_infinity) + m += ((value>>15)-1) & ((1<<(25-e))-1U); + else if(R == std::round_toward_neg_infinity) + m += -(value>>15) & ((1<<(25-e))-1U); + m >>= 25 - e; + } + } + else + m <<= e - 25; +// if(numeric_limits::digits < 16) +// return min(max(m, static_cast(numeric_limits::min())), static_cast(numeric_limits::max())); + return static_cast((value&0x8000) ? 
-m : m);
+#endif
+ }
+
+ /// Convert half-precision floating point to integer.
+ /// \tparam R rounding mode to use, `round_indeterminate` for fastest rounding
+ /// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
+ /// \param value binary_t() representation of half-precision value
+ /// \return integral value
+ template<std::float_round_style R,typename T> MEGDNN_HOST MEGDNN_DEVICE T half2int(uint16 value) { return half2int_impl<R,HALF_ROUND_TIES_TO_EVEN,T>(value); }
+
+ /// Convert half-precision floating point to integer using round-to-nearest-away-from-zero.
+ /// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
+ /// \param value binary_t() representation of half-precision value
+ /// \return integral value
+ template<typename T> MEGDNN_HOST MEGDNN_DEVICE T half2int_up(uint16 value) { return half2int_impl<std::round_to_nearest,0,T>(value); }
+
+ /// Round half-precision number to nearest integer value.
+ /// \tparam R rounding mode to use, `round_indeterminate` for fastest rounding
+ /// \tparam E `true` for round to even, `false` for round away from zero
+ /// \param value binary_t() representation of half-precision value
+ /// \return half-precision bits for nearest integral value
+ template<std::float_round_style R,bool E> MEGDNN_HOST MEGDNN_DEVICE uint16 round_half_impl(uint16 value)
+ {
+ unsigned int e = value & 0x7FFF;
+ uint16 result = value;
+ if(e < 0x3C00)
+ {
+ result &= 0x8000;
+ if(R == std::round_to_nearest)
+ result |= 0x3C00U & -(e>=(0x3800+E));
+ else if(R == std::round_toward_infinity)
+ result |= 0x3C00U & -(~(value>>15)&(e!=0));
+ else if(R == std::round_toward_neg_infinity)
+ result |= 0x3C00U & -(value>0x8000);
+ }
+ else if(e < 0x6400)
+ {
+ e = 25 - (e>>10);
+ unsigned int mask = (1<<e) - 1;
+ if(R == std::round_to_nearest)
+ result += (1<<(e-1)) - (~(result>>e)&E);
+ else if(R == std::round_toward_infinity)
+ result += mask & ((value>>15)-1);
+ else if(R == std::round_toward_neg_infinity)
+ result += mask & -(value>>15);
+ result &= ~mask;
+ }
+ return result;
+ }
+
+ /// Round half-precision number to nearest integer value.
+ /// \tparam R rounding mode to use, `round_indeterminate` for fastest rounding
+ /// \param value binary_t() representation of half-precision value
+ /// \return half-precision bits for nearest integral value
+ template<std::float_round_style R> MEGDNN_HOST MEGDNN_DEVICE uint16 round_half(uint16 value) { return round_half_impl<R,HALF_ROUND_TIES_TO_EVEN>(value); }
+
+ /// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero.
+ /// \param value binary_t() representation of half-precision value
+ /// \return half-precision bits for nearest integral value
+ MEGDNN_HOST MEGDNN_DEVICE inline uint16 round_half_up(uint16 value) { return round_half_impl<std::round_to_nearest,0>(value); }
+ /// \}
+
+ struct functions;
+ template<typename T> struct unary_specialized;
+ template<typename T,typename U> struct binary_specialized;
+ template<typename T,typename U,std::float_round_style R> struct half_caster;
+ }
+
+ /// Half-precision floating point type.
+ /// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and
+ /// conversions. It is implicitly convertible to single-precision floating point, which makes arithmetic expressions and
+ /// functions with mixed-type operands to be of the most precise operand type. Additionally all arithmetic operations
+ /// (and many mathematical functions) are carried out in single-precision internally.
All conversions from single- to + /// half-precision are done using truncation (round towards zero), but temporary results inside chained arithmetic + /// expressions are kept in single-precision as long as possible (while of course still maintaining a strong half-precision type). + /// + /// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and + /// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which + /// means it can be standard-conformantly copied using raw binary_t() copies. But in this context some more words about the + /// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not necessarily have to be of + /// exactly 16-bits size. But on any reasonable implementation the actual binary_t() representation of this type will most + /// probably not ivolve any additional "magic" or padding beyond the simple binary_t() representation of the underlying 16-bit + /// IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual size of 16 bits if + /// your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the case on + /// nearly any reasonable platform. + /// + /// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable + /// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation. + class half + { + friend struct detail::functions; + friend struct detail::unary_specialized; + friend struct detail::binary_specialized; + template friend struct detail::half_caster; + #if HALF_ENABLE_CPP11_HASH + friend struct std::hash; + #endif + + public: + /// Default constructor. + /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics + /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics. + MEGDNN_HOST MEGDNN_DEVICE half() {} + + /// Copy constructor. + /// \tparam T type of concrete half expression + /// \param rhs half expression to copy from + MEGDNN_HOST MEGDNN_DEVICE half(detail::expr rhs) : data_(detail::float2half(rhs)) {} + + MEGDNN_HOST MEGDNN_DEVICE HALF_CONSTEXPR half(const half &rhs): + data_(rhs.data_) + { } + + MEGDNN_HOST MEGDNN_DEVICE half(const volatile half &rhs): + data_(rhs.data_) + { } + + MEGDNN_HOST MEGDNN_DEVICE half &operator=(const half &rhs) { + data_ = rhs.data_; + return *this; + } + + MEGDNN_HOST MEGDNN_DEVICE half &operator=(const volatile half &rhs) { + data_ = rhs.data_; + return *this; + } + + MEGDNN_HOST MEGDNN_DEVICE volatile half &operator=(const half &rhs) volatile { + data_ = rhs.data_; + return *this; + } + + /// Conversion constructor. + /// \param rhs float to convert + MEGDNN_HOST MEGDNN_DEVICE explicit half(float rhs) : data_(detail::float2half(rhs)) {} + + /// Conversion to single-precision. + /// \return single precision value representing expression value + MEGDNN_HOST MEGDNN_DEVICE operator float() const { return detail::half2float(data_); } + + /// Assignment operator. + /// \tparam T type of concrete half expression + /// \param rhs half expression to copy from + /// \return reference to this half + MEGDNN_HOST MEGDNN_DEVICE half& operator=(detail::expr rhs) { return *this = static_cast(rhs); } + + /// Arithmetic assignment. 
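+ /// The right-hand side is first converted to single precision, the operation is
+ /// carried out in float, and the result is converted back to half, mirroring the
+ /// float overloads further below. A minimal usage sketch (illustrative values):
+ /// ~~~~{.cpp}
+ /// half h(1.5f);
+ /// h += half(0.25f); // evaluated as 1.5f + 0.25f, stored back as half -> 1.75
+ /// h *= 2.0f;        // -> 3.5
+ /// ~~~~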
+ /// \tparam T type of concrete half expression + /// \param rhs half expression to add + /// \return reference to this half + template MEGDNN_HOST MEGDNN_DEVICE typename detail::enable::type operator+=(T rhs) { return *this += static_cast(rhs); } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to subtract + /// \return reference to this half + template MEGDNN_HOST MEGDNN_DEVICE typename detail::enable::type operator-=(T rhs) { return *this -= static_cast(rhs); } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to multiply with + /// \return reference to this half + template MEGDNN_HOST MEGDNN_DEVICE typename detail::enable::type operator*=(T rhs) { return *this *= static_cast(rhs); } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to divide by + /// \return reference to this half + template MEGDNN_HOST MEGDNN_DEVICE typename detail::enable::type operator/=(T rhs) { return *this /= static_cast(rhs); } + + /// Assignment operator. + /// \param rhs single-precision value to copy from + /// \return reference to this half + MEGDNN_HOST MEGDNN_DEVICE half& operator=(float rhs) { data_ = detail::float2half(rhs); return *this; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to add + /// \return reference to this half + MEGDNN_HOST MEGDNN_DEVICE half& operator+=(float rhs) { data_ = detail::float2half(detail::half2float(data_)+rhs); return *this; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to subtract + /// \return reference to this half + MEGDNN_HOST MEGDNN_DEVICE half& operator-=(float rhs) { data_ = detail::float2half(detail::half2float(data_)-rhs); return *this; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to multiply with + /// \return reference to this half + MEGDNN_HOST MEGDNN_DEVICE half& operator*=(float rhs) { data_ = detail::float2half(detail::half2float(data_)*rhs); return *this; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to divide by + /// \return reference to this half + MEGDNN_HOST MEGDNN_DEVICE half& operator/=(float rhs) { data_ = detail::float2half(detail::half2float(data_)/rhs); return *this; } + + /// Prefix increment. + /// \return incremented half value + MEGDNN_HOST MEGDNN_DEVICE half& operator++() { return *this += 1.0f; } + + /// Prefix decrement. + /// \return decremented half value + MEGDNN_HOST MEGDNN_DEVICE half& operator--() { return *this -= 1.0f; } + + /// Postfix increment. + /// \return non-incremented half value + MEGDNN_HOST MEGDNN_DEVICE half operator++(int) { half out(*this); ++*this; return out; } + + /// Postfix decrement. + /// \return non-decremented half value + MEGDNN_HOST MEGDNN_DEVICE half operator--(int) { half out(*this); --*this; return out; } + + /// Constructor. + /// \param bits binary_t() representation to set half to + MEGDNN_HOST MEGDNN_DEVICE HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) : data_(bits) {} + + /// Rounding mode to use (always `round_indeterminate`) + static HALF_CONSTEXPR_CONST std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE); + private: + + /// Internal binary_t() representation + detail::uint16 data_; + }; + +#if HALF_ENABLE_CPP11_USER_LITERALS + /// Library-defined half-precision literals. 
+ /// Import this namespace to enable half-precision floating point literals: + /// ~~~~{.cpp} + /// using namespace half_float::literal; + /// half_float::half = 4.2_h; + /// ~~~~ + namespace literal + { + /// Half literal. + /// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due + /// to rather involved single-to-half conversion. + /// \param value literal value + /// \return half with given value (if representable) + inline half operator "" _h(long double value) { return half(static_cast(value)); } + } +#endif + + namespace detail + { + /// Wrapper implementing unspecialized half-precision functions. + struct functions + { + /// Addition implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision sum stored in single-precision + MEGDNN_HOST MEGDNN_DEVICE static expr plus(float x, float y) { return expr(x+y); } + + /// Subtraction implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision difference stored in single-precision + MEGDNN_HOST MEGDNN_DEVICE static expr minus(float x, float y) { return expr(x-y); } + + /// Multiplication implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision product stored in single-precision + MEGDNN_HOST MEGDNN_DEVICE static expr multiplies(float x, float y) { return expr(x*y); } + + /// Division implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision quotient stored in single-precision + MEGDNN_HOST MEGDNN_DEVICE static expr divides(float x, float y) { return expr(x/y); } + + /// Output implementation. + /// \param out stream to write to + /// \param arg value to write + /// \return reference to stream + template static std::basic_ostream& write(std::basic_ostream &out, float arg) { return out << arg; } + + /// Input implementation. + /// \param in stream to read from + /// \param arg half to read into + /// \return reference to stream + template static std::basic_istream& read(std::basic_istream &in, half &arg) + { + float f; + if(in >> f) + arg = f; + return in; + } + + /// Modulo implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + MEGDNN_HOST MEGDNN_DEVICE static expr fmod(float x, float y) { +#if defined(__CUDA_ARCH__) + return expr(fmodf(x, y)); +#else + return expr(std::fmod(x, y)); +#endif + } + + /// Remainder implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + MEGDNN_HOST MEGDNN_DEVICE static expr remainder(float x, float y) + { +#if defined(__CUDA_ARCH__) + return expr(remainderf(x, y)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::remainder(x, y)); +#else + if(builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + float ax = fabs(x), ay = fabs(y); + if(ax >= 65536.0f || ay < ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if(ay >= 65536.0f) + return expr(x); + if(ax == ay) + return expr(builtin_signbit(x) ? -0.0f : 0.0f); + ax = fmod(ax, ay+ay); + float y2 = 0.5f * ay; + if(ax > y2) + { + ax -= ay; + if(ax >= y2) + ax -= ay; + } + return expr(builtin_signbit(x) ? -ax : ax); +#endif + } + + /// Remainder implementation. 
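+ /// This follows IEEE remainder semantics: the result is x minus the nearest
+ /// integral multiple of y, so it lies within half of |y| and may differ in sign
+ /// from fmod. When neither CUDA nor C++11 cmath is available, the fmod-based
+ /// emulation below is used. A small worked sketch (illustrative values):
+ /// ~~~~{.cpp}
+ /// half a = fmod(half(5.0f), half(3.0f));      // 2, sign follows x
+ /// half b = remainder(half(5.0f), half(3.0f)); // -1, since 5 - 2*3 = -1
+ /// ~~~~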
+ /// \param x first operand + /// \param y second operand + /// \param quo address to store quotient bits at + /// \return Half-precision division remainder stored in single-precision + MEGDNN_HOST MEGDNN_DEVICE static expr remquo(float x, float y, int *quo) + { +#if defined(__CUDA_ARCH__) + return expr(remquof(x, y, quo)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::remquo(x, y, quo)); +#else + if(builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + bool sign = builtin_signbit(x), qsign = static_cast(sign^builtin_signbit(y)); + float ax = fabs(x), ay = fabs(y); + if(ax >= 65536.0f || ay < ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if(ay >= 65536.0f) + return expr(x); + if(ax == ay) + return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f); + ax = fmod(ax, 8.0f*ay); + int cquo = 0; + if(ax >= 4.0f * ay) + { + ax -= 4.0f * ay; + cquo += 4; + } + if(ax >= 2.0f * ay) + { + ax -= 2.0f * ay; + cquo += 2; + } + float y2 = 0.5f * ay; + if(ax > y2) + { + ax -= ay; + ++cquo; + if(ax >= y2) + { + ax -= ay; + ++cquo; + } + } + return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax); +#endif + } + + /// Positive difference implementation. + /// \param x first operand + /// \param y second operand + /// \return Positive difference stored in single-precision + MEGDNN_HOST MEGDNN_DEVICE static expr fdim(float x, float y) + { +#if defined(__CUDA_ARCH__) + return expr(fdimf(x, y)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::fdim(x, y)); +#else + return expr((x<=y) ? 0.0f : (x-y)); +#endif + } + + /// Fused multiply-add implementation. + /// \param x first operand + /// \param y second operand + /// \param z third operand + /// \return \a x * \a y + \a z stored in single-precision + MEGDNN_HOST MEGDNN_DEVICE static expr fma(float x, float y, float z) + { +#if defined(__CUDA_ARCH__) + return expr(fmaf(x, y, z)); +#elif HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF) + return expr(std::fma(x, y, z)); +#else + return expr(x*y+z); +#endif + } + + /// Get NaN. + /// \return Half-precision quiet NaN + MEGDNN_HOST MEGDNN_DEVICE static half nanh(const char*) { return half(binary_t(), 0x7FFF); } + + /// Exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr exp(float arg) { +#if defined(__CUDA_ARCH__) + return expr(expf(arg)); +#else + return expr(std::exp(arg)); +#endif + } + + /// Exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr expm1(float arg) + { +#if defined(__CUDA_ARCH__) + return expr(expm1f(arg)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::expm1(arg)); +#else + return expr(static_cast(exp(static_cast(arg))-1.0)); +#endif + } + + /// Binary exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr exp2(float arg) + { +#if defined(__CUDA_ARCH__) + return expr(exp2f(arg)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::exp2(arg)); +#else + return expr(static_cast(exp(arg*0.69314718055994530941723212145818))); +#endif + } + + /// Logarithm implementation. 
+ /// \param arg function argument
+ /// \return function value stored in single-precision
+ MEGDNN_HOST MEGDNN_DEVICE static expr log(float arg) {
+#if defined(__CUDA_ARCH__)
+ return expr(logf(arg));
+#else
+ return expr(std::log(arg));
+#endif
+ }
+
+ /// Common logarithm implementation.
+ /// \param arg function argument
+ /// \return function value stored in single-precision
+ MEGDNN_HOST MEGDNN_DEVICE static expr log10(float arg) {
+#if defined(__CUDA_ARCH__)
+ return expr(log10f(arg));
+#else
+ return expr(std::log10(arg));
+#endif
+ }
+
+ /// Logarithm implementation.
+ /// \param arg function argument
+ /// \return function value stored in single-precision
+ MEGDNN_HOST MEGDNN_DEVICE static expr log1p(float arg)
+ {
+#if defined(__CUDA_ARCH__)
+ return expr(log1pf(arg));
+#elif HALF_ENABLE_CPP11_CMATH
+ return expr(std::log1p(arg));
+#else
+ return expr(static_cast<float>(log(1.0+arg)));
+#endif
+ }
+
+ /// Binary logarithm implementation.
+ /// \param arg function argument
+ /// \return function value stored in single-precision
+ MEGDNN_HOST MEGDNN_DEVICE static expr log2(float arg)
+ {
+#if defined(__CUDA_ARCH__)
+ return expr(log2f(arg));
+#elif HALF_ENABLE_CPP11_CMATH
+ return expr(std::log2(arg));
+#else
+ return expr(static_cast<float>(log(static_cast<double>(arg))*1.4426950408889634073599246810019));
+#endif
+ }
+
+ /// Square root implementation.
+ /// \param arg function argument
+ /// \return function value stored in single-precision
+ MEGDNN_HOST MEGDNN_DEVICE static expr sqrt(float arg) {
+#if defined(__CUDA_ARCH__)
+ return expr(sqrtf(arg));
+#else
+ return expr(std::sqrt(arg));
+#endif
+ }
+
+ /// Cubic root implementation.
+ /// \param arg function argument
+ /// \return function value stored in single-precision
+ MEGDNN_HOST MEGDNN_DEVICE static expr cbrt(float arg)
+ {
+#if defined(__CUDA_ARCH__)
+ return expr(cbrtf(arg));
+#elif HALF_ENABLE_CPP11_CMATH
+ return expr(std::cbrt(arg));
+#else
+ if(builtin_isnan(arg) || builtin_isinf(arg))
+ return expr(arg);
+ return expr(builtin_signbit(arg) ? -static_cast<float>(pow(fabs(static_cast<double>(arg)), 1.0/3.0)) :
+ static_cast<float>(pow(static_cast<double>(arg), 1.0/3.0)));
+#endif
+ }
+
+ /// Hypotenuse implementation.
+ /// \param x first argument
+ /// \param y second argument
+ /// \return function value stored in single-precision
+ MEGDNN_HOST MEGDNN_DEVICE static expr hypot(float x, float y)
+ {
+#if defined(__CUDA_ARCH__)
+ return expr(hypotf(x, y));
+#elif HALF_ENABLE_CPP11_CMATH
+ return expr(std::hypot(x, y));
+#else
+ return expr((builtin_isinf(x) || builtin_isinf(y)) ? std::numeric_limits<float>::infinity() :
+ static_cast<float>(sqrt(static_cast<double>(x)*x+static_cast<double>(y)*y)));
+#endif
+ }
+
+ /// Power implementation.
+ /// \param base value to exponentiate
+ /// \param exp power to exponentiate to
+ /// \return function value stored in single-precision
+ MEGDNN_HOST MEGDNN_DEVICE static expr pow(float base, float exp) {
+#if defined(__CUDA_ARCH__)
+ return expr(powf(base, exp));
+#else
+ return expr(std::pow(base, exp));
+#endif
+ }
+
+ /// Sine implementation.
+ /// \param arg function argument
+ /// \return function value stored in single-precision
+ MEGDNN_HOST MEGDNN_DEVICE static expr sin(float arg) {
+#if defined(__CUDA_ARCH__)
+ return expr(sinf(arg));
+#else
+ return expr(std::sin(arg));
+#endif
+ }
+
+ /// Cosine implementation.
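+ /// Like the other transcendentals in this wrapper, the computation happens in
+ /// single precision (cosf on device, std::cos on host) and is returned as an
+ /// expr. A minimal usage sketch (illustrative values):
+ /// ~~~~{.cpp}
+ /// half x(0.0f);
+ /// half c = cos(x); // exactly 1 once converted back to half
+ /// ~~~~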
+ /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr cos(float arg) { +#if defined(__CUDA_ARCH__) + return expr(cosf(arg)); +#else + return expr(std::cos(arg)); +#endif + } + + /// Tan implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr tan(float arg) { +#if defined(__CUDA_ARCH__) + return expr(tanf(arg)); +#else + return expr(std::tan(arg)); +#endif + } + + /// Arc sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr asin(float arg) { +#if defined(__CUDA_ARCH__) + return expr(asinf(arg)); +#else + return expr(std::asin(arg)); +#endif + } + + /// Arc cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr acos(float arg) { +#if defined(__CUDA_ARCH__) + return expr(acosf(arg)); +#else + return expr(std::acos(arg)); +#endif + } + + /// Arc tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr atan(float arg) { +#if defined(__CUDA_ARCH__) + return expr(atanf(arg)); +#else + return expr(std::atan(arg)); +#endif + } + + /// Arc tangent implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr atan2(float x, float y) { +#if defined(__CUDA_ARCH__) + return expr(atan2f(x, y)); +#else + return expr(std::atan2(x, y)); +#endif + } + + /// Hyperbolic sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr sinh(float arg) { +#if defined(__CUDA_ARCH__) + return expr(sinhf(arg)); +#else + return expr(std::sinh(arg)); +#endif + } + + /// Hyperbolic cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr cosh(float arg) { +#if defined(__CUDA_ARCH__) + return expr(coshf(arg)); +#else + return expr(std::cosh(arg)); +#endif + } + + /// Hyperbolic tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr tanh(float arg) { +#if defined(__CUDA_ARCH__) + return expr(tanhf(arg)); +#else + return expr(std::tanh(arg)); +#endif + } + + /// Hyperbolic area sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr asinh(float arg) + { +#if defined(__CUDA_ARCH__) + return expr(asinhf(arg)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::asinh(arg)); +#else + return expr((arg==-std::numeric_limits::infinity()) ? arg : static_cast(log(arg+sqrt(arg*arg+1.0)))); +#endif + } + + /// Hyperbolic area cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr acosh(float arg) + { +#if defined(__CUDA_ARCH__) + return expr(acoshf(arg)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::acosh(arg)); +#else + return expr((arg<-1.0f) ? std::numeric_limits::quiet_NaN() : static_cast(log(arg+sqrt(arg*arg-1.0)))); +#endif + } + + /// Hyperbolic area tangent implementation. 
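+ /// Without CUDA or C++11 cmath support this falls back to the identity
+ /// atanh(x) = 0.5*log((1+x)/(1-x)) evaluated through the helpers in this struct.
+ /// As a worked example (illustrative value), atanh(0.5) = 0.5*log(3), which is
+ /// roughly 0.5493.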
+ /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr atanh(float arg) + { +#if defined(__CUDA_ARCH__) + return expr(atanhf(arg)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::atanh(arg)); +#else + return expr(static_cast(0.5*log((1.0+arg)/(1.0-arg)))); +#endif + } + + /// Error function implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr erf(float arg) + { +#if defined(__CUDA_ARCH__) + return expr(erff(arg)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::erf(arg)); +#else + return expr(static_cast(erf(static_cast(arg)))); +#endif + } + + /// Complementary implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr erfc(float arg) + { +#if defined(__CUDA_ARCH__) + return expr(erfcf(arg)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::erfc(arg)); +#else + return expr(static_cast(1.0-erf(static_cast(arg)))); +#endif + } + + /// Gamma logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr lgamma(float arg) + { +#if defined(__CUDA_ARCH__) + return expr(lgammaf(arg)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::lgamma(arg)); +#else + if(builtin_isinf(arg)) + return expr(std::numeric_limits::infinity()); + double z = static_cast(arg); + if(z < 0) + { + double i, f = ::std::modf(-z, &i); + if(f == 0.0) + return expr(std::numeric_limits::infinity()); + return expr(static_cast(1.1447298858494001741434273513531-log(abs(sin(3.1415926535897932384626433832795*f)))-lgamma(1.0-z))); + } +// if(z < 8.0) + return expr(static_cast(lgamma(static_cast(arg)))); + // return expr(static_cast(0.5*(1.8378770664093454835606594728112-log(z))+z*(log(z+1.0/(12.0*z-1.0/(10.0*z)-1.0))-1.0))); +#endif + } + + /// Gamma implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + MEGDNN_HOST MEGDNN_DEVICE static expr tgamma(float arg) + { +#if defined(__CUDA_ARCH__) + return expr(tgammaf(arg)); +#elif HALF_ENABLE_CPP11_CMATH + return expr(std::tgamma(arg)); +#else + double z = static_cast(arg); + if(z == 0.0) + return builtin_signbit(z) ? expr(-std::numeric_limits::infinity()) : expr(std::numeric_limits::infinity()); + if(z < 0.0) + { + double i, f = ::std::modf(-z, &i); + if(f == 0.0) + return expr(std::numeric_limits::quiet_NaN()); + double sign = (fmod(i, 2.0)==0.0) ? -1.0 : 1.0; + return expr(static_cast(sign*3.1415926535897932384626433832795/(sin(3.1415926535897932384626433832795*f)*exp(lgamma(1.0-z))))); + } + if(builtin_isinf(arg)) + return expr(arg); +// if(arg < 8.0f) + return expr(static_cast(exp(lgamma(z)))); + // return expr(static_cast(sqrt(6.283185307179586476925286766559/z)*pow(0.36787944117144232159552377016146*(z+1.0/(12.0*z-1.0/(10.0*z))), z))); +#endif + } + + /// Floor implementation. + /// \param arg value to round + /// \return rounded value + MEGDNN_HOST MEGDNN_DEVICE static half floor(half arg) { return half(binary_t(), round_half(arg.data_)); } + + /// Ceiling implementation. + /// \param arg value to round + /// \return rounded value + MEGDNN_HOST MEGDNN_DEVICE static half ceil(half arg) { return half(binary_t(), round_half(arg.data_)); } + + /// Truncation implementation. 
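+ /// floor, ceil and trunc reuse the round_half helpers above with the matching
+ /// rounding direction, while round uses round_half_up (halfway cases away from
+ /// zero). A small sketch, assuming the usual public floor/ceil/trunc/round
+ /// wrappers declared later in this header (illustrative values):
+ /// ~~~~{.cpp}
+ /// floor(half(2.5f));  // 2
+ /// ceil(half(2.5f));   // 3
+ /// trunc(half(-2.5f)); // -2
+ /// round(half(2.5f));  // 3
+ /// ~~~~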
+ /// \param arg value to round + /// \return rounded value + MEGDNN_HOST MEGDNN_DEVICE static half trunc(half arg) { return half(binary_t(), round_half(arg.data_)); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + MEGDNN_HOST MEGDNN_DEVICE static half round(half arg) { return half(binary_t(), round_half_up(arg.data_)); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + MEGDNN_HOST MEGDNN_DEVICE static long lround(half arg) { return detail::half2int_up(arg.data_); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + MEGDNN_HOST MEGDNN_DEVICE static half rint(half arg) { return half(binary_t(), round_half(arg.data_)); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + MEGDNN_HOST MEGDNN_DEVICE static long lrint(half arg) { return detail::half2int(arg.data_); } + + #if HALF_ENABLE_CPP11_LONG_LONG + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + MEGDNN_HOST MEGDNN_DEVICE static long long llround(half arg) { return detail::half2int_up(arg.data_); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + MEGDNN_HOST MEGDNN_DEVICE static long long llrint(half arg) { return detail::half2int(arg.data_); } + #endif + + /// Decompression implementation. + /// \param arg number to decompress + /// \param exp address to store exponent at + /// \return normalized significant + MEGDNN_HOST MEGDNN_DEVICE static half frexp(half arg, int *exp) + { + unsigned int m = arg.data_ & 0x7FFF; + if(m >= 0x7C00 || !m) + return *exp = 0, arg; + int e = m >> 10; + if(!e) + for(m<<=1; m<0x400; m<<=1,--e) ; + return *exp = e-14, half(binary_t(), static_cast((arg.data_&0x8000)|0x3800|(m&0x3FF))); + } + + /// Decompression implementation. + /// \param arg number to decompress + /// \param iptr address to store integer part at + /// \return fractional part + MEGDNN_HOST MEGDNN_DEVICE static half modf(half arg, half *iptr) + { + unsigned int e = arg.data_ & 0x7C00; + if(e > 0x6000) + return *iptr = arg, (e==0x7C00&&(arg.data_&0x3FF)) ? arg : half(binary_t(), arg.data_&0x8000); + if(e < 0x3C00) + return iptr->data_ = arg.data_ & 0x8000, arg; + e >>= 10; + unsigned int mask = (1<<(25-e)) - 1, m = arg.data_ & mask; + iptr->data_ = arg.data_ & ~mask; + if(!m) + return half(binary_t(), arg.data_&0x8000); + for(; m<0x400; m<<=1,--e) ; + return half(binary_t(), static_cast((arg.data_&0x8000)|(e<<10)|(m&0x3FF))); + } + + /// Scaling implementation. 
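+ /// Scaling works directly on the exponent bits, so multiplying by a power of two
+ /// is exact; on overflow the result saturates to the largest finite value or to
+ /// infinity depending on half::round_style. A small sketch, assuming the usual
+ /// public scalbln/ldexp wrappers declared later in this header (illustrative
+ /// values):
+ /// ~~~~{.cpp}
+ /// scalbln(half(1.5f), 4);   // 1.5 * 2^4 = 24
+ /// scalbln(half(24.0f), -4); // 1.5
+ /// ~~~~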
+ /// \param arg number to scale + /// \param exp power of two to scale by + /// \return scaled number + MEGDNN_HOST MEGDNN_DEVICE static half scalbln(half arg, long exp) + { + long e = arg.data_ & 0x7C00; + if(e == 0x7C00) + return arg; + unsigned int m = arg.data_ & 0x3FF; + if(e >>= 10) + m |= 0x400; + else + { + if(!m) + return arg; + for(m<<=1; m<0x400; m<<=1,--e) ; + } + e += exp; + uint16 value = arg.data_ & 0x8000; + if(e > 30) + { + if(half::round_style == std::round_toward_zero) + value |= 0x7BFF; + else if(half::round_style == std::round_toward_infinity) + value |= 0x7C00 - (value>>15); + else if(half::round_style == std::round_toward_neg_infinity) + value |= 0x7BFF + (value>>15); + else + value |= 0x7C00; + } + else if(e > 0) + value |= (e<<10) | (m&0x3FF); + else if(e > -11) + { + if(half::round_style == std::round_to_nearest) + { + m += 1 << -e; + #if HALF_ROUND_TIES_TO_EVEN + m -= (m>>(1-e)) & 1; + #endif + } + else if(half::round_style == std::round_toward_infinity) + m += ((value>>15)-1) & ((1<<(1-e))-1U); + else if(half::round_style == std::round_toward_neg_infinity) + m += -(value>>15) & ((1<<(1-e))-1U); + value |= m >> (1-e); + } + else if(half::round_style == std::round_toward_infinity) + value |= ((value>>15)-1) & 1; + else if(half::round_style == std::round_toward_neg_infinity) + value |= value >> 15; + return half(binary_t(), value); + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + MEGDNN_HOST MEGDNN_DEVICE static int ilogb(half arg) + { + int exp = arg.data_ & 0x7FFF; + if(!exp) + return FP_ILOGB0; + if(exp < 0x7C00) + { + if(!(exp>>=10)) + for(unsigned int m=(arg.data_&0x3FF); m<0x200; m<<=1,--exp) ; + return exp - 15; + } + if(exp > 0x7C00) + return FP_ILOGBNAN; + return INT_MAX; + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + MEGDNN_HOST MEGDNN_DEVICE static half logb(half arg) + { + int exp = arg.data_ & 0x7FFF; + if(!exp) + return half(binary_t(), 0xFC00); + if(exp < 0x7C00) + { + if(!(exp>>=10)) + for(unsigned int m=(arg.data_&0x3FF); m<0x200; m<<=1,--exp) ; + return half(static_cast(exp-15)); + } + if(exp > 0x7C00) + return arg; + return half(binary_t(), 0x7C00); + } + + /// Enumeration implementation. + /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + MEGDNN_HOST MEGDNN_DEVICE static half nextafter(half from, half to) + { + uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; + if(fabs > 0x7C00) + return from; + if(tabs > 0x7C00 || from.data_ == to.data_ || !(fabs|tabs)) + return to; + if(!fabs) + return half(binary_t(), (to.data_&0x8000)+1); + bool lt = (signbit(from) ? (static_cast(0x8000)-from.data_) : static_cast(from.data_)) < + (signbit(to) ? (static_cast(0x8000)-to.data_) : static_cast(to.data_)); + return half(binary_t(), from.data_+(((from.data_>>15)^static_cast(lt))<<1)-1); + } + + /// Sign implementation + /// \param x first operand + /// \param y second operand + /// \return composed value + MEGDNN_HOST MEGDNN_DEVICE static half copysign(half x, half y) { return half(binary_t(), x.data_^((x.data_^y.data_)&0x8000)); } + + /// Classification implementation. 
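+ /// Classification looks only at the bit pattern: an absolute value above 0x7C00
+ /// is a NaN, exactly 0x7C00 is infinity, above 0x3FF is a normal number, and
+ /// anything else is subnormal or zero. For example (illustrative values,
+ /// assuming the usual public fpclassify wrapper declared later in this header):
+ /// ~~~~{.cpp}
+ /// fpclassify(half(1.0f)); // FP_NORMAL, bits 0x3C00
+ /// fpclassify(nanh(""));   // FP_NAN, bits 0x7FFF
+ /// ~~~~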
+ /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static int fpclassify(half arg) + { + unsigned int abs = arg.data_ & 0x7FFF; + if(abs > 0x7C00) + return FP_NAN; + if(abs == 0x7C00) + return FP_INFINITE; + if(abs > 0x3FF) + return FP_NORMAL; + return abs ? FP_SUBNORMAL : FP_ZERO; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if finite number + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isfinite(half arg) { return (arg.data_&0x7C00) != 0x7C00; } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isinf(half arg) { return (arg.data_&0x7FFF) == 0x7C00; } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if not a number + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isnan(half arg) { return (arg.data_&0x7FFF) > 0x7C00; } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if normal number + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isnormal(half arg) { return ((arg.data_&0x7C00)!=0) & ((arg.data_&0x7C00)!=0x7C00); } + + /// Sign bit implementation. + /// \param arg value to check + /// \retval true if signed + /// \retval false if unsigned + MEGDNN_HOST MEGDNN_DEVICE static bool signbit(half arg) { return (arg.data_&0x8000) != 0; } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands equal + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isequal(half x, half y) { return (x.data_==y.data_ || !((x.data_|y.data_)&0x7FFF)) && !isnan(x); } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands not equal + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isnotequal(half x, half y) { return (x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF)) || isnan(x); } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x > \a y + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isgreater(half x, half y) { return !isnan(x) && !isnan(y) && ((signbit(x) ? (static_cast(0x8000)-x.data_) : + static_cast(x.data_)) > (signbit(y) ? (static_cast(0x8000)-y.data_) : static_cast(y.data_))); } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x >= \a y + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isgreaterequal(half x, half y) { return !isnan(x) && !isnan(y) && ((signbit(x) ? (static_cast(0x8000)-x.data_) : + static_cast(x.data_)) >= (signbit(y) ? (static_cast(0x8000)-y.data_) : static_cast(y.data_))); } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x < \a y + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isless(half x, half y) { return !isnan(x) && !isnan(y) && ((signbit(x) ? (static_cast(0x8000)-x.data_) : + static_cast(x.data_)) < (signbit(y) ? (static_cast(0x8000)-y.data_) : static_cast(y.data_))); } + + /// Comparison implementation. 
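+ /// All ordered comparisons below first map the sign-magnitude bit pattern to a
+ /// signed key: negative values become 0x8000 - bits, non-negative values keep
+ /// their bits. The key is monotone in the represented value and sends both +0
+ /// (0x0000) and -0 (0x8000) to 0, so the two zeros compare equal. As a worked
+ /// example (illustrative values), +1 (bits 0x3C00) maps to 15360 while -1 (bits
+ /// 0xBC00) maps to -15360.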
+ /// \param x first operand + /// \param y second operand + /// \retval true if \a x <= \a y + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool islessequal(half x, half y) { return !isnan(x) && !isnan(y) && ((signbit(x) ? (static_cast(0x8000)-x.data_) : + static_cast(x.data_)) <= (signbit(y) ? (static_cast(0x8000)-y.data_) : static_cast(y.data_))); } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true neither \a x > \a y nor \a x < \a y + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool islessgreater(half x, half y) + { + if(isnan(x) || isnan(y)) + return false; + int17 a = signbit(x) ? (static_cast(0x8000)-x.data_) : static_cast(x.data_); + int17 b = signbit(y) ? (static_cast(0x8000)-y.data_) : static_cast(y.data_); + return a < b || a > b; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operand unordered + /// \retval false else + MEGDNN_HOST MEGDNN_DEVICE static bool isunordered(half x, half y) { return isnan(x) || isnan(y); } + + private: + MEGDNN_HOST MEGDNN_DEVICE static double erf(double arg) + { + if(builtin_isinf(arg)) + return (arg<0.0) ? -1.0 : 1.0; + double x2 = static_cast(arg) * static_cast(arg), ax2 = 0.147 * x2; + //! \warning function \c exp and \c sqrt are defined in the + //! current file, the parameters of them are 'float', here use + //! static_cast may have some accuracy error, The same is the + //! function \c log used in \c lgamma. + double value = sqrt(1.0f-exp(static_cast(-x2*(1.2732395447351626861510701069801+ax2)/(1.0+ax2)))); + return builtin_signbit(arg) ? -value : value; + } + + MEGDNN_HOST MEGDNN_DEVICE static double lgamma(double arg) + { + double v = 1.0; + for(; arg<8.0; ++arg) v *= arg; + double w = 1.0 / (arg * arg); + return (((((((-0.02955065359477124183006535947712*w+0.00641025641025641025641025641026)*w+ + -0.00191752691752691752691752691753)*w+8.4175084175084175084175084175084e-4)*w+ + -5.952380952380952380952380952381e-4)*w+7.9365079365079365079365079365079e-4)*w+ + -0.00277777777777777777777777777778)*w+0.08333333333333333333333333333333)/arg + + 0.91893853320467274178032973640562 - log(static_cast(v)) - arg + (arg-0.5) * log(static_cast(arg)); + } + }; + + /// Wrapper for unary half-precision functions needing specialization for individual argument types. + /// \tparam T argument type + template struct unary_specialized + { + /// Negation implementation. + /// \param arg value to negate + /// \return negated value + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half negate(half arg) { return half(binary_t(), arg.data_^0x8000); } + + /// Absolute value implementation. + /// \param arg function argument + /// \return absolute value + MEGDNN_HOST MEGDNN_DEVICE static half fabs(half arg) { return half(binary_t(), arg.data_&0x7FFF); } + }; + template<> struct unary_specialized + { + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR expr negate(float arg) { return expr(-arg); } + MEGDNN_HOST MEGDNN_DEVICE static expr fabs(float arg) { +#if defined(__CUDA_ARCH__) + return expr(fabsf(arg)); +#else + return expr(std::fabs(arg)); +#endif + } + }; + + /// Wrapper for binary_t() half-precision functions needing specialization for individual argument types. + /// \tparam T first argument type + /// \tparam U first argument type + template struct binary_specialized + { + /// Minimum implementation. 
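+ /// If one operand is a NaN the other operand is returned, matching the C99
+ /// fmin/fmax convention; otherwise the smaller (respectively larger) value is
+ /// chosen. A minimal sketch (illustrative values):
+ /// ~~~~{.cpp}
+ /// fmin(half(1.0f), half(2.0f)); // 1
+ /// fmax(nanh(""), half(1.0f));   // 1, the non-NaN operand wins
+ /// ~~~~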
+ /// \param x first operand + /// \param y second operand + /// \return minimum value + MEGDNN_HOST MEGDNN_DEVICE static expr fmin(float x, float y) + { + #if HALF_ENABLE_CPP11_CMATH || defined(__CUDA_ARCH__) + return expr(::fmin(x, y)); + #else + if(builtin_isnan(x)) + return expr(y); + if(builtin_isnan(y)) + return expr(x); + return expr(min(x, y)); + #endif + } + + /// Maximum implementation. + /// \param x first operand + /// \param y second operand + /// \return maximum value + MEGDNN_HOST MEGDNN_DEVICE static expr fmax(float x, float y) + { + #if HALF_ENABLE_CPP11_CMATH || defined(__CUDA_ARCH__) + return expr(::fmax(x, y)); + #else + if(builtin_isnan(x)) + return expr(y); + if(builtin_isnan(y)) + return expr(x); + return expr(max(x, y)); + #endif + } + }; + template<> struct binary_specialized + { + MEGDNN_HOST MEGDNN_DEVICE static half fmin(half x, half y) + { + if(functions::isnan(x)) + return y; + if(functions::isnan(y)) + return x; + return ((functions::signbit(x) ? (static_cast(0x8000)-x.data_) : static_cast(x.data_)) > + (functions::signbit(y) ? (static_cast(0x8000)-y.data_) : static_cast(y.data_))) ? y : x; + } + MEGDNN_HOST MEGDNN_DEVICE static half fmax(half x, half y) + { + if(functions::isnan(x)) + return y; + if(functions::isnan(y)) + return x; + return ((functions::signbit(x) ? (static_cast(0x8000)-x.data_) : static_cast(x.data_)) < + (functions::signbit(y) ? (static_cast(0x8000)-y.data_) : static_cast(y.data_))) ? y : x; + } + }; + + /// Helper class for half casts. + /// This class template has to be specialized for all valid cast argument to define an appropriate static `cast` member + /// function and a corresponding `type` member denoting its return type. + /// \tparam T destination type + /// \tparam U source type + /// \tparam R rounding mode to use + template struct half_caster {}; + template struct half_caster + { + #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported"); + #endif + + typedef half type; + MEGDNN_HOST MEGDNN_DEVICE static half cast(U arg) { return cast_impl(arg, is_float()); }; + + private: + MEGDNN_HOST MEGDNN_DEVICE static half cast_impl(U arg, true_type) { return half(binary_t(), float2half(static_cast(arg))); } + MEGDNN_HOST MEGDNN_DEVICE static half cast_impl(U arg, false_type) { return half(binary_t(), int2half(arg)); } + }; + template struct half_caster + { + #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); + #endif + + typedef T type; + template MEGDNN_HOST MEGDNN_DEVICE static T cast(U arg) { return cast_impl(arg, is_float()); } + + private: + MEGDNN_HOST MEGDNN_DEVICE static T cast_impl(float arg, true_type) { return static_cast(arg); } + MEGDNN_HOST MEGDNN_DEVICE static T cast_impl(half arg, false_type) { return half2int(arg.data_); } + }; + template struct half_caster : public half_caster {}; + template struct half_caster + { + typedef half type; + MEGDNN_HOST MEGDNN_DEVICE static half cast(half arg) { return arg; } + }; + template struct half_caster : public half_caster {}; + + /// \name Comparison operators + /// \{ + + /// Comparison for equality. + /// \param x first operand + /// \param y second operand + /// \retval true if operands equal + /// \retval false else + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator==(T x, U y) { return functions::isequal(x, y); } + + /// Comparison for inequality. 
+ /// \param x first operand + /// \param y second operand + /// \retval true if operands not equal + /// \retval false else + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator!=(T x, U y) { return functions::isnotequal(x, y); } + + /// Comparison for less than. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less than \a y + /// \retval false else + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator<(T x, U y) { return functions::isless(x, y); } + + /// Comparison for greater than. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater than \a y + /// \retval false else + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator>(T x, U y) { return functions::isgreater(x, y); } + + /// Comparison for less equal. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less equal \a y + /// \retval false else + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator<=(T x, U y) { return functions::islessequal(x, y); } + + /// Comparison for greater equal. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater equal \a y + /// \retval false else + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator>=(T x, U y) { return functions::isgreaterequal(x, y); } + + /// \} + /// \name Arithmetic operators + /// \{ + + /// Add halfs. + /// \param x left operand + /// \param y right operand + /// \return sum of half expressions + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator+(T x, U y) { return functions::plus(x, y); } + + /// Subtract halfs. + /// \param x left operand + /// \param y right operand + /// \return difference of half expressions + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator-(T x, U y) { return functions::minus(x, y); } + + /// Multiply halfs. + /// \param x left operand + /// \param y right operand + /// \return product of half expressions + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator*(T x, U y) { return functions::multiplies(x, y); } + + /// Divide halfs. + /// \param x left operand + /// \param y right operand + /// \return quotient of half expressions + template MEGDNN_HOST MEGDNN_DEVICE typename enable::type operator/(T x, U y) { return functions::divides(x, y); } + + /// Identity. + /// \param arg operand + /// \return uncahnged operand + template MEGDNN_HOST MEGDNN_DEVICE HALF_CONSTEXPR typename enable::type operator+(T arg) { return arg; } + + /// Negation. + /// \param arg operand + /// \return negated operand + template MEGDNN_HOST MEGDNN_DEVICE HALF_CONSTEXPR typename enable::type operator-(T arg) { return unary_specialized::negate(arg); } + + /// \} + /// \name Input and output + /// \{ + + /// Output operator. + /// \param out output stream to write into + /// \param arg half expression to write + /// \return reference to output stream + template typename enable&,T>::type + operator<<(std::basic_ostream &out, T arg) { return functions::write(out, arg); } + + /// Input operator. + /// \param in input stream to read from + /// \param arg half to read into + /// \return reference to input stream + template std::basic_istream& + operator>>(std::basic_istream &in, half &arg) { return functions::read(in, arg); } + + /// \} + /// \name Basic mathematical operations + /// \{ + + /// Absolute value. 
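+ /// abs/fabs simply clear the sign bit and unary minus flips it, so both are
+ /// exact and work for infinities and NaNs as well. A minimal sketch
+ /// (illustrative values):
+ /// ~~~~{.cpp}
+ /// half x(-1.5f);    // bits 0xBE00
+ /// half a = fabs(x); // bits 0x3E00, value 1.5
+ /// half n = -a;      // bits 0xBE00 again
+ /// ~~~~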
+ /// \param arg operand + /// \return absolute value of \a arg +// template typename enable::type abs(T arg) { return unary_specialized::fabs(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half abs(half arg) { return unary_specialized::fabs(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr abs(expr arg) { return unary_specialized::fabs(arg); } + + /// Absolute value. + /// \param arg operand + /// \return absolute value of \a arg +// template typename enable::type fabs(T arg) { return unary_specialized::fabs(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half fabs(half arg) { return unary_specialized::fabs(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fabs(expr arg) { return unary_specialized::fabs(arg); } + + /// Remainder of division. + /// \param x first operand + /// \param y second operand + /// \return remainder of floating point division. +// template typename enable::type fmod(T x, U y) { return functions::fmod(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmod(half x, half y) { return functions::fmod(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmod(half x, expr y) { return functions::fmod(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmod(expr x, half y) { return functions::fmod(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmod(expr x, expr y) { return functions::fmod(x, y); } + + /// Remainder of division. + /// \param x first operand + /// \param y second operand + /// \return remainder of floating point division. +// template typename enable::type remainder(T x, U y) { return functions::remainder(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr remainder(half x, half y) { return functions::remainder(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr remainder(half x, expr y) { return functions::remainder(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr remainder(expr x, half y) { return functions::remainder(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr remainder(expr x, expr y) { return functions::remainder(x, y); } + + /// Remainder of division. + /// \param x first operand + /// \param y second operand + /// \param quo address to store some bits of quotient at + /// \return remainder of floating point division. +// template typename enable::type remquo(T x, U y, int *quo) { return functions::remquo(x, y, quo); } + MEGDNN_HOST MEGDNN_DEVICE inline expr remquo(half x, half y, int *quo) { return functions::remquo(x, y, quo); } + MEGDNN_HOST MEGDNN_DEVICE inline expr remquo(half x, expr y, int *quo) { return functions::remquo(x, y, quo); } + MEGDNN_HOST MEGDNN_DEVICE inline expr remquo(expr x, half y, int *quo) { return functions::remquo(x, y, quo); } + MEGDNN_HOST MEGDNN_DEVICE inline expr remquo(expr x, expr y, int *quo) { return functions::remquo(x, y, quo); } + + /// Fused multiply add. + /// \param x first operand + /// \param y second operand + /// \param z third operand + /// \return ( \a x * \a y ) + \a z rounded as one operation. 
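+ /// Note that unless a native fused multiply-add is available this is evaluated
+ /// as the single-precision expression x*y + z rather than a single fused
+ /// operation. A minimal sketch (illustrative values):
+ /// ~~~~{.cpp}
+ /// half r = fma(half(2.0f), half(3.0f), half(1.0f)); // 7
+ /// ~~~~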
+// template typename enable::type fma(T x, U y, V z) { return functions::fma(x, y, z); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fma(half x, half y, half z) { return functions::fma(x, y, z); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fma(half x, half y, expr z) { return functions::fma(x, y, z); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fma(half x, expr y, half z) { return functions::fma(x, y, z); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fma(half x, expr y, expr z) { return functions::fma(x, y, z); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fma(expr x, half y, half z) { return functions::fma(x, y, z); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fma(expr x, half y, expr z) { return functions::fma(x, y, z); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fma(expr x, expr y, half z) { return functions::fma(x, y, z); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fma(expr x, expr y, expr z) { return functions::fma(x, y, z); } + + /// Maximum of half expressions. + /// \param x first operand + /// \param y second operand + /// \return maximum of operands +// template typename result::type fmax(T x, U y) { return binary_specialized::fmax(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline half fmax(half x, half y) { return binary_specialized::fmax(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmax(half x, expr y) { return binary_specialized::fmax(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmax(expr x, half y) { return binary_specialized::fmax(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmax(expr x, expr y) { return binary_specialized::fmax(x, y); } + + /// Minimum of half expressions. + /// \param x first operand + /// \param y second operand + /// \return minimum of operands +// template typename result::type fmin(T x, U y) { return binary_specialized::fmin(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline half fmin(half x, half y) { return binary_specialized::fmin(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmin(half x, expr y) { return binary_specialized::fmin(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmin(expr x, half y) { return binary_specialized::fmin(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fmin(expr x, expr y) { return binary_specialized::fmin(x, y); } + + /// Positive difference. + /// \param x first operand + /// \param y second operand + /// \return \a x - \a y or 0 if difference negative +// template typename enable::type fdim(T x, U y) { return functions::fdim(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fdim(half x, half y) { return functions::fdim(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fdim(half x, expr y) { return functions::fdim(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fdim(expr x, half y) { return functions::fdim(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr fdim(expr x, expr y) { return functions::fdim(x, y); } + + /// Get NaN value. + /// \param arg descriptive string (ignored) + /// \return quiet NaN + MEGDNN_HOST MEGDNN_DEVICE inline half nanh(const char *arg) { return functions::nanh(arg); } + + /// \} + /// \name Exponential functions + /// \{ + + /// Exponential function. + /// \param arg function argument + /// \return e raised to \a arg +// template typename enable::type exp(T arg) { return functions::exp(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr exp(half arg) { return functions::exp(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr exp(expr arg) { return functions::exp(arg); } + + /// Exponential minus one. 
+ /// \param arg function argument
+ /// \return e raised to \a arg subtracted by 1
+// template typename enable::type expm1(T arg) { return functions::expm1(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr expm1(half arg) { return functions::expm1(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr expm1(expr arg) { return functions::expm1(arg); }
+
+ /// Binary exponential.
+ /// \param arg function argument
+ /// \return 2 raised to \a arg
+// template typename enable::type exp2(T arg) { return functions::exp2(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr exp2(half arg) { return functions::exp2(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr exp2(expr arg) { return functions::exp2(arg); }
+
+ /// Natural logarithm.
+ /// \param arg function argument
+ /// \return logarithm of \a arg to base e
+// template typename enable::type log(T arg) { return functions::log(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr log(half arg) { return functions::log(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr log(expr arg) { return functions::log(arg); }
+
+ /// Common logarithm.
+ /// \param arg function argument
+ /// \return logarithm of \a arg to base 10
+// template typename enable::type log10(T arg) { return functions::log10(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr log10(half arg) { return functions::log10(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr log10(expr arg) { return functions::log10(arg); }
+
+ /// Natural logarithm.
+ /// \param arg function argument
+ /// \return logarithm of \a arg plus 1 to base e
+// template typename enable::type log1p(T arg) { return functions::log1p(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr log1p(half arg) { return functions::log1p(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr log1p(expr arg) { return functions::log1p(arg); }
+
+ /// Binary logarithm.
+ /// \param arg function argument
+ /// \return logarithm of \a arg to base 2
+// template typename enable::type log2(T arg) { return functions::log2(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr log2(half arg) { return functions::log2(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr log2(expr arg) { return functions::log2(arg); }
+
+ /// \}
+ /// \name Power functions
+ /// \{
+
+ /// Square root.
+ /// \param arg function argument
+ /// \return square root of \a arg
+// template typename enable::type sqrt(T arg) { return functions::sqrt(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr sqrt(half arg) { return functions::sqrt(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr sqrt(expr arg) { return functions::sqrt(arg); }
+
+ /// Cubic root.
+ /// \param arg function argument
+ /// \return cubic root of \a arg
+// template typename enable::type cbrt(T arg) { return functions::cbrt(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr cbrt(half arg) { return functions::cbrt(arg); }
+ MEGDNN_HOST MEGDNN_DEVICE inline expr cbrt(expr arg) { return functions::cbrt(arg); }
+
+ /// Hypotenuse function.
+ /// \param x first argument + /// \param y second argument + /// \return square root of sum of squares without internal over- or underflows +// template typename enable::type hypot(T x, U y) { return functions::hypot(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr hypot(half x, half y) { return functions::hypot(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr hypot(half x, expr y) { return functions::hypot(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr hypot(expr x, half y) { return functions::hypot(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr hypot(expr x, expr y) { return functions::hypot(x, y); } + + /// Power function. + /// \param base first argument + /// \param exp second argument + /// \return \a base raised to \a exp +// template typename enable::type pow(T base, U exp) { return functions::pow(base, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline expr pow(half base, half exp) { return functions::pow(base, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline expr pow(half base, expr exp) { return functions::pow(base, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline expr pow(expr base, half exp) { return functions::pow(base, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline expr pow(expr base, expr exp) { return functions::pow(base, exp); } + + /// \} + /// \name Trigonometric functions + /// \{ + + /// Sine function. + /// \param arg function argument + /// \return sine value of \a arg +// template typename enable::type sin(T arg) { return functions::sin(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr sin(half arg) { return functions::sin(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr sin(expr arg) { return functions::sin(arg); } + + /// Cosine function. + /// \param arg function argument + /// \return cosine value of \a arg +// template typename enable::type cos(T arg) { return functions::cos(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr cos(half arg) { return functions::cos(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr cos(expr arg) { return functions::cos(arg); } + + /// Tangent function. + /// \param arg function argument + /// \return tangent value of \a arg +// template typename enable::type tan(T arg) { return functions::tan(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr tan(half arg) { return functions::tan(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr tan(expr arg) { return functions::tan(arg); } + + /// Arc sine. + /// \param arg function argument + /// \return arc sine value of \a arg +// template typename enable::type asin(T arg) { return functions::asin(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr asin(half arg) { return functions::asin(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr asin(expr arg) { return functions::asin(arg); } + + /// Arc cosine function. + /// \param arg function argument + /// \return arc cosine value of \a arg +// template typename enable::type acos(T arg) { return functions::acos(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr acos(half arg) { return functions::acos(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr acos(expr arg) { return functions::acos(arg); } + + /// Arc tangent function. + /// \param arg function argument + /// \return arc tangent value of \a arg +// template typename enable::type atan(T arg) { return functions::atan(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr atan(half arg) { return functions::atan(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr atan(expr arg) { return functions::atan(arg); } + + /// Arc tangent function. 
+ /// \param x first argument + /// \param y second argument + /// \return arc tangent value +// template typename enable::type atan2(T x, U y) { return functions::atan2(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr atan2(half x, half y) { return functions::atan2(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr atan2(half x, expr y) { return functions::atan2(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr atan2(expr x, half y) { return functions::atan2(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline expr atan2(expr x, expr y) { return functions::atan2(x, y); } + + /// \} + /// \name Hyperbolic functions + /// \{ + + /// Hyperbolic sine. + /// \param arg function argument + /// \return hyperbolic sine value of \a arg +// template typename enable::type sinh(T arg) { return functions::sinh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr sinh(half arg) { return functions::sinh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr sinh(expr arg) { return functions::sinh(arg); } + + /// Hyperbolic cosine. + /// \param arg function argument + /// \return hyperbolic cosine value of \a arg +// template typename enable::type cosh(T arg) { return functions::cosh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr cosh(half arg) { return functions::cosh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr cosh(expr arg) { return functions::cosh(arg); } + + /// Hyperbolic tangent. + /// \param arg function argument + /// \return hyperbolic tangent value of \a arg +// template typename enable::type tanh(T arg) { return functions::tanh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr tanh(half arg) { return functions::tanh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr tanh(expr arg) { return functions::tanh(arg); } + + /// Hyperbolic area sine. + /// \param arg function argument + /// \return area sine value of \a arg +// template typename enable::type asinh(T arg) { return functions::asinh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr asinh(half arg) { return functions::asinh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr asinh(expr arg) { return functions::asinh(arg); } + + /// Hyperbolic area cosine. + /// \param arg function argument + /// \return area cosine value of \a arg +// template typename enable::type acosh(T arg) { return functions::acosh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr acosh(half arg) { return functions::acosh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr acosh(expr arg) { return functions::acosh(arg); } + + /// Hyperbolic area tangent. + /// \param arg function argument + /// \return area tangent value of \a arg +// template typename enable::type atanh(T arg) { return functions::atanh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr atanh(half arg) { return functions::atanh(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr atanh(expr arg) { return functions::atanh(arg); } + + /// \} + /// \name Error and gamma functions + /// \{ + + /// Error function. + /// \param arg function argument + /// \return error function value of \a arg +// template typename enable::type erf(T arg) { return functions::erf(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr erf(half arg) { return functions::erf(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr erf(expr arg) { return functions::erf(arg); } + + /// Complementary error function. 
+ /// \param arg function argument + /// \return 1 minus error function value of \a arg +// template typename enable::type erfc(T arg) { return functions::erfc(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr erfc(half arg) { return functions::erfc(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr erfc(expr arg) { return functions::erfc(arg); } + + /// Natural logarithm of gamma function. + /// \param arg function argument + /// \return natural logarith of gamma function for \a arg +// template typename enable::type lgamma(T arg) { return functions::lgamma(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr lgamma(half arg) { return functions::lgamma(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr lgamma(expr arg) { return functions::lgamma(arg); } + + /// Gamma function. + /// \param arg function argument + /// \return gamma function value of \a arg +// template typename enable::type tgamma(T arg) { return functions::tgamma(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr tgamma(half arg) { return functions::tgamma(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline expr tgamma(expr arg) { return functions::tgamma(arg); } + + /// \} + /// \name Rounding + /// \{ + + /// Nearest integer not less than half value. + /// \param arg half to round + /// \return nearest integer not less than \a arg +// template typename enable::type ceil(T arg) { return functions::ceil(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half ceil(half arg) { return functions::ceil(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half ceil(expr arg) { return functions::ceil(arg); } + + /// Nearest integer not greater than half value. + /// \param arg half to round + /// \return nearest integer not greater than \a arg +// template typename enable::type floor(T arg) { return functions::floor(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half floor(half arg) { return functions::floor(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half floor(expr arg) { return functions::floor(arg); } + + /// Nearest integer not greater in magnitude than half value. + /// \param arg half to round + /// \return nearest integer not greater in magnitude than \a arg +// template typename enable::type trunc(T arg) { return functions::trunc(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half trunc(half arg) { return functions::trunc(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half trunc(expr arg) { return functions::trunc(arg); } + + /// Nearest integer. + /// \param arg half to round + /// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type round(T arg) { return functions::round(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half round(half arg) { return functions::round(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half round(expr arg) { return functions::round(arg); } + + /// Nearest integer. + /// \param arg half to round + /// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type lround(T arg) { return functions::lround(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline long lround(half arg) { return functions::lround(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline long lround(expr arg) { return functions::lround(arg); } + + /// Nearest integer using half's internal rounding mode. 
+ /// \param arg half expression to round + /// \return nearest integer using default rounding mode +// template typename enable::type nearbyint(T arg) { return functions::nearbyint(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half nearbyint(half arg) { return functions::rint(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half nearbyint(expr arg) { return functions::rint(arg); } + + /// Nearest integer using half's internal rounding mode. + /// \param arg half expression to round + /// \return nearest integer using default rounding mode +// template typename enable::type rint(T arg) { return functions::rint(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half rint(half arg) { return functions::rint(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half rint(expr arg) { return functions::rint(arg); } + + /// Nearest integer using half's internal rounding mode. + /// \param arg half expression to round + /// \return nearest integer using default rounding mode +// template typename enable::type lrint(T arg) { return functions::lrint(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline long lrint(half arg) { return functions::lrint(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline long lrint(expr arg) { return functions::lrint(arg); } + #if HALF_ENABLE_CPP11_LONG_LONG + /// Nearest integer. + /// \param arg half to round + /// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type llround(T arg) { return functions::llround(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline long long llround(half arg) { return functions::llround(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline long long llround(expr arg) { return functions::llround(arg); } + + /// Nearest integer using half's internal rounding mode. + /// \param arg half expression to round + /// \return nearest integer using default rounding mode +// template typename enable::type llrint(T arg) { return functions::llrint(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline long long llrint(half arg) { return functions::llrint(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline long long llrint(expr arg) { return functions::llrint(arg); } + #endif + + /// \} + /// \name Floating point manipulation + /// \{ + + /// Decompress floating point number. + /// \param arg number to decompress + /// \param exp address to store exponent at + /// \return significant in range [0.5, 1) +// template typename enable::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline half frexp(half arg, int *exp) { return functions::frexp(arg, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline half frexp(expr arg, int *exp) { return functions::frexp(arg, exp); } + + /// Multiply by power of two. + /// \param arg number to modify + /// \param exp power of two to multiply with + /// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline half ldexp(half arg, int exp) { return functions::scalbln(arg, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline half ldexp(expr arg, int exp) { return functions::scalbln(arg, exp); } + + /// Extract integer and fractional parts. 
+ /// \param arg number to decompress + /// \param iptr address to store integer part at + /// \return fractional part +// template typename enable::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); } + MEGDNN_HOST MEGDNN_DEVICE inline half modf(half arg, half *iptr) { return functions::modf(arg, iptr); } + MEGDNN_HOST MEGDNN_DEVICE inline half modf(expr arg, half *iptr) { return functions::modf(arg, iptr); } + + /// Multiply by power of two. + /// \param arg number to modify + /// \param exp power of two to multiply with + /// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline half scalbn(half arg, int exp) { return functions::scalbln(arg, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline half scalbn(expr arg, int exp) { return functions::scalbln(arg, exp); } + + /// Multiply by power of two. + /// \param arg number to modify + /// \param exp power of two to multiply with + /// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbln(T arg, long exp) { return functions::scalbln(arg, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline half scalbln(half arg, long exp) { return functions::scalbln(arg, exp); } + MEGDNN_HOST MEGDNN_DEVICE inline half scalbln(expr arg, long exp) { return functions::scalbln(arg, exp); } + + /// Extract exponent. + /// \param arg number to query + /// \return floating point exponent + /// \retval FP_ILOGB0 for zero + /// \retval FP_ILOGBNAN for NaN + /// \retval MAX_INT for infinity +// template typename enable::type ilogb(T arg) { return functions::ilogb(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline int ilogb(half arg) { return functions::ilogb(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline int ilogb(expr arg) { return functions::ilogb(arg); } + + /// Extract exponent. + /// \param arg number to query + /// \return floating point exponent +// template typename enable::type logb(T arg) { return functions::logb(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half logb(half arg) { return functions::logb(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline half logb(expr arg) { return functions::logb(arg); } + + /// Next representable value. + /// \param from value to compute next representable value for + /// \param to direction towards which to compute next value + /// \return next representable value after \a from in direction towards \a to +// template typename enable::type nextafter(T from, U to) { return functions::nextafter(from, to); } + MEGDNN_HOST MEGDNN_DEVICE inline half nextafter(half from, half to) { return functions::nextafter(from, to); } + MEGDNN_HOST MEGDNN_DEVICE inline half nextafter(half from, expr to) { return functions::nextafter(from, to); } + MEGDNN_HOST MEGDNN_DEVICE inline half nextafter(expr from, half to) { return functions::nextafter(from, to); } + MEGDNN_HOST MEGDNN_DEVICE inline half nextafter(expr from, expr to) { return functions::nextafter(from, to); } + + /// Take sign. 
+ /// \param x value to change sign for + /// \param y value to take sign from + /// \return value equal to \a x in magnitude and to \a y in sign +// template typename enable::type copysign(T x, U y) { return functions::copysign(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline half copysign(half x, half y) { return functions::copysign(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline half copysign(half x, expr y) { return functions::copysign(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline half copysign(expr x, half y) { return functions::copysign(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline half copysign(expr x, expr y) { return functions::copysign(x, y); } + + /// \} + /// \name Floating point classification + /// \{ + + + /// Classify floating point value. + /// \param arg number to classify + /// \retval FP_ZERO for positive and negative zero + /// \retval FP_SUBNORMAL for subnormal numbers + /// \retval FP_INFINITY for positive and negative infinity + /// \retval FP_NAN for NaNs + /// \retval FP_NORMAL for all other (normal) values +// template typename enable::type fpclassify(T arg) { return functions::fpclassify(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline int fpclassify(half arg) { return functions::fpclassify(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline int fpclassify(expr arg) { return functions::fpclassify(arg); } + + /// Check if finite number. + /// \param arg number to check + /// \retval true if neither infinity nor NaN + /// \retval false else +// template typename enable::type isfinite(T arg) { return functions::isfinite(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isfinite(half arg) { return functions::isfinite(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isfinite(expr arg) { return functions::isfinite(arg); } + + /// Check for infinity. + /// \param arg number to check + /// \retval true for positive or negative infinity + /// \retval false else +// template typename enable::type isinf(T arg) { return functions::isinf(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isinf(half arg) { return functions::isinf(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isinf(expr arg) { return functions::isinf(arg); } + + /// Check for NaN. + /// \param arg number to check + /// \retval true for NaNs + /// \retval false else +// template typename enable::type isnan(T arg) { return functions::isnan(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isnan(half arg) { return functions::isnan(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isnan(expr arg) { return functions::isnan(arg); } + + /// Check if normal number. + /// \param arg number to check + /// \retval true if normal number + /// \retval false if either subnormal, zero, infinity or NaN +// template typename enable::type isnormal(T arg) { return functions::isnormal(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isnormal(half arg) { return functions::isnormal(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isnormal(expr arg) { return functions::isnormal(arg); } + + /// Check sign. + /// \param arg number to check + /// \retval true for negative number + /// \retval false for positive number +// template typename enable::type signbit(T arg) { return functions::signbit(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool signbit(half arg) { return functions::signbit(arg); } + MEGDNN_HOST MEGDNN_DEVICE inline bool signbit(expr arg) { return functions::signbit(arg); } + + /// \} + /// \name Comparison + /// \{ + + /// Comparison for greater than. 
+ /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater than \a y + /// \retval false else +// template typename enable::type isgreater(T x, U y) { return functions::isgreater(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isgreater(half x, half y) { return functions::isgreater(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isgreater(half x, expr y) { return functions::isgreater(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isgreater(expr x, half y) { return functions::isgreater(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isgreater(expr x, expr y) { return functions::isgreater(x, y); } + + /// Comparison for greater equal. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater equal \a y + /// \retval false else +// template typename enable::type isgreaterequal(T x, U y) { return functions::isgreaterequal(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isgreaterequal(half x, half y) { return functions::isgreaterequal(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isgreaterequal(half x, expr y) { return functions::isgreaterequal(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isgreaterequal(expr x, half y) { return functions::isgreaterequal(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isgreaterequal(expr x, expr y) { return functions::isgreaterequal(x, y); } + + /// Comparison for less than. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less than \a y + /// \retval false else +// template typename enable::type isless(T x, U y) { return functions::isless(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isless(half x, half y) { return functions::isless(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isless(half x, expr y) { return functions::isless(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isless(expr x, half y) { return functions::isless(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isless(expr x, expr y) { return functions::isless(x, y); } + + /// Comparison for less equal. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less equal \a y + /// \retval false else +// template typename enable::type islessequal(T x, U y) { return functions::islessequal(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool islessequal(half x, half y) { return functions::islessequal(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool islessequal(half x, expr y) { return functions::islessequal(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool islessequal(expr x, half y) { return functions::islessequal(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool islessequal(expr x, expr y) { return functions::islessequal(x, y); } + + /// Comarison for less or greater. + /// \param x first operand + /// \param y second operand + /// \retval true if either less or greater + /// \retval false else +// template typename enable::type islessgreater(T x, U y) { return functions::islessgreater(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool islessgreater(half x, half y) { return functions::islessgreater(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool islessgreater(half x, expr y) { return functions::islessgreater(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool islessgreater(expr x, half y) { return functions::islessgreater(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool islessgreater(expr x, expr y) { return functions::islessgreater(x, y); } + + /// Check if unordered. 
+ /// \param x first operand + /// \param y second operand + /// \retval true if unordered (one or two NaN operands) + /// \retval false else +// template typename enable::type isunordered(T x, U y) { return functions::isunordered(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isunordered(half x, half y) { return functions::isunordered(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isunordered(half x, expr y) { return functions::isunordered(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isunordered(expr x, half y) { return functions::isunordered(x, y); } + MEGDNN_HOST MEGDNN_DEVICE inline bool isunordered(expr x, expr y) { return functions::isunordered(x, y); } + + /// \name Casting + /// \{ + + /// Cast to or from half-precision floating point number. + /// This casts between [half](\ref half_float::half) and any built-in arithmetic type. Floating point types are + /// converted via an explicit cast to/from `float` (using the rounding mode of the built-in single precision + /// implementation) and thus any possible warnings due to an otherwise implicit conversion to/from `float` will be + /// suppressed. Integer types are converted directly using the given rounding mode, without any roundtrip over `float` + /// that a `static_cast` would otherwise do. It uses the default rounding mode. + /// + /// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types + /// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler + /// error and casting between [half](\ref half_float::half)s is just a no-op. + /// \tparam T destination type (half or built-in arithmetic type) + /// \tparam U source type (half or built-in arithmetic type) + /// \param arg value to cast + /// \return \a arg converted to destination type + template MEGDNN_HOST MEGDNN_DEVICE typename half_caster::type half_cast(U arg) { return half_caster::cast(arg); } + + /// Cast to or from half-precision floating point number. + /// This casts between [half](\ref half_float::half) and any built-in arithmetic type. Floating point types are + /// converted via an explicit cast to/from `float` (using the rounding mode of the built-in single precision + /// implementation) and thus any possible warnings due to an otherwise implicit conversion to/from `float` will be + /// suppressed. Integer types are converted directly using the given rounding mode, without any roundtrip over `float` + /// that a `static_cast` would otherwise do. + /// + /// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types + /// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler + /// error and casting between [half](\ref half_float::half)s is just a no-op. + /// \tparam T destination type (half or built-in arithmetic type) + /// \tparam R rounding mode to use. 
+ /// \tparam U source type (half or built-in arithmetic type) + /// \param arg value to cast + /// \return \a arg converted to destination type + template MEGDNN_HOST MEGDNN_DEVICE typename half_caster::type half_cast(U arg) + { return half_caster::cast(arg); } + /// \} + } + + using detail::operator==; + using detail::operator!=; + using detail::operator<; + using detail::operator>; + using detail::operator<=; + using detail::operator>=; + using detail::operator+; + using detail::operator-; + using detail::operator*; + using detail::operator/; + using detail::operator<<; + using detail::operator>>; + + using detail::abs; + using detail::fabs; + using detail::fmod; + using detail::remainder; + using detail::remquo; + using detail::fma; + using detail::fmax; + using detail::fmin; + using detail::fdim; + using detail::nanh; + using detail::exp; + using detail::expm1; + using detail::exp2; + using detail::log; + using detail::log10; + using detail::log1p; + using detail::log2; + using detail::sqrt; + using detail::cbrt; + using detail::hypot; + using detail::pow; + using detail::sin; + using detail::cos; + using detail::tan; + using detail::asin; + using detail::acos; + using detail::atan; + using detail::atan2; + using detail::sinh; + using detail::cosh; + using detail::tanh; + using detail::asinh; + using detail::acosh; + using detail::atanh; + using detail::erf; + using detail::erfc; + using detail::lgamma; + using detail::tgamma; + using detail::ceil; + using detail::floor; + using detail::trunc; + using detail::round; + using detail::lround; + using detail::nearbyint; + using detail::rint; + using detail::lrint; +#if HALF_ENABLE_CPP11_LONG_LONG + using detail::llround; + using detail::llrint; +#endif + using detail::frexp; + using detail::ldexp; + using detail::modf; + using detail::scalbn; + using detail::scalbln; + using detail::ilogb; + using detail::logb; + using detail::nextafter; + using detail::copysign; + using detail::fpclassify; + using detail::isfinite; + using detail::isinf; + using detail::isnan; + using detail::isnormal; + using detail::signbit; + using detail::isgreater; + using detail::isgreaterequal; + using detail::isless; + using detail::islessequal; + using detail::islessgreater; + using detail::isunordered; + + using detail::half_cast; +} + +/// Extensions to the C++ standard library. +namespace std +{ + /// Numeric limits for half-precision floats. + /// Because of the underlying single-precision implementation of many operations, it inherits some properties from + /// `numeric_limits`. + template<> class numeric_limits : public numeric_limits + { + public: + /// Supports signed values. + static HALF_CONSTEXPR_CONST bool is_signed = true; + + /// Is not exact. + static HALF_CONSTEXPR_CONST bool is_exact = false; + + /// Doesn't provide modulo arithmetic. + static HALF_CONSTEXPR_CONST bool is_modulo = false; + + /// IEEE conformant. + static HALF_CONSTEXPR_CONST bool is_iec559 = true; + + /// Supports infinity. + static HALF_CONSTEXPR_CONST bool has_infinity = true; + + /// Supports quiet NaNs. + static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true; + + /// Supports subnormal values. + static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present; + + /// Rounding mode. + /// Due to the mix of internal single-precision computations (using the rounding mode of the underlying + /// single-precision implementation) with explicit truncation of the single-to-half conversions, the actual rounding + /// mode is indeterminate. 
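+ ///
+ /// An illustrative check (assuming only standard <limits> semantics of this specialization):
+ /// \code
+ /// if(std::numeric_limits<half_float::half>::round_style == std::round_indeterminate) {
+ ///     // be conservative about accumulated rounding error
+ /// }
+ /// \endcode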
+ static HALF_CONSTEXPR_CONST float_round_style round_style = (numeric_limits::round_style== + half_float::half::round_style) ? half_float::half::round_style : round_indeterminate; + + /// Significant digits. + static HALF_CONSTEXPR_CONST int digits = 11; + + /// Significant decimal digits. + static HALF_CONSTEXPR_CONST int digits10 = 3; + + /// Required decimal digits to represent all possible values. + static HALF_CONSTEXPR_CONST int max_digits10 = 5; + + /// Number base. + static HALF_CONSTEXPR_CONST int radix = 2; + + /// One more than smallest exponent. + static HALF_CONSTEXPR_CONST int min_exponent = -13; + + /// Smallest normalized representable power of 10. + static HALF_CONSTEXPR_CONST int min_exponent10 = -4; + + /// One more than largest exponent + static HALF_CONSTEXPR_CONST int max_exponent = 16; + + /// Largest finitely representable power of 10. + static HALF_CONSTEXPR_CONST int max_exponent10 = 4; + + /// Smallest positive normal value. + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary_t(), 0x0400); } + + /// Smallest finite value. + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary_t(), 0xFBFF); } + + /// Largest finite value. + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary_t(), 0x7BFF); } + + /// Difference between one and next representable value. + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary_t(), 0x1400); } + + /// Maximum rounding error. + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW + { return half_float::half(half_float::detail::binary_t(), (round_style==round_to_nearest) ? 0x3800 : 0x3C00); } + + /// Positive infinity. + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary_t(), 0x7C00); } + + /// Quiet NaN. + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary_t(), 0x7FFF); } + + /// Signalling NaN. + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary_t(), 0x7DFF); } + + /// Smallest positive subnormal value. + MEGDNN_HOST MEGDNN_DEVICE static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary_t(), 0x0001); } + }; + +#ifdef MEGDNN_CC_HOST +#if HALF_ENABLE_CPP11_HASH + /// Hash function for half-precision floats. + /// This is only defined if C++11 `hash` is supported and enabled. + template<> struct hash //: unary_function + { + /// Type of function argument. + typedef half_float::half argument_type; + + /// Function return type. + typedef size_t result_type; + + /// Compute hash function. 
+ /// \param arg half to hash + /// \return hash value + MEGDNN_HOST MEGDNN_DEVICE result_type operator()(argument_type arg) const + { return hash()(static_cast(arg.data_)&-(arg.data_!=0x8000)); } + }; +#endif +#endif +} + + +#undef HALF_CONSTEXPR +#undef HALF_CONSTEXPR_CONST +#undef HALF_NOEXCEPT +#undef HALF_NOTHROW +#ifdef HALF_POP_WARNINGS + #pragma warning(pop) + #undef HALF_POP_WARNINGS +#endif + +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/handle.h b/dnn/include/megdnn/handle.h new file mode 100644 index 00000000..a84ac2f4 --- /dev/null +++ b/dnn/include/megdnn/handle.h @@ -0,0 +1,148 @@ +/** + * \file dnn/include/megdnn/handle.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megcore.h" +#include "megdnn/config/config.h" +#include "megdnn/basic_types.h" + +#include +#include + +#include "megdnn/internal/visibility_prologue.h" +namespace megdnn { + +class OperatorBase; + +class Handle { + public: + enum class HandleType { + NAIVE = 0, + FALLBACK = 1, + X86 = 2, + CUDA = 6, + }; + + protected: + Handle(megcoreComputingHandle_t computing_handle, HandleType type); + + public: + /** + * \brief Create a MegDNN handle from a MegCore Computing handle. + * + * \param[in] computing_handle MegCore computing handle. Please note + * that computing_handle would not be released when this Handle is + * destructed + * \param[in] debug_level + * Applicable for CPU computing handle. + * 0 means taking the fastest possible code path; it may contains + * platform-specific instructions such as SSE for x86_64 or NEON for + * armv7v7. + * 1 means taking the fastest possible code path without + * platform-specific instructions in C++ code. Note that the compiled + * binary file still contains platform-specific codes. + * 2 means taking the naive code path. Performance is severely + * hampered, but it is less error-prone since the internal + * implementation is rather straightforward. + * + * **Debug level 1 and 2 should not be used in productions.** + */ + static std::unique_ptr make( + megcoreComputingHandle_t computing_handle, + int debug_level = 0); + +#if MEGDNN_WITH_CUDA + static std::unique_ptr make_cuda_handle( + megcoreComputingHandle_t computing_handle); + template + std::unique_ptr create_cuda_operator(); +#endif + + virtual ~Handle(); + + /*! + * \brief Get the underlying megcore computing handle. + */ + megcoreComputingHandle_t megcore_computing_handle() const { + return m_computing_handle; + } + + /*! + * \brief set a callback function to be invoked when this handle is + * destructed, so associated resources can be released (e.g. + * computing handle) + * + * This function can be called at most once. + */ + void set_destructor(const thin_function &d); + + /*! + * \brief set a callback to be invoked when an operator is destructed + * \param[in,out] cb the callback function; it would be set to the + * previous callback function + */ + void set_opr_destruct_callback(thin_function &cb) { + cb.swap(m_on_opr_destructed); + } + + void on_opr_destructed(OperatorBase* opr); + + /** + * \brief Create operator of Opr type. 
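+ *
+ * A minimal usage sketch (illustrative; computing_handle is assumed to be an existing
+ * megcoreComputingHandle_t, and Resize is declared in megdnn/oprs/cv.h):
+ * \code
+ * auto handle = megdnn::Handle::make(computing_handle);
+ * auto resize = handle->create_operator<megdnn::Resize>();
+ * \endcode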
+ */ + template + std::unique_ptr create_operator(); + + /* + * ============================================================= + * Users should call functions below to query memory requirement. + * ============================================================= + */ + + /** + * \brief The internal data pointer of TensorND should be aligned to + * alignment_requirement() in bytes. + */ + virtual size_t alignment_requirement() const; + + //! get alignment in bytes for rows of image 2D tensor format + virtual size_t image2d_pitch_alignment() const; + + HandleType type() const { + return m_handle_type; + } + + /** + * \brief Check is the layout satisfy cross device copy constraint. + * 1. The handle of the src and the dst is the same kind + * 2. The dst is continguous. + */ + virtual bool check_cross_dev_copy_constraint(const TensorLayout &src); + + private: + static constexpr uint32_t ALIVE_MAGIC = 0x8595e9d2u; + volatile uint32_t m_alive_magic = ALIVE_MAGIC; + megcoreComputingHandle_t m_computing_handle; + const HandleType m_handle_type; + thin_function m_destructor; + thin_function m_on_opr_destructed; + + Handle() = delete; + Handle(const Handle &rhs) = delete; + Handle &operator=(const Handle &rhs) = delete; +}; + +} // namespace megdnn + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/internal/defs.h b/dnn/include/megdnn/internal/defs.h new file mode 100644 index 00000000..60bb8144 --- /dev/null +++ b/dnn/include/megdnn/internal/defs.h @@ -0,0 +1,35 @@ +/** + * \file dnn/include/megdnn/internal/defs.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#define MEGDNN_MAX_NDIM 7 + +/*! + * \brief iterate through small (usually used) ndim values + */ +#define MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb, ...) \ + cb(1 ,##__VA_ARGS__) cb(2 ,##__VA_ARGS__) cb(3 ,##__VA_ARGS__) + +/*! + * \brief iterate through large (rarely used) ndim values + */ +#define MEGDNN_FOREACH_TENSOR_NDIM_LARGE(cb, ...) \ + cb(4 ,##__VA_ARGS__) cb(5 ,##__VA_ARGS__) cb(6 ,##__VA_ARGS__) \ + cb(7, ##__VA_ARGS__) + +/*! + * \brief iterate through all ndim values + */ +#define MEGDNN_FOREACH_TENSOR_NDIM(cb, ...) \ + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb ,##__VA_ARGS__) \ + MEGDNN_FOREACH_TENSOR_NDIM_LARGE(cb ,##__VA_ARGS__) + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/internal/opr_header_epilogue.h b/dnn/include/megdnn/internal/opr_header_epilogue.h new file mode 100644 index 00000000..75898ac3 --- /dev/null +++ b/dnn/include/megdnn/internal/opr_header_epilogue.h @@ -0,0 +1,19 @@ +/** + * \file dnn/include/megdnn/internal/opr_header_epilogue.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// intentional no header guard here + +#undef DEF_OPR_PARAM +#undef DEF_OPR_IMPL +#undef DEF_OPR_IMPL_CTOR + +#include "./visibility_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/internal/opr_header_prologue.h b/dnn/include/megdnn/internal/opr_header_prologue.h new file mode 100644 index 00000000..9331c0bf --- /dev/null +++ b/dnn/include/megdnn/internal/opr_header_prologue.h @@ -0,0 +1,64 @@ +/** + * \file dnn/include/megdnn/internal/opr_header_prologue.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// intentional no header guard here + +#include "megdnn/handle.h" +#include "megdnn/oprs/base.h" +#include "megdnn/opr_param_defs.h" +#include "megdnn/opr_result_defs.h" + +#include "./visibility_prologue.h" + +#include +#include + +#ifndef _megdnn_in +#define _megdnn_in +#endif + +#ifndef _megdnn_out +#define _megdnn_out +#endif + +#ifndef _megdnn_tensor_in +#define _megdnn_tensor_in const TensorND & +#endif + +#ifndef _megdnn_tensor_out +#define _megdnn_tensor_out const TensorND & +#endif + +#ifndef _megdnn_tensor_inout +#define _megdnn_tensor_inout const TensorND & +#endif + +#ifndef _megdnn_workspace +#define _megdnn_workspace const Workspace & +#endif + +#define DEF_OPR_IMPL_CTOR(_opr_name, _base_name) \ + public: \ + _opr_name(Handle *handle): _base_name(handle) {} \ + +#define DEF_OPR_IMPL(_opr_name, _base_name, _nr_inputs, _nr_outputs) \ + DEF_OPR_IMPL_CTOR(_opr_name, _base_name) \ + static MEGDNN_CONSTEXPR int NR_INPUTS = _nr_inputs; \ + static MEGDNN_CONSTEXPR int NR_OUTPUTS = _nr_outputs; \ + +#define DEF_OPR_PARAM(_pname) \ + public: \ + using Param = param::_pname; \ + Param& param() { return m_param; } \ + const Param& param() const { return m_param; } \ + protected: \ + Param m_param + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/internal/visibility_epilogue.h b/dnn/include/megdnn/internal/visibility_epilogue.h new file mode 100644 index 00000000..b40ce906 --- /dev/null +++ b/dnn/include/megdnn/internal/visibility_epilogue.h @@ -0,0 +1,23 @@ +/** + * \file dnn/include/megdnn/internal/visibility_epilogue.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#if MEGDNN_SHARED_LIB +#pragma GCC visibility pop +#endif + +#ifdef MEGDNN_VISIBILITY_PROLOGUE_INCLUDED +#undef MEGDNN_VISIBILITY_PROLOGUE_INCLUDED +#else +#error "visibility_epilogue.h must be included after visibility_prologue.h" +#endif + +// vim: syntax=cpp.doxygen + diff --git a/dnn/include/megdnn/internal/visibility_prologue.h b/dnn/include/megdnn/internal/visibility_prologue.h new file mode 100644 index 00000000..5c13f00d --- /dev/null +++ b/dnn/include/megdnn/internal/visibility_prologue.h @@ -0,0 +1,22 @@ +/** + * \file dnn/include/megdnn/internal/visibility_prologue.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#ifdef MEGDNN_VISIBILITY_PROLOGUE_INCLUDED +#error "visibility_prologue.h included twice without including visibility_epilogue.h" +#else +#define MEGDNN_VISIBILITY_PROLOGUE_INCLUDED +#endif + +#if MEGDNN_SHARED_LIB +#pragma GCC visibility push(default) +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/opr_result_defs.h b/dnn/include/megdnn/opr_result_defs.h new file mode 100644 index 00000000..53e6e4ab --- /dev/null +++ b/dnn/include/megdnn/opr_result_defs.h @@ -0,0 +1,40 @@ +/** + * \file dnn/include/megdnn/opr_result_defs.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include + +namespace megdnn { +namespace opr_result { + + struct Checksum { + uint32_t checksum; + union { + int32_t iv; + float fv; + } last_val; + + bool operator == (const Checksum &rhs) const { + return checksum == rhs.checksum && + last_val.iv == rhs.last_val.iv; + } + + bool operator != (const Checksum &rhs) const { + return !operator==(rhs); + } + }; + +} // namespace opr_result +} // namespace megdnn + + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/oprs.h b/dnn/include/megdnn/oprs.h new file mode 100644 index 00000000..35342cac --- /dev/null +++ b/dnn/include/megdnn/oprs.h @@ -0,0 +1,21 @@ +/** + * \file dnn/include/megdnn/oprs.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/oprs/cv.h" +#include "megdnn/oprs/general.h" +#include "megdnn/oprs/nn.h" +#include "megdnn/oprs/nn_int.h" +#include "megdnn/oprs/imgproc.h" +#include "megdnn/oprs/utils.h" +#include "megdnn/oprs/linalg.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/oprs/base.h b/dnn/include/megdnn/oprs/base.h new file mode 100644 index 00000000..d758c6d0 --- /dev/null +++ b/dnn/include/megdnn/oprs/base.h @@ -0,0 +1,268 @@ +/** + * \file dnn/include/megdnn/oprs/base.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/basic_types.h" + +#include "megdnn/internal/visibility_prologue.h" +namespace megdnn { + +class Handle; + +/** + * \brief base class for all operators + * + * This is an helper class. Users should not use OperatorBase directly. + * Operators should be created by handle->create_opr<>(). 
+ * + * Each operator must provides the following constexpr values: + * + * * NR_INPUTS: number of input vars + * * NR_OUTPUTS: number of output vars + * * OPERATOR_TYPE: operator type as an enum + * + * If the operator has dynamic inputs or in_out param, the corresponding + * NR_INPUTS is -1. + * + * For an operator whose NR_INPUTS >= 0 and NR_OUTPUTS >= 0, the operator must + * also provide following methods: + * + * * void exec(_megdnn_in inputs..., _megdnn_tensor_out outputs..., + * _megdnn_workspace workspace) + * * void deduce_layout(const TensorLayout& inputs..., + * TensorLayout& outputs...) + * * size_t get_workspace_in_bytes(const TensorLayout &inputs..., + * const TensorLayout &outputs) + */ +class OperatorBase { +public: + explicit OperatorBase(Handle* handle) : m_handle(handle) {} + virtual ~OperatorBase(); + + //! get the handle from which this operator is created + Handle* handle() const { return m_handle; } + + //! whether this opr guarantees that its exec() is thread-safe + virtual bool is_thread_safe() const { return false; } + + /*! + * \brief set the tracker to be used with MegcoreAsyncErrorInfo + * + * Most operators do not have async errors so this function has a + * default empty implementation. + */ + virtual void set_error_tracker(void*) {} + +private: + Handle* m_handle; +}; + +namespace detail { +/** + * \brief AlgoSelectionStrategy is the advance information for selecting + * algo + */ +enum class AlgoSelectionStrategy { + HEURISTIC = 0, //!< heristic to select the algos + FAST_RUN = 1, + FULL_RUN = 2, +}; + +/*! + * \brief Abstract representation of an algorithm for implementing + * the operator + * + * All pointers to Algorithm should be allocated globally and usable + * across multiple megdnn handles, and they should not be freed by + * the caller. + */ +class Algorithm { +public: + /** + * \brief whether the execution result is + * reproducible across multiple runs. + */ + virtual bool is_reproducible() const = 0; + virtual const char* name() const = 0; + + //! a pointer to represent class type + virtual void* type() const { return nullptr; } + +protected: + ~Algorithm() = default; +}; + +/*! + * \brief define Algorithm and ExecutionPolicy for oprs that have + * multiple impl algos + * + * \tparam Opr the operator class + * \tparam nargs number of arguments + */ +template +class MultiAlgoOpr; + +//! base def +template +class MultiAlgoOpr { +public: + using Algorithm = detail::Algorithm; + /*! + * \brief get a string representation for current algorithm set; + * + * get_all_algorithms() may return different algorithms only if + * algorithm set name differs. This is used for checking cache + * validity. + */ + virtual const char* get_algorithm_set_name() const = 0; + + //! policy for executing the operator + struct ExecutionPolicy { + //! nullptr means using heuristic + Algorithm* algorithm = nullptr; + }; + + ExecutionPolicy& execution_policy() { return m_execution_policy; } + + const ExecutionPolicy& execution_policy() const { + return m_execution_policy; + } + +protected: + ~MultiAlgoOpr() = default; + +private: + ExecutionPolicy m_execution_policy; +}; + +//! specialize for nargs == 3 +template +class MultiAlgoOpr : public MultiAlgoOpr { +public: + using Algorithm = detail::Algorithm; + + //! get all possible algorithms for the specified layouts + virtual std::vector get_all_algorithms( + const TensorLayout& p0, const TensorLayout& p1, + const TensorLayout& p2) = 0; + + /** + * \brief Returns the best algorithm by heuristic. 
+ * + * The selected algorithm should not use workspace more than + * \p workspace_limit_in_bytes. + */ + virtual Algorithm* get_algorithm_heuristic( + const TensorLayout& p0, const TensorLayout& p1, + const TensorLayout& p2, + size_t workspace_limit_in_bytes = + std::numeric_limits::max(), + bool reproducible = false) = 0; + +protected: + ~MultiAlgoOpr() = default; +}; + +//! specializae for nargs == 4 +template +class MultiAlgoOpr : public MultiAlgoOpr { +public: + using Algorithm = detail::Algorithm; + + //! get all possible algorithms for the specified layouts + virtual std::vector get_all_algorithms( + const TensorLayout& p0, const TensorLayout& p1, + const TensorLayout& p2, const TensorLayout& p3) = 0; + + /** + * \brief Returns the best algorithm by heuristic. + * + * The selected algorithm should not use workspace more than + * \p workspace_limit_in_bytes. + */ + virtual Algorithm* get_algorithm_heuristic( + const TensorLayout& p0, const TensorLayout& p1, + const TensorLayout& p2, const TensorLayout& p3, + size_t workspace_limit_in_bytes = + std::numeric_limits::max(), + bool reproducible = false) = 0; + +protected: + ~MultiAlgoOpr() = default; +}; + +//! specializae for nargs == 5 +template +class MultiAlgoOpr : public MultiAlgoOpr { +public: + using Algorithm = detail::Algorithm; + + //! get all possible algorithms for the specified layouts + virtual std::vector get_all_algorithms( + const TensorLayout& p0, const TensorLayout& p1, + const TensorLayout& p2, const TensorLayout& p3, + const TensorLayout& p4) = 0; + + /** + * \brief Returns the best algorithm by heuristic. + * + * The selected algorithm should not use workspace more than + * \p workspace_limit_in_bytes. + */ + virtual Algorithm* get_algorithm_heuristic( + const TensorLayout& p0, const TensorLayout& p1, + const TensorLayout& p2, const TensorLayout& p3, + const TensorLayout& p4, + size_t workspace_limit_in_bytes = + std::numeric_limits::max(), + bool reproducible = false) = 0; + +protected: + ~MultiAlgoOpr() = default; +}; + +//! specializae for nargs == 8 +template +class MultiAlgoOpr : public MultiAlgoOpr { +public: + using Algorithm = detail::Algorithm; + + //! get all possible algorithms for the specified layouts + virtual std::vector get_all_algorithms( + const TensorLayout& p0, const TensorLayout& p1, + const TensorLayout& p2, const TensorLayout& p3, + const TensorLayout& p4, const TensorLayout& p5, + const TensorLayout& p6, const TensorLayout& p7) = 0; + + /** + * \brief Returns the best algorithm by heuristic. + * + * The selected algorithm should not use workspace more than + * \p workspace_limit_in_bytes. + */ + virtual Algorithm* get_algorithm_heuristic( + const TensorLayout& p0, const TensorLayout& p1, + const TensorLayout& p2, const TensorLayout& p3, + const TensorLayout& p4, const TensorLayout& p5, + const TensorLayout& p6, const TensorLayout& p7, + size_t workspace_limit_in_bytes = + std::numeric_limits::max(), + bool reproducible = false) = 0; + +protected: + ~MultiAlgoOpr() = default; +}; +} // namespace detail +} // namespace megdnn + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/oprs/cv.h b/dnn/include/megdnn/oprs/cv.h new file mode 100644 index 00000000..b46ac2ac --- /dev/null +++ b/dnn/include/megdnn/oprs/cv.h @@ -0,0 +1,275 @@ +/** + * \file dnn/include/megdnn/oprs/cv.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/internal/opr_header_prologue.h" + +namespace megdnn { + +/** + * \brief This file contains CV operators, The layout is NHWC + */ + +class FlipBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(FlipBase, OperatorBase); + DEF_OPR_PARAM(Flip); + + protected: + void deduce_layout_fwd(const TensorLayout &src, TensorLayout &dst); + void check_layout_fwd(const TensorLayout &src, const TensorLayout &dst); +}; + +class FlipForward : public FlipBase { + DEF_OPR_IMPL(FlipForward, FlipBase, 1, 1); + + public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Flip = FlipForward; + +class RotateBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(RotateBase, OperatorBase); + DEF_OPR_PARAM(Rotate); + + protected: + void deduce_layout_fwd(const TensorLayout &src, TensorLayout &dst); + void check_layout_fwd(const TensorLayout &src, const TensorLayout &dst); +}; + +class RotateForward : public RotateBase { + DEF_OPR_IMPL(RotateForward, RotateBase, 1, 1); + + public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Rotate = RotateForward; + +class ROICopyBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(ROICopyBase, OperatorBase); + DEF_OPR_PARAM(ROICopy); + + protected: + void deduce_layout_fwd(const TensorLayout &src, TensorLayout &dst); + void check_layout_fwd(const TensorLayout &src, const TensorLayout &dst); +}; + +class ROICopyForward : public ROICopyBase { + DEF_OPR_IMPL(ROICopyForward, ROICopyBase, 1, 1); + + public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using ROICopy = ROICopyForward; + +class CvtColorBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(CvtColorBase, OperatorBase); + DEF_OPR_PARAM(CvtColor); + + protected: + void deduce_layout_fwd(const TensorLayout &src, TensorLayout &dst); + void check_layout_fwd(const TensorLayout &src, const TensorLayout &dst); +}; + +class CvtColorForward : public CvtColorBase { + DEF_OPR_IMPL(CvtColorForward, CvtColorBase, 1, 1); + + public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t 
workspace_in_bytes); +}; +using CvtColor = CvtColorForward; + +/** + * \brief Applices an affine transformation + */ +class WarpAffineBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(WarpAffineBase, OperatorBase); + DEF_OPR_PARAM(WarpAffine); + + public: + using InterpolationMode = Param::InterpolationMode; + using BorderMode = Param::BorderMode; + protected: + void check_layout_fwd(const TensorLayout& src, const TensorLayout& trans, + const TensorLayout& dst); + std::string param_msg() const; + int get_real_coord(int p, int len); +}; + +class WarpAffineForward : public WarpAffineBase { + DEF_OPR_IMPL(WarpAffineForward, WarpAffineBase, 2, 1); + + public: + /** + * \param[in] src input tensor + * \param[in] trans transform matrix tensor + * \param[in] dst output tensor + * + * \warning src, trans, border_value, dst should be contiguous + * The size of trans is N * 2 * 3 + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in trans, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &trans, + const TensorLayout &dst) = 0; + + protected: + void check_exec(const TensorLayout &src, const TensorLayout &trans, + const TensorLayout &dst, size_t workspace_in_bytes); +}; +using WarpAffine = WarpAffineForward; + +class GaussianBlurBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(GaussianBlurBase, OperatorBase); + DEF_OPR_PARAM(GaussianBlur); + + protected: + void deduce_layout_fwd(const TensorLayout &src, TensorLayout &dst); + void check_layout_fwd(const TensorLayout &src, const TensorLayout &dst); +}; + +class GaussianBlurForward : public GaussianBlurBase { + DEF_OPR_IMPL(GaussianBlurForward, GaussianBlurBase, 1, 1); + + public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using GaussianBlur = GaussianBlurForward; + +/** + * \brief Resize opr. + */ +class ResizeBase : public OperatorBase { + DEF_OPR_PARAM(Resize); + DEF_OPR_IMPL(ResizeBase, OperatorBase, 1, 1); + +public: + using InterpolationMode = Param::InterpolationMode; + +protected: + //! 
get origin coord + std::pair get_origin_coord(float scale, int size, int idx); + void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); +}; + +class ResizeForward : public ResizeBase { + DEF_OPR_IMPL(ResizeForward, ResizeBase, 1, 1); + +public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using Resize = ResizeForward; + +class ResizeBackward : public ResizeBase { + DEF_OPR_IMPL(ResizeBackward, ResizeBase, 1, 1); + +public: + virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + + virtual size_t get_workspace_in_bytes(const TensorLayout& diff, + const TensorLayout& mat) = 0; + +protected: + void check_exec(const TensorLayout& diff, const TensorLayout& mat, + size_t workspace_in_bytes); +}; + +class SeparableFilterBase: public OperatorBase { + DEF_OPR_IMPL_CTOR(SeparableFilterBase, OperatorBase); + DEF_OPR_PARAM(SeparableFilter); + protected: + void deduce_layout_fwd(const TensorLayout &src, + const TensorLayout &filter_x, + const TensorLayout &filter_y, + TensorLayout &dst); + void check_layout_fwd(const TensorLayout &src, + const TensorLayout &filter_x, + const TensorLayout &filter_y, + const TensorLayout &dst); +}; + +class SeparableFilterForward: public SeparableFilterBase { + DEF_OPR_IMPL(SeparableFilterForward, SeparableFilterBase, 3, 1); + public: + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter_x, + _megdnn_tensor_in filter_y, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, + const TensorLayout &filter_x, + const TensorLayout &filter_y, + TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &filter_x, + const TensorLayout &filter_y, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayout &filter_x, + const TensorLayout &filter_y, + const TensorLayout &dst, size_t workspace_in_bytes); +}; +using SeparableFilter = SeparableFilterForward; + +} // namespace megdnn + +#include "megdnn/internal/opr_header_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/oprs/general.h b/dnn/include/megdnn/oprs/general.h new file mode 100644 index 00000000..559bcc3f --- /dev/null +++ b/dnn/include/megdnn/oprs/general.h @@ -0,0 +1,1269 @@ +/** + * \file dnn/include/megdnn/oprs/general.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/internal/opr_header_prologue.h" +#include "megdnn/thin/small_vector.h" + +namespace megdnn { + +/*! + * \brief standard element-wise operator + * + * Inputs must have same dtype, and their shapes must broadcastable into a final + * shape. They can have arbitrary layouts, but non-contiguous and non-broadcast + * layouts may harm performance seriously. + * + * Output dtype is the same as input dtype (note that even for compare oprs this + * is true, e.g. 
float == float returns value of float). Output layout must be + * contiguous. + */ +class ElemwiseForward: public OperatorBase { + DEF_OPR_PARAM(Elemwise); + DEF_OPR_IMPL(ElemwiseForward, OperatorBase, -1, 1); + + public: + using Mode = Param::Mode; + + //! information about a mode + struct ModeTrait { + uint32_t arity; //!< number of inputs needed + bool commutable; //!< whether arity == 2 and inputs commutable + bool allow_int; //!< whether int inputs allowed + bool allow_float; //!< whether float inputs allowed + const char* name; //!< name of the mode + + + ModeTrait(): + arity(0), commutable(0), allow_int(0), allow_float(0), + name(NULL) + {} + + //! get trait from a mode; this function is thread safe + static const ModeTrait& from_mode(Mode mode); + }; + + //! get trait of current mode + const ModeTrait& mode_trait() const { + return ModeTrait::from_mode(m_param.mode); + } + + /** + * \param[in] src input tensor + * \param[out] dst output tensor + * + * src and dst should have the same shape; + * layouts should be contiguous; + * the underlying data pointer can point to the same memory region for + * src and dst. + */ + virtual void exec(_megdnn_in const TensorNDArray &src, + _megdnn_tensor_out dst) = 0; + + //! deduce output shape (do not check whether arity matches) + static void deduce_shape( + const TensorShapeArray &src, + TensorShape &dst); + + static void deduce_format(const TensorFormatArray& src, + TensorFormat& dst); + + //! deduce output layout + void deduce_layout(const TensorLayoutArray &src, + TensorLayout &dst); + + protected: + //! throw exception if incorrect layout; broadcast input shape to + //! output shape + void check_layout_and_broadcast( + const TensorLayoutPtrArray &src, const TensorLayout &dst); + + private: + void check_dtype(DType dtype); +}; +using Elemwise = ElemwiseForward; + +/*! + * \brief compute ``x**a`` where ``a`` is a constant from the Param + * + * This opr is usually not directly accessible by the end user and it is created + * by mgb optimizer, aiming to work around numerical stability issues with pow. + * For example ``powf(x, 2.f)`` with ``x < 0`` in fast math mode may return NaN. + * + * Like elemwise, this opr supports arbitrary strides. But it should only be + * used with monotone strides. Input and output should have the same + * float-category dtype. + */ +class PowC : public OperatorBase { + DEF_OPR_PARAM(PowC); + DEF_OPR_IMPL(PowC, OperatorBase, 1, 1); + +public: + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst); + + //! compatible API for mgb; workspace is not used + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace) { + return exec(src, dst); + } + + size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) { + // the impls should require no workspace; this can be later changed to a + // virtual function if this situation changes + return 0; + } + + void deduce_layout(const TensorLayout& src, TensorLayout& dst) { + dst.dtype = src.dtype; + dst.init_contiguous_stride(src); + } + +protected: + /*! + * Perform the computing where layouts have been verified. + * + * \p src can have arbitrary layout, and \p dst is contiguous. They have the + * same shape and dtype. + * + * The implementation should not access param(). It should check \p exp_f + * and \p exp_i for the exponent value. Exactly one of them would be + * non-null. + * + * Note: \p exp_f and \p exp_i must be dereferenced before dispatching any + * kernel. They are allocated on the caller's stack. 
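+ *
+ * For illustration, the exponent-dispatch contract reduces to the
+ * following host-side reference (an editor sketch only, not part of the
+ * megdnn API; pow_c_ref is a hypothetical name):
+ *
+ * \code
+ * #include <cassert>
+ * #include <cmath>
+ *
+ * // Exactly one of exp_f / exp_i is non-null, mirroring do_exec().
+ * inline float pow_c_ref(float x, const float* exp_f, const int* exp_i) {
+ *     assert((exp_f != nullptr) ^ (exp_i != nullptr));
+ *     if (exp_i) {
+ *         // integer-exponent path: well defined for negative bases,
+ *         // e.g. squaring a negative input yields a positive result
+ *         float r = 1.f, b = x;
+ *         int e = *exp_i < 0 ? -*exp_i : *exp_i;
+ *         for (; e; e >>= 1, b *= b)
+ *             if (e & 1)
+ *                 r *= b;
+ *         return *exp_i < 0 ? 1.f / r : r;
+ *     }
+ *     return std::pow(x, *exp_f);  // float-exponent path
+ * }
+ * \endcode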
+ */ + virtual void do_exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + const float* exp_f, const int* exp_i) = 0; +}; + +/*! + * \brief modify a tensor inplace by adding another tensor to it + * + * dst and delta can have arbitrary layout but must have the same shape. + */ +class AddUpdateForward: public OperatorBase { + DEF_OPR_PARAM(AddUpdate); + DEF_OPR_IMPL(AddUpdateForward, OperatorBase, -1, 1); + + public: + virtual void exec( + _megdnn_tensor_inout dst, _megdnn_tensor_in delta) = 0; + + protected: + void check_exec(const TensorLayout &dst, const TensorLayout &delta); +}; +using AddUpdate = AddUpdateForward; + +class ReduceForward: public OperatorBase { + DEF_OPR_PARAM(Reduce); + DEF_OPR_IMPL(ReduceForward, OperatorBase, 1, 1); + + public: + using Mode = Param::Mode; + using DataType = Param::DataType; + + /** + * \param[in] src input tensor + * \param[out] dst output tensor + * + * src and dst should be contiguous. + * src and dst should be of the same shape for all dimensions except + * param().axis. + * the param().axis-th dimension shape for dst should be one. + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Reduce = ReduceForward; + +class CumsumForward: public OperatorBase { + DEF_OPR_PARAM(Cumsum); + DEF_OPR_IMPL(CumsumForward, OperatorBase, 1, 1); + + public: + /** + * \param[in] src input tensor + * \param[out] dst output tensor + * + * src and dst should be contiguous. + * src and dst should have the same shape. + * + * The exclusive flag specifies whether the current element it taken + * into account when calculating results. + * + * The reverse flag specifies whether cumsum is forward ( + * from 0 to n) or backward (from n downto 0). + * + * Example: + * exclusive && reverse: + * dst_i = src_{i+1} + src_{i+2} + ... + src_{n-1} + * exclusive && !reverse + * dst_i = src_0 + src_1 + ... + src_{i-1} + * !exclusive && reverse: + * dst_i = src_i + src_{i+1} + ... + src_{n-1} + * !exclusive && !reverse: + * dst_i = src_0 + src_1 + ... + src_i + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Cumsum = CumsumForward; + +// mxx can be max or min +class ArgmxxBase: public OperatorBase { + DEF_OPR_IMPL_CTOR(ArgmxxBase, OperatorBase); + DEF_OPR_PARAM(Axis); + + protected: + void check_layout_fwd(const TensorLayout &src, + const TensorLayout &dst); +}; + +class ArgmaxForward: public ArgmxxBase { + DEF_OPR_IMPL(ArgmaxForward, ArgmxxBase, 1, 1); + public: + /** + * \param[in] src input tensor + * \param[out] dst output tensor containing the argmax indices + * + * src and dst should be contiguous. + * src and dst should be of the same shape for all dimensions except + * param().axis. + * the param().axis-th dimension shape for dst should be one. 
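+ *
+ * For illustration, this layout contract on a contiguous tensor reduces
+ * to the following host-side reference (editor sketch only; argmax_ref
+ * is a hypothetical helper, not a megdnn function):
+ *
+ * \code
+ * #include <cstddef>
+ * #include <vector>
+ *
+ * // src is contiguous with its shape collapsed to (outer, len, inner)
+ * // around param().axis; dst has the same shape with the axis dimension
+ * // equal to one, i.e. outer * inner int entries holding indices in
+ * // [0, len).
+ * inline void argmax_ref(const std::vector<float>& src, size_t outer,
+ *                        size_t len, size_t inner, std::vector<int>& dst) {
+ *     dst.assign(outer * inner, 0);
+ *     for (size_t o = 0; o < outer; ++o)
+ *         for (size_t i = 0; i < inner; ++i) {
+ *             size_t best = 0;
+ *             for (size_t a = 1; a < len; ++a)
+ *                 if (src[(o * len + a) * inner + i] >
+ *                     src[(o * len + best) * inner + i])
+ *                     best = a;
+ *             dst[o * inner + i] = static_cast<int>(best);
+ *         }
+ * }
+ * \endcode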
+ */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, + TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Argmax = ArgmaxForward; + +class ArgminForward: public ArgmxxBase { + DEF_OPR_IMPL(ArgminForward, ArgmxxBase, 1, 1); + public: + /** + * \param[in] src input tensor + * \param[out] dst output tensor containing the argmax indices + * + * src and dst should be contiguous. + * src and dst should be of the same shape for all dimensions except + * param().axis. + * the param().axis-th dimension shape for dst should be one. + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, + TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Argmin = ArgminForward; + +/*! + * \brief take values from input according to given condition + * + * Output two tensors: + * 1. values copied from *data*, with same dtype as *data* + * 2. selected indices with dtype int32; note that it is 1-dimensional and + * based on the flatten input. + * + * Require data and mask to have the same shape and both be contiguous. + */ +class CondTake : public OperatorBase { + DEF_OPR_IMPL(CondTake, OperatorBase, 2, 2); + DEF_OPR_PARAM(CondTake); + +public: + using Output = std::array; + using OutputDType = std::array; + + OutputDType infer_dtype(DType data, DType mask); + + virtual size_t get_workspace_in_bytes(const TensorLayout& data) = 0; + + virtual Output exec(_megdnn_tensor_in data, _megdnn_tensor_in mask, + _megdnn_workspace workspace, + DynOutMallocPolicyCall malloc_policy) = 0; + +protected: + //! check input layouts and get flattened size + size_t check_exec_get_size(const TensorLayout& data, + const TensorLayout& mask, + size_t workspace_in_bytes); +}; + +class TransposeForward: public OperatorBase { + DEF_OPR_IMPL(TransposeForward, OperatorBase, 1, 1); + DEF_OPR_PARAM(Empty); + public: + /** + * \param[in] src (m, n) stride[0] >= n && stride[1] == 1 + * \param[out] dst (n, m) stride[0] >= m && stride[1] == 1 + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Transpose = TransposeForward; + +/** + * Change a tensor to another layout that has the same dtype and total number of + * elements, and non-overlapping stride. + * + * ON CPU: + * This operator is optimized for some cases(e.g. both dst and last dim of src + * are contiguous) + * + * ON CUDA: + * More contiguous the input/output layouts, higher performance. There is also + * special optimization for broadcast case. + */ +class RelayoutForward: public OperatorBase { + DEF_OPR_IMPL(RelayoutForward, OperatorBase, 1, 1); + DEF_OPR_PARAM(Empty); + public: + /*! 
+ * \brief execute relayout opr + * + * This operator should be placed on the same computing device of *dst*. + * + * \param src_handle handle of input tensor; for CUDA d2d copy, the + * src handle can be on a different GPU for copy tensor with + * non-contig dims <= 2 + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + Handle *src_handle = nullptr) = 0; + protected: + //! check layout and collapse contiguous + void check_layout_and_canonize( + TensorLayout &src, TensorLayout &dst); +}; +using Relayout = RelayoutForward; + +/** + * \brief Base class for Concat and Split operators + */ +class ConcatSplitBase: public OperatorBase { + public: + using Param = param::Axis; + + ConcatSplitBase(Handle *handle); + const Param ¶m() const { return m_param; } + Param ¶m() { return m_param; } + protected: + void check_layout_common(const TensorLayoutArray &srcs, + const TensorLayout &dst); + Param m_param; + /** + * \brief a helper function + * + * A = shape[0] * shape[1] * ... * shape[axis-1] + * B = {srcs[0].shape[axis], srcs[1].shape[axis], ...} + * C = shape[axis+1] * shape[axis+2] * ... * shape[ndim-1] + */ + void get_ABC(const TensorShapeArray &srcs, + size_t &A, + size_t *B, + size_t &C); + thin_function m_get_layout; + thin_function m_get_shape; +}; + +class ConcatForward: public ConcatSplitBase { + DEF_OPR_IMPL(ConcatForward, ConcatSplitBase, 1, 1); + public: + /** + * \param[in] srcs a vector containing all inputs to be concatenated + * \param[out] dst the output tensor. + * + * All tensors in srcs and dst should be contiguous. + * All tensors should have the same shape for all axes except + * param().axis. + * For the param().axis-th axis, the axis shape for dst should be the + * sum of corresponding axis shapes for all srcs. + */ + virtual void exec(_megdnn_in const TensorNDArray &srcs, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayoutArray &srcs, + TensorLayout &dst); + virtual size_t get_workspace_in_bytes( + const TensorLayoutArray &srcs, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayoutArray &srcs, + const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Concat = ConcatForward; + +class SplitForward: public ConcatSplitBase { + DEF_OPR_IMPL(SplitForward, ConcatSplitBase, 1, 1); + public: + /** + * \param[in] src input tensor + * \param[out] dsts a vector containing all splitted result + * + * All tensors in src and dsts should be contiguous. + * All tensors should have the same shape for all axes except + * param().axis. + * For the param().axis-th axis, the axis shape for src should be the + * sum of corresponding axis shapes for all dsts. + */ + virtual void exec(_megdnn_tensor_in src, + const TensorNDArray &dsts, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayoutArray &dsts) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayoutArray &dsts, + size_t workspace_in_bytes); +}; +using Split = SplitForward; + +/** + * \brief Base class for ParamPackConcat and ParamPackSplit Operators. + * + * ParamPack oprs act like Concat and Split, but they also are optimized for a + * large number of inputs and can handle alignment requirements. Axis is also + * not supported. + * + * The table can be generated by gen_table(). The \p srcs in ParamPackSplit and + * \p dsts in ParamPackConcat must be on CPU, and must remain valid until the + * execution stream is synchronized. 
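+ *
+ * As a rough illustration of the alignment handling mentioned above (an
+ * editor sketch; packed_offsets is a hypothetical helper and the actual
+ * table layout produced by gen_table() may differ), each part can be
+ * thought of as being placed at the next offset that is a multiple of the
+ * requested alignment:
+ *
+ * \code
+ * #include <cstddef>
+ * #include <vector>
+ *
+ * // Returns, for every part, its element offset inside the packed
+ * // tensor, rounding each start up to a multiple of `alignment` elements.
+ * inline std::vector<size_t> packed_offsets(
+ *         const std::vector<size_t>& part_sizes, size_t alignment) {
+ *     std::vector<size_t> offsets;
+ *     size_t cur = 0;
+ *     for (size_t sz : part_sizes) {
+ *         cur = (cur + alignment - 1) / alignment * alignment;
+ *         offsets.push_back(cur);
+ *         cur += sz;
+ *     }
+ *     return offsets;
+ * }
+ * \endcode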
+ */ +class ParamPackConcatSplitBase : public OperatorBase { +protected: + void check_exec(const TensorLayout& concated, const TensorLayout& table, + const TensorLayout& parts); + +public: + using Param = megdnn::param::Empty; + ParamPackConcatSplitBase(Handle* handle) : OperatorBase(handle) {} + + //! generate table to be used with ParamPackConcat and ParamPackSplit + static std::vector gen_table(const TensorShapeArray& shapes, + size_t alignment, size_t dtype_size); +}; + +/** + * \brief ParamPackConcat, used for calculating gradient of ParamPackSplit + * Combine multiple gradient tensors into a single large tensor, use copy + * strategy due to AddUpdate or other dynamic situation. + */ +class ParamPackConcat: public ParamPackConcatSplitBase { + DEF_OPR_IMPL(ParamPackConcat, ParamPackConcatSplitBase, 2, 1); + +public: + /* + * \param[in] srcs: TensorND on cpu. srcs[i] corresponding to the + * address of i-th Tensor. + * \param[in] table: with size `2 * srcs.nr_total_elems()`. + * table[addr] corresponding to outer_idx, + * table[addr+srcs.nr_total_elems()] corresponding to + * inner_idx of dsts. + * \param[out] dst: output TensorND, live on cpu or gpu + */ + virtual void exec(_megdnn_tensor_in srcs, _megdnn_tensor_in table, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + + virtual size_t get_workspace_in_bytes(const TensorShapeArray& srcs, + const TensorShape& table, + const TensorShape& dst) = 0; +}; + +/** + * \brief ParamPackSplit, used for network forwarding. + * Split a single large param into several small tensors, use copy stategy + * either. + */ +class ParamPackSplit: public ParamPackConcatSplitBase { + DEF_OPR_IMPL(ParamPackSplit, ParamPackConcatSplitBase, 2, 1); + +public: + /* + * \param[in] src: input TensorND, live on cpu or gpu + * \param[in] table: with size `2 * srcs.nr_total_elems()`. + * table[addr] corresponding to outer_idx, + * table[addr+srcs.nr_total_elems()] corresponding to + * inner_idx of dsts. + * \param[out] dsts: TensorND on cpu. dsts[i] corresponding to the address + * of i-th Tensor + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in table, + _megdnn_tensor_out dsts, _megdnn_workspace workspace) = 0; + + virtual size_t get_workspace_in_bytes(const TensorShape& src, + const TensorShape& table, + const TensorShapeArray& dsts) = 0; +}; + +/** + * \brief base class for Tile and Repeat + */ +class TileRepeatBase: public OperatorBase { + public: + TileRepeatBase(Handle *handle): OperatorBase(handle) {} + struct Param { + TensorShape times; + }; + Param ¶m() { return m_param; } + const Param ¶m() const { return m_param; } + protected: + void check_layout_fwd(const TensorLayout &src, + const TensorLayout &dst); + void deduce_layout_fwd(const TensorLayout &src, + TensorLayout &dst); + /** + * Assuming src/dst/times are already simplified on entrance. + */ + size_t get_workspace_in_bytes_fwd(const TensorShape &src, + const TensorShape &dst, + const TensorShape ×, + DType dtype); + Param m_param; +}; + +class TileBase: public TileRepeatBase { + public: + TileBase(Handle *handle): TileRepeatBase(handle) {} + protected: + void simplify_shape(const TensorShape &src, + const TensorShape &dst, + const TensorShape ×, + TensorShape &src2, + TensorShape &dst2, + TensorShape ×2); + /** + * This is a helper function that would facilitate other backends' + * implementation. 
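+ *
+ * For reference, the Tile/Repeat semantics implemented by the operators
+ * in this section reduce to the following in the 1-d case (editor sketch
+ * only; tile_1d / repeat_1d are hypothetical helpers):
+ *
+ * \code
+ * #include <cstddef>
+ * #include <vector>
+ *
+ * // Tiling {a, b, c} twice gives {a, b, c, a, b, c}.
+ * inline std::vector<int> tile_1d(const std::vector<int>& src,
+ *                                 size_t times) {
+ *     std::vector<int> dst;
+ *     for (size_t t = 0; t < times; ++t)
+ *         dst.insert(dst.end(), src.begin(), src.end());
+ *     return dst;
+ * }
+ *
+ * // Repeating {a, b, c} twice gives {a, a, b, b, c, c}.
+ * inline std::vector<int> repeat_1d(const std::vector<int>& src,
+ *                                   size_t times) {
+ *     std::vector<int> dst;
+ *     for (int v : src)
+ *         dst.insert(dst.end(), times, v);
+ *     return dst;
+ * }
+ * \endcode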
+ */ + size_t get_workspace_in_bytes_fwd(const TensorLayout &src, + const TensorLayout &dst); +}; + +class TileForward: public TileBase { + DEF_OPR_IMPL(TileForward, TileBase, 1, 1); + public: + /** + * \brief Tile src times to get dst. + * \param[in] src input tensor + * \param[out] dst output tensor + * \param[out] workspace temporary workspace + * + * src and dst must be contiguous. + * dst.shape should be {src.shape[0]*param().times[0], + * src.shape[1]*param().times[1], ...} + * + * \see http://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html + * + * Difference between Tile and Repeat: + * Tiling `abc' twice yields `abcabc', whereas repeating `abc' twice + * yields `aabbcc'. + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, + TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Tile = TileForward; + +class TileBackward: public TileBase { + DEF_OPR_IMPL(TileBackward, TileBase, 1, 1); + public: + /** + * \param[in] diff the backpropagated gradient wrt. dst + * \param[out] grad the backpropagated gradient wrt. src + * \param[out] workspace temporary workspace + */ + virtual void exec(_megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &diff, + const TensorLayout &grad) = 0; + protected: + void check_exec(const TensorLayout &diff, const TensorLayout &grad, + size_t workspace_in_bytes); +}; + +class RepeatBase: public TileRepeatBase { + public: + RepeatBase(Handle *handle): TileRepeatBase(handle) {} + protected: + void simplify_shape(const TensorShape &src, + const TensorShape &dst, + const TensorShape ×, + TensorShape &src2, + TensorShape &dst2, + TensorShape ×2); + /** + * This is a helper function that would facilitate other backends' + * implementation. + */ + size_t get_workspace_in_bytes_fwd(const TensorLayout &src, + const TensorLayout &dst); +}; + +class RepeatForward: public RepeatBase { + DEF_OPR_IMPL(RepeatForward, RepeatBase, 1, 1); + public: + /** + * \brief Repeat src times to get dst. + * \param[in] src input tensor + * \param[out] dst output tensor + * \param[out] workspace temporary workspace + * + * src and dst must be contiguous. + * dst.shape should be {src.shape[0]*param().times[0], + * src.shape[1]*param().times[1], ...} + * + * \see http://docs.scipy.org/doc/numpy/reference/generated/numpy.repeat.html + * \see TileForward + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, + TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using Repeat = RepeatForward; + +class RepeatBackward: public RepeatBase { + DEF_OPR_IMPL(RepeatBackward, RepeatBase, 1, 1); + public: + /** + * \param[in] diff the backpropagated gradient wrt. dst + * \param[out] grad the backpropagated gradient wrt. 
src + * \param[out] workspace temporary workspace + */ + virtual void exec(_megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &diff, + const TensorLayout &grad) = 0; + protected: + void check_exec(const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes); +}; + +class ArgsortForward: public OperatorBase { + DEF_OPR_IMPL(ArgsortForward, OperatorBase, 1, 2); + DEF_OPR_PARAM(Argsort); + public: + using Order = Param::Order; + /** + * \param[in] src (m, n) + * \param[out] dst (m, n) + * \param[out] indices (m, n) + * + * src, dst and indices should be contiguous. + * Performing m independent sorting on m arrays of length n. + * Sorting arrays and storing the resulting array in `dst', + * and the corresponding indices in `indices'. + * + * Indices range from 0 to n-1. + * + * Note that indices is a TensorND of type int. + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_tensor_out indices, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, + TensorLayout &dst, + TensorLayout &indices); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst, + const TensorLayout &indices) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayout &dst, + const TensorLayout &indices, + size_t workspace_in_bytes); +}; +using Argsort = ArgsortForward; + +/*! + * \brief backward opr for Argsort + * + * Note: the name is kept for backward compatibility. This opr is actually a + * batched value setter. It is used for gradient computing of Argsort and TopK. + */ +class ArgsortBackward : public OperatorBase { + DEF_OPR_IMPL(ArgsortBackward, OperatorBase, 2, 1); + DEF_OPR_PARAM(Empty); + +public: + /** + * \param[in] diff (m, k) the backpropagated gradient wrt. dst + * \param[in] indices (m, k) the `indices' parameter in + * ArgsortForward::exec + * \param[out] grad (m, n) the backpropagated gradient wrt. src + * + * Constraint: n >= k. Untouched values would be initialized as zero. + */ + virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_in indices, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& diff, + const TensorLayout& indices, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& diff, const TensorLayout& indices, + const TensorLayout& grad, size_t workspace_in_bytes); +}; + +class TopK : public OperatorBase { + DEF_OPR_IMPL(TopK, OperatorBase, 1, 2); + DEF_OPR_PARAM(TopK); + +protected: + //! impl exec; inputs have been validated + virtual void do_exec(int k, _megdnn_tensor_in data, + _megdnn_tensor_out values, int32_t* indices, + _megdnn_workspace workspace) = 0; + +public: + /*! + * \param[in] k if positive, compute the smallest top-k values; otherwise + * compute the largest top-k values + * \param[in] data (m, n) input data, where top-k is computed on the + * second axis. The second dimension must be contiguous, and the first + * dimension can have arbitrary stride. 
+ * \param[out] values (m, ) or (m, k) output values; its shape depends + * on mode + * \param[out] indices () or (m, ) or (m, k) output values; its shape + * depends on mode + */ + void exec(int k, _megdnn_tensor_in data, _megdnn_tensor_out values, + _megdnn_tensor_out indices, _megdnn_workspace workspace); + virtual size_t get_workspace_in_bytes(int k, const TensorLayout& data, + const TensorLayout& values, + const TensorLayout& indices) = 0; + + void deduce_layout(int k, const TensorLayout& data, TensorLayout& values, + TensorLayout& indices); +}; + +/*! + * \brief convert dtype of *src* to match dtype of *dst*; *src* may have + * arbitrary layout and *dst* must be contiguous. + */ +class TypeCvtForward: public OperatorBase { + DEF_OPR_PARAM(Empty); + DEF_OPR_IMPL(TypeCvtForward, OperatorBase, 1, 1); + public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) = 0; + protected: + void check_exec(const TensorLayout &src, const TensorLayout &dst); +}; +using TypeCvt = TypeCvtForward; + +class IndexingRemapBase: public OperatorBase { + public: + using Param = param::IndexingRemap; + + IndexingRemapBase(Handle *handle): OperatorBase(handle) {} + Param ¶m() { return m_param; } + const Param ¶m() const { return m_param; } + protected: + Param m_param; + void check_layout_fwd(const TensorLayout &src, + const TensorLayout &map, + const TensorLayout &dst); +}; + +class IndexingRemapForward: public IndexingRemapBase { + DEF_OPR_IMPL(IndexingRemapForward, IndexingRemapBase, 2, 1); + public: + /** + * \param[in] src input tensor + * \param[in] map input map + * \param[out] dst output tensor + * + * Suppose: + * the shape of src is \f$(s_0, s_1, ..., s_{m-1}\f$; + * the shape of dst is \f$(d_0, d_1, ..., d_{n-1})\f$; + * then: + * the shape of map must be \f$(d_0, d_1, ..., d_{n-1}, m)\f$. + * + * The last dimension of map indicates the src indices for the + * corresponding dst entry. + * + * src and dst can be non-contiguous in a non-overlapping manner. + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_in map, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout &src, + const TensorLayout &map, + TensorLayout &dst); + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &map, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayout &map, + const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using IndexingRemap = IndexingRemapForward; +// The using directives preserve backward compatibility. +using TensorRemapForward = IndexingRemap; +using TensorRemap = TensorRemapForward; + +class IndexingRemapBackward: public IndexingRemapBase { + DEF_OPR_IMPL(IndexingRemapBackward, IndexingRemapBase, 2, 1); + public: + /** + * \param[in] diff the backpropagated gradient wrt. dst + * \param[in] map the `map' parameter in IndexingRemapForward::exec + * \param[out] grad the backpropagated gradient wrt. src + */ + virtual void exec(_megdnn_tensor_in diff, + _megdnn_tensor_in map, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &diff, + const TensorLayout &map, + const TensorLayout &grad) = 0; + protected: + void check_exec(const TensorLayout &diff, + const TensorLayout &map, + const TensorLayout &grad, + size_t workspace_in_bytes); +}; +// The using directives preserve backward compatibility. 
+using TensorRemapBackward = IndexingRemapBackward; + +class Linspace: public OperatorBase { + DEF_OPR_IMPL(Linspace, OperatorBase, 0, 1); + DEF_OPR_PARAM(LinspaceFull); + public: + /** + * \param[out] dst must be 1d. + * + * \see http://docs.scipy.org/doc/numpy/reference/generated/numpy.linspace.html + */ + virtual void exec(_megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &dst, size_t workspace_in_bytes); +}; + +class Eye: public OperatorBase { + DEF_OPR_IMPL(Eye, OperatorBase, 0, 1); + DEF_OPR_PARAM(Eye); + public: + /** + * \see http://docs.scipy.org/doc/numpy/reference/generated/numpy.eye.html + */ + virtual void exec(_megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &dst, size_t workspace_in_bytes); +}; + +class IndexingOneHotBase: public OperatorBase { + DEF_OPR_IMPL_CTOR(IndexingOneHotBase, OperatorBase); + DEF_OPR_PARAM(Axis); + + protected: + void deduce_layout_fwd(const TensorLayout &src, + const TensorLayout &index, + TensorLayout &dst); + void check_layout_fwd(const TensorLayout &src, + const TensorLayout &index, + const TensorLayout &dst); +}; + +/*! + * \brief Indexing for one-hot encoding + * + * Given src, axis and index, + * for all valid (n-1)-dimensional subscript tuples i iterating through index: + * dst[i[0], ..., i[axis-1], 0, i[axis], ..., i[n-2]] = + * inp[i[0], ..., i[axis-1], index[i], i[axis], ..., i[n-2]] + * + * \param[in] src n-dimensional input data + * \param[in] index (n-1)-dimensional index, must be int + * \param[out] dst n-dimensional output data + */ +class IndexingOneHotForward: public IndexingOneHotBase { + DEF_OPR_IMPL(IndexingOneHotForward, IndexingOneHotBase, 2, 1); + + public: + void deduce_layout(const TensorLayout &src, + const TensorLayout &index, TensorLayout &dst) { + deduce_layout_fwd(src, index, dst); + } + + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_in index, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &index, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayout &index, const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using IndexingOneHot = IndexingOneHotForward; + +/*! + * \brief set-subtensor corresponding to IndexingOneHotForward + * + * \param[in,out] data n-dimensional input and output data, whose sub part + * corresponding to *index* would be replaced by *sub* + * \param[in] index (n-1)-dimensional index, must be int + * \param[in] sub n-dimensional sub tensor to be filled in *data* + */ +class IndexingSetOneHotForward: public IndexingOneHotBase { + DEF_OPR_IMPL(IndexingSetOneHotForward, IndexingOneHotBase, -1, 1); + + public: + virtual void exec(_megdnn_tensor_inout data, _megdnn_tensor_in index, + _megdnn_tensor_in sub, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &data, + const TensorLayout &index, + const TensorLayout &sub) = 0; + protected: + void check_exec(const TensorLayout &data, + const TensorLayout &index, const TensorLayout &sub, + size_t workspace_in_bytes); +}; +using IndexingSetOneHot = IndexingSetOneHotForward; + +/*! 
+ * \brief base class for indexing on multiple axes using vector indices + * + * Note that the indexing axes are required to be sorted in ascending order + */ +class IndexingMultiAxisVecBase: public OperatorBase { + DEF_OPR_IMPL_CTOR(IndexingMultiAxisVecBase, OperatorBase); + DEF_OPR_PARAM(Empty); + + public: + struct AxisIndexer { + size_t axis; + TensorND vec; + }; + + struct AxisIndexerLayoutOnly { + size_t axis; + TensorLayout layout; + }; + + using IndexDesc = std::vector; + using IndexDescLayoutOnly = std::vector; + + /*! + * \brief convert IndexDesc to IndexDescLayoutOnly + */ + static IndexDescLayoutOnly extract_index_layout(const IndexDesc &index); + + /*! + * \brief get the axes on src that are not used in index + * \param[out] out output buffer; suggested size is + * TensorLayout::MAX_NDIM + * \return number of elements written to *out* + */ + static size_t get_nonindex_axes(size_t src_ndim, const IndexDesc &index, + size_t *out); + + /*! + * \brief get contiguous-collapsed layout for indexing on value + * \param idx_axis indexer axis on value (i.e. ExecInfo::idx_axis) + * \return a tensor layout and an axis to iterate over *value* and also + * access *data*; stride of layout on that axis would be zero, and + * strides on other axes correspond to the strides in *data* + */ + static std::pair get_value_iter_optimized_layout( + const TensorLayout &data, const TensorLayout &value, + const IndexDesc &index, size_t idx_axis); + + //! helper info for kernel implementation + struct ExecInfo { + //! axis in value used by indexer + size_t idx_axis; + ptrdiff_t value_stride; + + void* error_tracker; + megcore::AsyncErrorInfo* error_info; + }; + + protected: + /*! + * \return axis on dst used by indexer (i.e. ExecInfo::idx_axis) + */ + static size_t deduce_layout_fwd( + const TensorLayout &data, + const IndexDescLayoutOnly &index, + TensorLayout &dst); + + static ExecInfo check_exec_noworkspace( + const TensorLayout &data, const TensorLayout &value, + const IndexDesc &index, IndexDescLayoutOnly &index_layout); +}; + +/*! + * \brief compute indexing result, like numpy advanced indexing + * + * src can have arbitrary layout, but dst must be dim1-contig + */ +class IndexingMultiAxisVec: public IndexingMultiAxisVecBase { + DEF_OPR_IMPL(IndexingMultiAxisVec, IndexingMultiAxisVecBase, 0, 1); + + public: + virtual void exec(_megdnn_tensor_in src, + const IndexDesc &index, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + + /*! + * \brief get workspace size based on output shape and indexing axes + */ + size_t get_workspace_in_bytes( + const TensorShape &dst, + const size_t *axes, size_t nr_axes); + + static void deduce_layout( + const TensorLayout &data, + const IndexDescLayoutOnly &index, + TensorLayout &dst) { + deduce_layout_fwd(data, index, dst); + } + protected: + + virtual size_t get_workspace_in_bytes(size_t dst_idx_size) = 0; + + ExecInfo check_exec( + const TensorLayout &src, + const IndexDesc &index, + const TensorLayout &dst, + size_t workspace_in_bytes); +}; + +/*! + * \brief base class for modifying data by given index + * + * data can have arbitrary layout, but value must be dim1-contig + */ +class IndexingModifyMultiAxisVecBase: public IndexingMultiAxisVecBase { + DEF_OPR_IMPL_CTOR(IndexingModifyMultiAxisVecBase, IndexingMultiAxisVecBase); + + public: + virtual void exec( + _megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc &index, + _megdnn_workspace workspace) = 0; + + /*! 
+ * \brief get workspace size based on shape of value input and indexing + * axes + */ + size_t get_workspace_in_bytes( + const TensorShape &value, + const size_t *axes, size_t nr_axes); + + protected: + ExecInfo check_exec( + const TensorLayout &data, const TensorLayout &value, + const IndexDesc &index, + size_t workspace_in_bytes); + + virtual size_t get_workspace_in_bytes(size_t value_idx_size) = 0; +}; + +//! set value to indexed locations; index values must be non-overlapping +class IndexingSetMultiAxisVec: public IndexingModifyMultiAxisVecBase { + DEF_OPR_IMPL(IndexingSetMultiAxisVec, + IndexingModifyMultiAxisVecBase, 0, 0); +}; + +//! add value to indexed locations; index values must be non-overlapping +class IndexingIncrMultiAxisVec: public IndexingModifyMultiAxisVecBase { + DEF_OPR_IMPL(IndexingIncrMultiAxisVec, + IndexingModifyMultiAxisVecBase, 0, 0); +}; + +class MeshBase : public OperatorBase { + DEF_OPR_PARAM(Empty); + DEF_OPR_IMPL_CTOR(MeshBase, OperatorBase); + +public: + using AxisIndexer = IndexingMultiAxisVecBase::AxisIndexer; + using IndexDesc = IndexingMultiAxisVecBase::IndexDesc; + using AxisIndexerLayoutOnly = + IndexingMultiAxisVecBase::AxisIndexerLayoutOnly; + using IndexDescLayoutOnly = IndexingMultiAxisVecBase::IndexDescLayoutOnly; + + size_t get_workspace_in_bytes(const TensorShape&, const size_t*, size_t) { + return 0; + } + +protected: + virtual void check_exec(const TensorLayout& origin, + const TensorLayout& indexed, const IndexDesc& desc); +}; + +class NormalMeshBase : public MeshBase { + DEF_OPR_IMPL(NormalMeshBase, MeshBase, 0, 0); + +protected: + virtual void check_exec(const TensorLayout& origin, + const TensorLayout& indexed, + const IndexDesc& desc) override final; +}; + +class NormalMeshModifyBase : public NormalMeshBase { + DEF_OPR_IMPL_CTOR(NormalMeshModifyBase, NormalMeshBase); + +public: + virtual void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc& desc, _megdnn_workspace workspace) = 0; +}; + +class BatchedMeshBase : public MeshBase { + DEF_OPR_IMPL_CTOR(BatchedMeshBase, MeshBase); + +protected: + virtual void check_exec(const TensorLayout& origin, + const TensorLayout& indexed, + const IndexDesc& desc) override final; +}; + +class BatchedMeshModifyBase : public BatchedMeshBase { + DEF_OPR_IMPL_CTOR(BatchedMeshModifyBase, BatchedMeshBase); + +public: + virtual void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc& desc, _megdnn_workspace workspace) = 0; +}; + +class MeshIndexing : public NormalMeshBase { + DEF_OPR_IMPL(MeshIndexing, NormalMeshBase, 0, 0); + +public: + virtual void exec(_megdnn_tensor_in src, const IndexDesc& desc, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + + static void deduce_layout(const TensorLayout& inp, + const IndexDescLayoutOnly& layouts, + TensorLayout& out_layout); +}; + +class IncrMeshIndexing : public NormalMeshModifyBase { + DEF_OPR_IMPL(IncrMeshIndexing, NormalMeshModifyBase, 0, 0); +}; + +class SetMeshIndexing : public NormalMeshModifyBase { + DEF_OPR_IMPL(SetMeshIndexing, NormalMeshModifyBase, 0, 0); +}; + +class BatchedMeshIndexing : public BatchedMeshBase { + DEF_OPR_IMPL(BatchedMeshIndexing, BatchedMeshBase, 0, 0); + +public: + virtual void exec(_megdnn_tensor_in src, const IndexDesc& desc, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + + static void deduce_layout(const TensorLayout& inp, + const IndexDescLayoutOnly& layouts, + TensorLayout& out_layout); +}; + +class BatchedIncrMeshIndexing : public BatchedMeshModifyBase { 
+ DEF_OPR_IMPL(BatchedIncrMeshIndexing, BatchedMeshModifyBase, 0, 0); +}; + +class BatchedSetMeshIndexing : public BatchedMeshModifyBase { + DEF_OPR_IMPL(BatchedSetMeshIndexing, BatchedMeshModifyBase, 0, 0); +}; + +class RelayoutFormat : public OperatorBase { + DEF_OPR_PARAM(RelayoutFormat); + DEF_OPR_IMPL(RelayoutFormat, OperatorBase, 1, 1); + +public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, TensorLayout& dst); + void deduce_format(TensorFormat src, TensorFormat& dst); + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) = 0; + +protected: + void deduce_layout_fwd(const TensorLayout& src, TensorLayout& dst); + + void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); + + void check_exec(const TensorLayout& src, const TensorLayout& dst, + size_t workspace_in_bytes); + void deduce_exec_layout(const TensorLayout& src, const TensorLayout& dst, + TensorLayout& exec_src, TensorLayout& exec_dst); +}; +} // namespace megdnn + +#include "megdnn/internal/opr_header_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/oprs/imgproc.h b/dnn/include/megdnn/oprs/imgproc.h new file mode 100644 index 00000000..0f1c1334 --- /dev/null +++ b/dnn/include/megdnn/oprs/imgproc.h @@ -0,0 +1,153 @@ +/** + * \file dnn/include/megdnn/oprs/imgproc.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/internal/opr_header_prologue.h" + +namespace megdnn { + +class WarpPerspectiveBase: public OperatorBase { + DEF_OPR_IMPL_CTOR(WarpPerspectiveBase, OperatorBase); + DEF_OPR_PARAM(WarpPerspective); + public: + using InterpolationMode = Param::InterpolationMode; + using BorderMode = Param::BorderMode; + + protected: + void check_layout_fwd(const TensorLayout &src, const TensorLayout &mat, + const TensorLayout &dst) { + check_layout_fwd(src, mat, {}, dst); + } + + void check_layout_fwd(const TensorLayout &src, const TensorLayout &mat, + const TensorLayout &mat_idx, const TensorLayout &dst); + std::string param_msg() const; + int get_real_coord(int p, int len); +}; + +class WarpPerspectiveForward: public WarpPerspectiveBase { + DEF_OPR_IMPL(WarpPerspectiveForward, WarpPerspectiveBase, 0, 1); + public: + /** + * \param[in] src (n, channel, in_height, in_width) + * \param[in] mat (n, 3, 3) + * \param[out] dst (n, channel, out_height, out_width) + * + * \see http://docs.opencv.org/2.4/modules/imgproc/doc/geometric_transformations.html?highlight=warpaffine + * + * denominator = mat[2][0]*w+mat[2][1]*h+mat[2][2] + * dst(h, w) = src((mat[1][0]*w+mat[1][1]*h+mat[1][2])/denominator, + * (mat[0][0]*w+mat[0][1]*h+mat[0][2])/denominator) + * + * src and dst can have different shapes, as long as their n and c agree. + * src, mat and dst should be contiguous. + */ + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in mat, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + exec(src, mat, {}, dst, workspace); + } + + /** + * \p src should have batch size m, and \p mat and \p mat_idx should + * both have batch size n. Each item in \p mat_idx must be in the range + * of [0, m-1]. 
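+ *
+ * The per-pixel coordinate mapping given in the formula above, combined
+ * with the batch selection through \p mat_idx, reduces to the following
+ * scalar reference (editor sketch; interpolation and border handling are
+ * omitted, and warp_src_coord is a hypothetical helper):
+ *
+ * \code
+ * // For output pixel (h, w) of batch item i, the source image is
+ * // src[mat_idx[i]] (or src[i] when mat_idx is empty) and the sampled
+ * // source coordinate is:
+ * inline void warp_src_coord(const float mat[3][3], float w, float h,
+ *                            float& src_w, float& src_h) {
+ *     float denom = mat[2][0] * w + mat[2][1] * h + mat[2][2];
+ *     src_h = (mat[1][0] * w + mat[1][1] * h + mat[1][2]) / denom;
+ *     src_w = (mat[0][0] * w + mat[0][1] * h + mat[0][2]) / denom;
+ * }
+ * \endcode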
+ * + * \param mat_idx the indices of input image that each matrix in \p mat + * should act on. It can also be empty and in such case \p mat + * should have the same batch size as \p src. + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_in mat, + _megdnn_tensor_in mat_idx, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + + size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &mat, + const TensorLayout &dst) { + return get_workspace_in_bytes(src, mat, {}, dst); + } + + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &mat, + const TensorLayout &mat_idx, + const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayout &mat, + const TensorLayout &mat_idx, + const TensorLayout &dst, + size_t workspace_in_bytes); + + void check_exec_allow_nhwc_mat_idx(const TensorLayout &src, + const TensorLayout &mat, + const TensorLayout &mat_idx, + const TensorLayout &dst, + size_t workspace_in_bytes); +}; +using WarpPerspective = WarpPerspectiveForward; + +class WarpPerspectiveBackwardData: public WarpPerspectiveBase { + DEF_OPR_IMPL(WarpPerspectiveBackwardData, WarpPerspectiveBase, 2, 1); + public: + /** + * \param[in] mat the `mat' parameter in WarpPerspectiveForward::exec + * \param[in] diff the backpropagated gradient wrt. dst + * \param[out] grad the backpropagated gradient wrt. src + * \param[out] workspace temporary workspace to perform backward + */ + virtual void exec(_megdnn_tensor_in mat, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &mat, + const TensorLayout &diff, + const TensorLayout &grad) = 0; + protected: + void check_exec(const TensorLayout &mat, + const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes); +}; + +class WarpPerspectiveBackwardMat: public WarpPerspectiveBase { + DEF_OPR_IMPL(WarpPerspectiveBackwardMat, WarpPerspectiveBase, 3, 1); + public: + /** + * \param[in] src the `src' parameter in WarpPerspectiveForward::exec + * \param[in] mat the `mat' parameter in WarpPerspectiveForward::exec + * \param[in] diff the backpropagated gradient wrt. dst + * \param[out] grad the backpropagated gradient wrt. mat + * \param[out] workspace temporary workspace to perform backward + */ + virtual void exec(_megdnn_tensor_in src, + _megdnn_tensor_in mat, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &mat, + const TensorLayout &diff, + const TensorLayout &grad) = 0; + protected: + void check_exec(const TensorLayout &src, + const TensorLayout &mat, + const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes); +}; + +} // namespace megdnn + +#include "megdnn/internal/opr_header_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/oprs/linalg.h b/dnn/include/megdnn/oprs/linalg.h new file mode 100644 index 00000000..78672a75 --- /dev/null +++ b/dnn/include/megdnn/oprs/linalg.h @@ -0,0 +1,212 @@ +/** + * \file dnn/include/megdnn/oprs/linalg.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/internal/opr_header_prologue.h" + +namespace megdnn { + +class BatchedMatrixMulForward + : public OperatorBase, + public detail::MultiAlgoOpr { + DEF_OPR_PARAM(MatrixMul); + DEF_OPR_IMPL(BatchedMatrixMulForward, OperatorBase, 2, 1); + +public: + /** + * \brief C = op(A) * op(B) + * \param A (B, m, k) if transposeA is false, (B, k, m) otherwise + * \param B (B, k, n) if transposeB is false, (B, n, k) otherwise + * \param C (B, m, n) + * + * A, B, C must be 3-dimensional and C must be contiguous. A and B must + * have stride[2] == 1, and stride[1] >= shape[2], + * and stride[0] >= shape[1] * stride[1] + * + * op(A) = A if transposeA is false, otherwise op(A) = A^t. + * op(B) = B if transposeB is false, otherwise op(B) = B^t. + */ + virtual void exec(_megdnn_tensor_in A, _megdnn_tensor_in B, + _megdnn_tensor_out C, _megdnn_workspace workspace) = 0; + void deduce_dtype(DType A, DType B, DType &C); + void deduce_layout(const TensorLayout& A, const TensorLayout& B, + TensorLayout& C); + virtual size_t get_workspace_in_bytes(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C) = 0; + +protected: + void check_exec(const TensorLayout& A, const TensorLayout& B, + const TensorLayout& C, size_t workspace_in_bytes); +}; +using BatchedMatrixMul = BatchedMatrixMulForward; + +class MatrixMulForward : public OperatorBase, + public detail::MultiAlgoOpr { + DEF_OPR_PARAM(MatrixMul); + DEF_OPR_IMPL(MatrixMulForward, OperatorBase, 2, 1); + +public: + /** + * \brief C = op(A) * op(B) + * \param A (m, k) if transposeA is false, (k, m) otherwise + * \param B (k, n) if transposeB is false, (n, k) otherwise + * \param C (m, n) + * + * A, B, C must be 2-dimensional and C must be contiguous. A and B must + * have stride[1] == 1, and stride[0] >= shape[1] + * + * op(A) = A if transposeA is false, otherwise op(A) = A^t. + * op(B) = B if transposeB is false, otherwise op(B) = B^t. + */ + virtual void exec(_megdnn_tensor_in A, _megdnn_tensor_in B, + _megdnn_tensor_out C, _megdnn_workspace workspace) = 0; + void deduce_dtype(DType A, DType B, DType& C); + void deduce_layout(const TensorLayout& A, const TensorLayout& B, + TensorLayout& C); + virtual size_t get_workspace_in_bytes(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C) = 0; + + static size_t pack_size (const Param::Format format); +protected: + void check_exec(const TensorLayout& A, const TensorLayout& B, + const TensorLayout& C, size_t workspace_in_bytes); +}; +using MatrixMul = MatrixMulForward; + +/*! + * \brief compute the inverse of a batch of matrices + * + * Input and output tensors have the same shape [..., n, n] where the last two + * dimensions represent the matrices. + * + * Currently only float32 is supported. + */ +class MatrixInverse : public OperatorBase { + DEF_OPR_IMPL(MatrixInverse, OperatorBase, 1, 1); + DEF_OPR_PARAM(Empty); + +public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, TensorLayout& dst); + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst); + +protected: + /*! + * \brief get canonized params; throw exception on error. 
+ * + * Note that \p batch and \p n can be null + */ + static void canonize_params(const TensorLayout& layout, size_t* batch, + size_t* n); + + /*! + * \brief canonize and validate input params for exec() impls + * + * Since get_workspace_in_bytes() would be called, \p batch and \p n can not + * be null + */ + void check_exec(const TensorLayout& src, const TensorLayout& dst, + _megdnn_workspace workspace, size_t* batch, size_t* n); + + virtual size_t get_workspace_in_bytes(size_t batch, size_t n, + size_t dtype_size) = 0; +}; + +//! inter-product of two vectors +class DotForward : public OperatorBase { + DEF_OPR_PARAM(Empty); + DEF_OPR_IMPL(DotForward, OperatorBase, 2, 1); + +public: + /** + * \param[in] A + * \param[in] B + * \param[out] C + * + * Calculating the dot product of A and B and store it in C. + * A, B, C must be contiguous. A and B must have the same 1-dimensional + * shape and non-negative strides. C must be scalar. + */ + virtual void exec(_megdnn_tensor_in A, _megdnn_tensor_in B, + _megdnn_tensor_out C, _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& A, const TensorLayout& B, + TensorLayout& C); + virtual size_t get_workspace_in_bytes(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C) = 0; + +protected: + void check_exec(const TensorLayout& A, const TensorLayout& B, + const TensorLayout& C, size_t workspace_in_bytes); +}; +using Dot = DotForward; + +/*! + * \brief Compute the singular value decomposition of a batch of matrices + * + * Input tensors have the shape [..., m, n], where the last two + * dimensions represent the matrices. For the output tensor u, s, vt, + * the following equation holds: u * diag(s) * vt == src. + * + * Currently only float32 is supported. + */ +class SVDForward : public OperatorBase { + DEF_OPR_IMPL(SVDForward, OperatorBase, 1, 3); + DEF_OPR_PARAM(SVD); + +public: + /** + * \brief u, s, vt = SVD(src) and u * diag(s) * vt == src + * \param src (..., m, n) The input tensor, let p = min(m, n) + * \param u (..., m, p) if full_matrices is false, + (..., m, m) if full_matrices is true, + empty tensor if compute_uv is false. + The left singular vector. + + * \param s (..., p) The singular values. + * \param vt (..., p, n) if full_matrices is false, + (..., n, n) if full_matrices is true, + empty tensor if compute_uv is false. + The right singular vector. + * + * src must be contiguous. The computation might be significantly faster + * if compute_uv is false (default to true). 
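+ *
+ * The output shapes described above, for a single (m, n) input matrix,
+ * can be summarized as follows (editor sketch only; svd_out_shapes is a
+ * hypothetical helper, not the deduce_layout() implementation):
+ *
+ * \code
+ * #include <algorithm>
+ * #include <cstddef>
+ * #include <vector>
+ *
+ * struct SvdOutShapes {
+ *     std::vector<size_t> u, s, vt;  // u/vt empty when compute_uv is false
+ * };
+ *
+ * // Shapes such that u * diag(s) * vt reconstructs the (m, n) input.
+ * inline SvdOutShapes svd_out_shapes(size_t m, size_t n,
+ *                                    bool full_matrices, bool compute_uv) {
+ *     size_t p = std::min(m, n);
+ *     SvdOutShapes out;
+ *     out.s = {p};
+ *     if (compute_uv) {
+ *         out.u = full_matrices ? std::vector<size_t>{m, m}
+ *                               : std::vector<size_t>{m, p};
+ *         out.vt = full_matrices ? std::vector<size_t>{n, n}
+ *                                : std::vector<size_t>{p, n};
+ *     }
+ *     return out;
+ * }
+ * \endcode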
+ * + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out u, + _megdnn_tensor_out s, _megdnn_tensor_out vt, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, TensorLayout& u, + TensorLayout& s, TensorLayout& vt); + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& u, const TensorLayout& s, + const TensorLayout& vt); + +protected: + static void canonize_params(const TensorLayout& layout, size_t* batch, + size_t* m, size_t* n); + virtual size_t get_workspace_in_bytes(size_t block_cnt, size_t m, size_t n, + size_t dtype_size) = 0; + void check_exec(const TensorLayout& src, const TensorLayout& u, + const TensorLayout& s, const TensorLayout& vt, + size_t workspace_in_bytes); +}; + +using SVD = SVDForward; + +} // namespace megdnn + +#include "megdnn/internal/opr_header_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h new file mode 100644 index 00000000..05cfd5bb --- /dev/null +++ b/dnn/include/megdnn/oprs/nn.h @@ -0,0 +1,1443 @@ +/** + * \file dnn/include/megdnn/oprs/nn.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/internal/opr_header_prologue.h" + +namespace megdnn { + +class SeparableConvBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(SeparableConvBase, OperatorBase); + DEF_OPR_PARAM(SeparableConv); + +public: + using Mode = Param::Mode; + +protected: + void deduce_layout_fwd(const TensorLayout& src, + const TensorLayout& filter_x, + const TensorLayout& filter_y, TensorLayout& dst); + void check_layout_fwd(const TensorLayout& src, const TensorLayout& filter_x, + const TensorLayout& filter_y, + const TensorLayout& dst); +}; + +class SeparableConvForward : public SeparableConvBase { + DEF_OPR_IMPL(SeparableConvForward, SeparableConvBase, 3, 1); + +public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter_x, + _megdnn_tensor_in filter_y, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, const TensorLayout& filter_x, + const TensorLayout& filter_y, TensorLayout& dst); + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter_x, + const TensorLayout& filter_y, + const TensorLayout& dst) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& filter_x, + const TensorLayout& filter_y, const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using SeparableConv = SeparableConvForward; + +/** + * \brief base class for convolution operation + * + * This operator is supposed to perform convolution on arbitrary input + * dimensions. The input/output format is N, C, dims..., and kernel format can + * take two forms: + * 1. OC, IC, dims..., for conventional dense convolution + * 2. GROUP, OC_PER_GRP, IC_PER_GRP, dims... for sparse group convolution + * + * Currently, only 2D images are supported. 
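+ *
+ * For the 2-d case the two filter forms above decompose as follows (an
+ * editor sketch mirroring the group/ocpg/icpg fields of
+ * CanonizedFilterMeta below; split_filter_shape is a hypothetical helper,
+ * not make_canonized_filter_meta()):
+ *
+ * \code
+ * #include <cstddef>
+ * #include <cstdint>
+ *
+ * struct FilterShapeInfo {
+ *     uint32_t group, ocpg, icpg;  // groups, out/in channels per group
+ *     uint32_t spatial[2];         // filter height, width
+ * };
+ *
+ * // dense filter: (OC, IC, FH, FW); group filter: (G, OCpg, ICpg, FH, FW)
+ * inline FilterShapeInfo split_filter_shape(const uint32_t* dims,
+ *                                           bool is_group) {
+ *     FilterShapeInfo info;
+ *     std::size_t i = 0;
+ *     info.group = is_group ? dims[i++] : 1;
+ *     info.ocpg = dims[i++];
+ *     info.icpg = dims[i++];
+ *     info.spatial[0] = dims[i++];
+ *     info.spatial[1] = dims[i++];
+ *     return info;
+ * }
+ * \endcode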
+ */ +template +class ConvolutionBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(ConvolutionBase, OperatorBase); + using Param = Parameter; + +public: + Param& param() { return m_param; } + const Param& param() const { return m_param; } + +protected: + Param m_param; + +public: + static constexpr size_t MAX_SPATIAL_DIM = 2; + using Mode = typename Param::Mode; + struct CanonizedFilterMeta { + DType dtype; + typename Param::Format format; + + uint32_t + //! whether filter should be flipped (i.e. is CONVOLUTION) + should_flip, + group, //!< number of groups + icpg, //!< input channels per group + ocpg, //!< output channels per group + spatial_ndim, stride[MAX_SPATIAL_DIM], padding[MAX_SPATIAL_DIM], + //! spatial dim + spatial[MAX_SPATIAL_DIM], dilation[MAX_SPATIAL_DIM], + //! spatial dim with dilation applied + dilated_spatial[MAX_SPATIAL_DIM]; + + //! T should be a ConvolutionBase::CanonizedFilterMeta + template + void copy_from(const T& b) { + dtype = b.dtype; + format = b.format; + should_flip = b.should_flip; + group = b.group; + icpg = b.icpg; + ocpg = b.ocpg; + spatial_ndim = b.spatial_ndim; + memcpy(stride, b.stride, sizeof(stride)); + memcpy(padding, b.padding, sizeof(padding)); + memcpy(spatial, b.spatial, sizeof(spatial)); + memcpy(dilation, b.dilation, sizeof(dilation)); + memcpy(dilated_spatial, b.dilated_spatial, sizeof(dilated_spatial)); + } + + bool operator==(const CanonizedFilterMeta& b) const { + bool flag = true; + flag = flag && (format == b.format); + flag = flag && (dtype == b.dtype); + flag = flag && (should_flip == b.should_flip); + flag = flag && (group == b.group); + flag = flag && (icpg == b.icpg); + flag = flag && (ocpg == b.ocpg); + flag = flag && (spatial_ndim == b.spatial_ndim); + if (flag) { + for (uint32_t i = 0; i < spatial_ndim; ++i) { + flag = flag && (stride[i] == b.stride[i]); + flag = flag && (padding[i] == b.padding[i]); + flag = flag && (spatial[i] == b.spatial[i]); + flag = flag && (dilation[i] == b.dilation[i]); + flag = flag && (dilated_spatial[i] == b.dilated_spatial[i]); + } + } + return flag; + } + }; + +protected: + // Check or deduce output DType + void check_or_deduce_dtype_fwd(DType src, DType filter, DType& dst) const; + CanonizedFilterMeta deduce_layout_fwd(const TensorLayout& src, + const TensorLayout& filter, + TensorLayout& dst) const; + CanonizedFilterMeta check_layout_fwd(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) const; + + CanonizedFilterMeta make_canonized_filter_meta( + size_t src_ndim, const TensorLayout& filter) const; +}; + +class MaskPropagate : public OperatorBase { + DEF_OPR_IMPL(MaskPropagate, OperatorBase, 1, 1); + DEF_OPR_PARAM(MaskPropagate); + +public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) = 0; + + void deduce_layout(const TensorLayout& src, TensorLayout& dst); +}; + +/** + * \brief ConvolutionForward Operator with 0/1 Mask matrix + */ +class MaskConvForward : public ConvolutionBase { + DEF_OPR_IMPL(MaskConvForward, ConvolutionBase, 3, 1); + +public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_in mask, _megdnn_tensor_out dst, + _megdnn_workspace worksapce) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& mask, + const TensorLayout& dst) = 0; + + void deduce_dtype(DType src, DType filter, DType mask, DType& dst); + 
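    // Editorial note (sketch): for a dense NCHW convolution with filter
    // (64, 16, 3, 3), stride 1, padding 1 and dilation 1, the canonized meta
    // built by make_canonized_filter_meta() carries group = 1, icpg = 16,
    // ocpg = 64, spatial_ndim = 2, spatial = {3, 3}, stride = {1, 1},
    // padding = {1, 1}, dilation = {1, 1} and dilated_spatial = {3, 3};
    // CanonizedFilterMeta::operator== compares exactly these fields, so the
    // struct can serve as a cache key for per-shape algorithm selection.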
void deduce_layout(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& mask, TensorLayout& dst); + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& mask, + const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using MaskConvolution = MaskConvForward; + +/** + * \brief ConvolutionForward operator. + */ +class ConvolutionForward : public ConvolutionBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(ConvolutionForward, ConvolutionBase, 2, 1); + +public: + /** + * \param[in] src (n, ic, ih, iw) + * \param[in] filter (oc, ic, fh, fw) + * \param[out] dst (n, oc, oh, ow) + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + void deduce_dtype(DType src, DType filter, DType& dst); + void deduce_layout(const TensorLayout& src, const TensorLayout& filter, + TensorLayout& dst); + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) = 0; + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using Convolution = ConvolutionForward; + +/** + * \brief ConvolutionBackwardData operator. + * + * Calculating the gradient wrt. convolution input data. + */ +class ConvolutionBackwardData + : public ConvolutionBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(ConvolutionBackwardData, ConvolutionBase, 2, 1); + +public: + /** + * \param[in] filter (oc, ic, fh, fw) + * \param[in] diff (n, oc, oh, ow) + * \param[out] grad (n, ic, ih, iw) + */ + virtual void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + + void deduce_dtype(DType filter, DType diff, DType& grad); + void deduce_layout(const TensorLayout& filter, const TensorLayout& diff, + TensorLayout& grad); + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes); +}; + +/** + * \brief ConvolutionBackwardFilter operator. + * + * Calculating the gradient wrt. convolution filter. 
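 *
 * \par Example (editorial sketch)
 * Shapes mirror the forward pass: with src (32, 16, 28, 28), a 3x3 filter,
 * unit stride and zero padding, diff is (32, 64, 26, 26) and the computed
 * grad takes the filter shape (64, 16, 3, 3). A hedged calling sketch,
 * assuming operator creation through the handle factory:
 * \code
 *   auto bwd = handle->create_operator<megdnn::ConvolutionBackwardFilter>();
 *   size_t ws_size =
 *           bwd->get_workspace_in_bytes(src_layout, diff_layout, grad_layout);
 *   // bwd->exec(src_nd, diff_nd, grad_nd, workspace);
 * \endcode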
+ */ +class ConvolutionBackwardFilter + : public ConvolutionBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(ConvolutionBackwardFilter, ConvolutionBase, 2, 1); + +public: + /** + * \param[in] src (n, ic, ih, iw) + * \param[in] diff (n, oc, oh, ow) + * \param[out] grad (oc, ic, fh, fw) + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes); +}; + +/** + * \brief ConvolutionBias operator + */ +class ConvBiasForward : public ConvolutionBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(ConvBiasForward, ConvolutionBase, 4, 1); + +public: + /** + * \param[in] src (n, ic, ih, iw) or (n, ih, iw, ic) + * \param[in] filter (oc, ic, fh, fw) or (oc, fh, fw, ic) or (oc/4, fh, fw, + * 4*ic) \param[in] bias (1, oc, 1, 1) \param[in] z same as dst \param[out] + * dst (n, oc, oh, ow) or (n, oh, ow, oc) + * + * \note if the format is NCHW_WINOGRAD, the filter layout is (alphah, + * alphaw, oc, ic) + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_in bias, _megdnn_tensor_in z, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + void deduce_dtype(DType src, DType filter, DType bias, DType z, DType& dst); + void deduce_layout(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + TensorLayout& dst); + + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst) = 0; + enum class BiasMode : uint32_t { + NO_BIAS = 0, //!< no bias + BROADCAST_CHANNEL_BIAS, //!< broadcast channel bias, [1, c, 1, 1] + BIAS //!< [N, C, H, W] + }; + + //! param for winograd algos. + struct WinogradParam { + uint32_t channel_block_size; + uint32_t output_block_size; + uint32_t tile_size; + bool operator==(const WinogradParam& rhs) const { + return channel_block_size == rhs.channel_block_size && + output_block_size == rhs.output_block_size && + tile_size == rhs.tile_size; + } + + std::string to_string() const; + }; + static constexpr WinogradParam INVALID_WINOGRAD_PARAM = {0, 0, 0}; + + struct DirectParam { + std::string to_string() const { return ""; } + }; + + struct MatmulParam { + std::string to_string() const { return ""; } + }; + + struct DefaultParam { + std::string to_string() const { return ""; } + }; + + //! get algo name, the format is ParamTrait::category:base:p.to_string() + //! \warning: base must not contain :. + template + static std::string algo_name(const std::string& base, const T& p); + /*! + * \brief parse algo_name and get WinogradParam from algo name. + * + * \param algo name string + * \return WinogradParam parsed from algo name, use pattern + * winograd:base:m:tile_size. + * + * \warning: INVALID_WINOGRAD_PARAM returns if the algo_name is not matched. 
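 *
 * \par Example (editorial note)
 * Following the pattern documented above, a name such as
 * "winograd:BASE:2:8" (BASE standing for an arbitrary base algorithm name)
 * is expected to parse into output_block_size = 2 and tile_size = 8, while
 * a name that does not match the pattern, e.g. "matmul:DEFAULT", yields
 * INVALID_WINOGRAD_PARAM. The concrete rendering of the fields is an
 * assumption based on the documented pattern only.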
 + */ + static WinogradParam parse_winograd_name(const std::string& algo_name); + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using ConvBias = ConvBiasForward; + +/** + * \brief base class for Conv - Nonline - Pooling + */ +class ConvPoolingBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(ConvPoolingBase, OperatorBase); + + /** + * \brief Param::Method: two methods to fetch the input data. + * The default method is WITH_TEXTURE_OBJ. + * If you want to use WITH_SHARED_MEM mode, + * please make sure that the total size of + * [ all of the filter kernels + a channel + * of input data + a channel of output data] + * is no larger than 38KB. + * In that case the pooling mode must not be "MAX". + */ + DEF_OPR_PARAM(ConvPooling); + +protected: + virtual void deduce_layout(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, TensorLayout& dst) = 0; + virtual void check_layout(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, TensorLayout& dst, + size_t workspace_limit_in_bytes) = 0; +}; + +class ConvPoolingForward : public ConvPoolingBase { + DEF_OPR_IMPL(ConvPoolingForward, ConvPoolingBase, 2, 1); + +public: + /** + * \param[in] src input tensor + * \param[out] dst output tensor + */ + virtual void exec(const _megdnn_in TensorND src, + const _megdnn_in TensorND filter, + const _megdnn_in TensorND bias, _megdnn_out TensorND dst, + _megdnn_out Workspace workspace) = 0; + virtual void deduce_layout(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, TensorLayout& dst) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& dst) = 0; + +protected: + virtual void check_layout(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, TensorLayout& dst, + size_t workspace_limit_in_bytes) = 0; +}; +using ConvPooling = ConvPoolingForward; + +class GroupLocalBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(GroupLocalBase, OperatorBase); + DEF_OPR_PARAM(Convolution); + +public: + using Mode = Param::Mode; + +protected: + void deduce_layout_fwd(const TensorLayout& src, const TensorLayout& filter, + TensorLayout& dst); + void check_layout_fwd(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst); +}; + +class GroupLocalForward : public GroupLocalBase { + DEF_OPR_IMPL(GroupLocalForward, GroupLocalBase, 2, 1); + +public: + /** + * \param[in] src (N, IC, IH, IW) + * \param[in] filter (G, OH, OW, IC/G, FH, FW, OC/G) + * \param[out] dst (N, OC, OH, OW) + **/ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, const TensorLayout& filter, + TensorLayout& dst) { + deduce_layout_fwd(src, filter, dst); + } + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, size_t workspace_in_bytes); +}; +using GroupLocal = GroupLocalForward; + +class GroupLocalBackwardData : public GroupLocalBase { + DEF_OPR_IMPL(GroupLocalBackwardData, GroupLocalBase, 2, 1); + +public: + virtual void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, +
_megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_in_bytes); +}; + +class GroupLocalBackwardFilter : public GroupLocalBase { + DEF_OPR_IMPL(GroupLocalBackwardFilter, GroupLocalBase, 2, 1); + +public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_in_bytes); +}; + +class Images2NeibsBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(Images2NeibsBase, OperatorBase); + DEF_OPR_PARAM(Images2Neibs); + +protected: + void deduce_layout_fwd(const TensorLayout& src, TensorLayout& dst); + void check_layout_fwd(const TensorLayout& filter, const TensorLayout& dst); +}; + +class Images2NeibsForward : public Images2NeibsBase { + DEF_OPR_IMPL(Images2NeibsForward, Images2NeibsBase, 1, 1); + +public: + /** + * \param[in] src (N, C, IH, IW) + * \param[out] dst (N, C, OH, OW, window_h, window_w) + * + * \see + * http://deeplearning.net/software/theano/library/tensor/nnet/neighbours.html + * + * \f$ dst_{n, c, oh, ow, wh, ww} = src_{n, c, ih+wh, iw+fw}\f$, + * where \f$ ih=-pad_h+oh*stride_h, iw=-pad_w+ow*stride_w\f$. + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) = 0; + void deduce_layout(const TensorLayout& src, TensorLayout& dst); + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using Images2Neibs = Images2NeibsForward; + +class Images2NeibsBackward : public Images2NeibsBase { + DEF_OPR_IMPL(Images2NeibsBackward, Images2NeibsBase, 1, 1); + +public: + /** + * \param[in] diff the backpropagated gradient wrt. dst + * \param[out] grad the backpropagated gradient wrt. 
src + */ + virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& diff, const TensorLayout& grad, + size_t workspace_in_bytes); +}; + +/** + * \brief base class for Pooling + */ +class PoolingBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(PoolingBase, OperatorBase); + DEF_OPR_PARAM(Pooling); + +public: + using Mode = Param::Mode; + +protected: + void deduce_layout_fwd(const TensorLayout& src, TensorLayout& dst); + void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); +}; + +class PoolingForward : public PoolingBase { + DEF_OPR_IMPL(PoolingForward, PoolingBase, 1, 1); + +public: + /** + * \param[in] src input tensor + * \param[out] dst output tensor + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, TensorLayout& dst); + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& dst, + size_t workspace_in_bytes); +}; + +using Pooling = PoolingForward; + +class PoolingBackward : public PoolingBase { + DEF_OPR_IMPL(PoolingBackward, PoolingBase, 3, 1); + +public: + /** + * \param[in] src the `src' parameter in PoolingForward::exec + * \param[in] dst the `dst' parameter in PoolingForward::exec + * \param[in] diff the backpropagated gradient wrt. dst + * \param[out] grad the backpropagated gradient wrt. src + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad, + size_t workspace_in_bytes); +}; + +/** + * \brief base class for Local + */ +class LocalBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(LocalBase, OperatorBase); + DEF_OPR_PARAM(Convolution); + +public: + using Mode = Param::Mode; + +protected: + void deduce_layout_fwd(const TensorLayout& src, const TensorLayout& filter, + TensorLayout& dst); + void check_layout_fwd(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst); +}; + +class LocalForward : public LocalBase { + DEF_OPR_IMPL(LocalForward, LocalBase, 2, 1); + +public: + /** + * \param[in] src (n, ic, ih, iw) + * \param[in] filter (oh, ow, ic, fh, fw, oc) + * \param[out] dst (n, oc, oh, ow) + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + /** + * \brief Deducing output tensor layouts from input tensor layouts. + * + * Be aware that the first and second dimension of `filter' are ignored + * when deducing `dst' layout. 
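 *
 * \par Example (editorial note)
 * With src (32, 16, 28, 28), a 3x3 window, unit stride and no padding, the
 * deduced dst is (32, 64, 26, 26) for filter (26, 26, 16, 3, 3, 64); the
 * leading (oh, ow) = (26, 26) entries of the filter layout are taken from
 * the deduced output spatial size rather than trusted from the filter
 * itself, which is why they are ignored here.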
 + */ + void deduce_layout(const TensorLayout& src, const TensorLayout& filter, + TensorLayout& dst); + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, size_t workspace_in_bytes); +}; +using Local = LocalForward; + +class LocalBackwardData : public LocalBase { + DEF_OPR_IMPL(LocalBackwardData, LocalBase, 2, 1); + +public: + /** + * \param[in] filter (oh, ow, ic, fh, fw, oc) + * \param[in] diff (n, oc, oh, ow) + * \param[out] grad (n, ic, ih, iw) + */ + virtual void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + + virtual size_t get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_in_bytes); +}; + +class LocalBackwardFilter : public LocalBase { + DEF_OPR_IMPL(LocalBackwardFilter, LocalBase, 2, 1); + +public: + /** + * \param[in] src (n, ic, ih, iw) + * \param[in] diff (n, oc, oh, ow) + * \param[out] grad (oh, ow, ic, fh, fw, oc) + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_in_bytes); +}; + +class BNBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(BNBase, OperatorBase); + DEF_OPR_PARAM(BN); + +protected: + void check_param(); +}; + +class BNForward : public BNBase { + DEF_OPR_IMPL(BNForward, BNBase, 6, 5); + +public: + /** + * dst[i] = gamma + * * (x[i] - estimatedMean[k]) / sqrt(epsilon + estimatedVariance[k]) + beta, + * where epsilon is a very small value that avoids division by zero. + * \param[in] src (n, c, h, w) + * \param[out] dst (n, c, h, w) + * \param[out] mean (see m_param.ParamDim) Global mean. + * \param[out] variance (see m_param.ParamDim) Global variance. + * \param[out] batch_mean (see m_param.ParamDim) + * Optionally cached intermediate mean from forward pass + * \param[out] batch_inv_variance (see m_param.ParamDim) + * Optionally cached intermediate variance from forward pass + * src and dst must have the same shape. + * src and dst must be contiguous.
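 *
 * \par Example (editorial sketch)
 * A scalar reference of the normalization above, ignoring the ParamDim
 * handling and treating mean / variance / gamma / beta as the per-channel
 * values for the element being normalized; the 1e-5f default epsilon is
 * illustrative only (requires <cmath>):
 * \code
 *   float bn_normalize(float x, float mean, float variance, float gamma,
 *                      float beta, float epsilon = 1e-5f) {
 *       return gamma * (x - mean) / std::sqrt(epsilon + variance) + beta;
 *   }
 * \endcode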
+ */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in bn_scale, + _megdnn_tensor_in bn_bias, _megdnn_tensor_inout mean, + _megdnn_tensor_inout variance, + _megdnn_tensor_out batch_mean, + _megdnn_tensor_out batch_inv_variance, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, TensorLayout& bn_scale, + TensorLayout& bn_bias, TensorLayout& mean, + TensorLayout& variance, TensorLayout& batch_mean, + TensorLayout& batch_inv_variance, TensorLayout& dst); + virtual size_t get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& bn_scale, + const TensorLayout& bn_bias, const TensorLayout& mean, + const TensorLayout& variance, const TensorLayout& batch_mean, + const TensorLayout& batch_inv_variance, + const TensorLayout& dst) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& bn_scale, + const TensorLayout& bn_bias, const TensorLayout& mean, + const TensorLayout& variance, + const TensorLayout& batch_mean, + const TensorLayout& batch_inv_variance, + const TensorLayout& dst, size_t workspace_in_bytes); +}; +using BN = BNForward; + +class BNBackward : public BNBase { + DEF_OPR_IMPL(BNBackward, BNBase, 5, 3); + +public: + /** + * \param[in] input data of forwarding propagate. + * \param[in] dy the backpropagated gradient of y. + * \param[out] dx the backpropagated gradient of x. + * \param[out] d_bn_scale, the backpropagated gradient of bn_scale. + * \param[out] d_bn_bias, the backpropagated gradient of bn_bias. + * Optionally cached intermediate results from forward pass + * \param[in] saved_batch_mean mean of the input batch. + Calculated in the forwardpropagation. + * \param[in] saved_batch_variance of the input batch. + Calculated in the forwardpropagation. + */ + virtual void exec(_megdnn_tensor_in x, _megdnn_tensor_in dy, + _megdnn_tensor_in saved_batch_mean, + _megdnn_tensor_in saved_batch_variance, + _megdnn_tensor_in bn_scale, _megdnn_tensor_out d_bn_scale, + _megdnn_tensor_out d_bn_bias, _megdnn_tensor_out dx, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes( + const TensorLayout& x, const TensorLayout& dy, + const TensorLayout& saved_batch_mean, + const TensorLayout& saved_batch_variance, + const TensorLayout& bn_scale, const TensorLayout& d_bn_scale, + const TensorLayout& d_bn_bias, const TensorLayout& dx) = 0; + +protected: + void check_exec(const TensorLayout& x, const TensorLayout& dy, + const TensorLayout& saved_batch_mean, + const TensorLayout& saved_batch_variance, + const TensorLayout& bn_scale, + const TensorLayout& d_bn_scale, + const TensorLayout& d_bn_bias, const TensorLayout& dx, + size_t workspace_in_bytes); +}; + +class LRNBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(LRNBase, OperatorBase); + DEF_OPR_PARAM(LRN); + +protected: + void check_param(); +}; + +class LRNForward : public LRNBase { + DEF_OPR_IMPL(LRNForward, LRNBase, 1, 1); + +public: + /** + * \see ImageNet Classification with Deep Convolutional Neural Networks + * \param[in] src (n, c, h, w) + * \param[out] dst (n, c, h, w) + * + * src and dst must have the same shape. + * src and dst must be contiguous. 
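 *
 * \par Example (editorial note)
 * The referenced paper defines the cross-channel normalization as
 * dst[n][c] = src[n][c] / (k + alpha * sum(src[n][c']^2))^beta, where the
 * sum runs over a window of adjacent channels c' centred on c; the mapping
 * of k, alpha, beta and the window size onto this operator's param() fields
 * is assumed here rather than taken from the header.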
+ */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, TensorLayout& dst); + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using LRN = LRNForward; + +class LRNBackward : public LRNBase { + DEF_OPR_IMPL(LRNBackward, LRNBase, 3, 1); + +public: + /** + * \param[in] src the `src' parameter in LRNForward::exec + * \param[in] dst the `dst' parameter in LRNForward::exec + * \param[in] diff the backpropagated gradient wrt. dst + * \param[out] grad the backpropagated gradient wrt. src + * + * All tensors should be contiguous and of the same shape. + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad, + size_t workspace_in_bytes); +}; + +class ROIPoolingBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(ROIPoolingBase, OperatorBase); + DEF_OPR_PARAM(ROIPooling); + +protected: + void check_layout_fwd(const TensorLayout& src, const TensorLayout& rois, + const TensorLayout& dst, const TensorLayout& index); +}; + +class ROIPoolingForward : public ROIPoolingBase { + DEF_OPR_IMPL(ROIPoolingForward, ROIPoolingBase, 2, 2); + +public: + /** + * \param[in] src (n, c, ih, iw) + * \param[in] rois (m, 5) + * \param[out] dst (m, c, oh, ow) + * \param[out] index (m, c, oh, ow) if mode is MAX, (0) if mode is AVERAGE + * + * The internal implementation is akin to + * https://github.com/rbgirshick/caffe-fast-rcnn .d + * Note that rois(, 0) denotes the input image index. We store it as + * a float, but it should be an integer instead. + * + * index is a temporary tensor to facilitate its backward operator. + * It is used to store argmax indicex in MAX mode, and it is not used + * in AVERAGE mode. + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in rois, + _megdnn_tensor_out dst, _megdnn_tensor_out index, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& rois, + const TensorLayout& dst, + const TensorLayout& index) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& rois, + const TensorLayout& dst, const TensorLayout& index, + size_t workspace_in_bytes); +}; +using ROIPooling = ROIPoolingForward; + +class ROIPoolingBackward : public ROIPoolingBase { + DEF_OPR_IMPL(ROIPoolingBackward, ROIPoolingBase, 4, 1); + +public: + /** + * \param[in] diff the backpropagated gradient wrt. dst + * \param[in] src the `src' parameter in ROIPoolingForward::exec + * \param[in] rois the `rois' parameter in ROIPoolingForward::exec + * \param[in] index the `index' parameter in ROIPoolingForward::exec + * \param[out] grad the backpropagated gradient wrt. 
src + */ + virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_in src, + _megdnn_tensor_in rois, _megdnn_tensor_in index, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& diff, + const TensorLayout& src, + const TensorLayout& rois, + const TensorLayout& index, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& diff, const TensorLayout& src, + const TensorLayout& rois, const TensorLayout& index, + const TensorLayout& grad, size_t workspace_in_bytes); +}; + +class Convolution3DBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(Convolution3DBase, OperatorBase); + DEF_OPR_PARAM(Convolution3D); + +public: + static constexpr size_t MAX_SPATIAL_DIM = 3; + using Mode = Param::Mode; + struct CanonizedFilterMeta { + DTypeEnum dtype_enum; + Param::Format format; + uint32_t + //! whether filter should be flipped (i.e. is CONVOLUTION) + should_flip, + group, //!< number of groups + icpg, //!< input channels per group + ocpg, //!< output channels per group + spatial_ndim, stride[MAX_SPATIAL_DIM], padding[MAX_SPATIAL_DIM], + //! spatial dim + spatial[MAX_SPATIAL_DIM], dilation[MAX_SPATIAL_DIM], + //! spatial dim with dilation applied + dilated_spatial[MAX_SPATIAL_DIM]; + } MEGDNN_PACKED; + +protected: + CanonizedFilterMeta deduce_layout_fwd(const TensorLayout& src, + const TensorLayout& filter, + TensorLayout& dst) const; + CanonizedFilterMeta check_layout_fwd(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) const; + + CanonizedFilterMeta make_canonized_filter_meta( + size_t src_ndim, const TensorLayout& filter) const; +}; + +class Convolution3DForward + : public Convolution3DBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(Convolution3DForward, Convolution3DBase, 2, 1); + +public: + /** + * \param[in] src (n, ic, id, ih, iw) + * \param[in] filter (oc, ic, fd, fh, fw) + * \param[out] dst (n, oc, od, oh, ow) + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, const TensorLayout& filter, + TensorLayout& dst); + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) = 0; + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using Convolution3D = Convolution3DForward; + +class Convolution3DBackwardData + : public Convolution3DBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(Convolution3DBackwardData, Convolution3DBase, 2, 1); + +public: + /** + * \param[in] filter (oc, ic, fd, fh, fw) + * \param[in] diff (n, oc, od, oh, ow) + * \param[out] grad (n, ic, id, ih, iw) + */ + virtual void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + + void deduce_layout(const TensorLayout& filter, const TensorLayout& diff, + TensorLayout& grad); + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes); +}; + +class Convolution3DBackwardFilter + : public Convolution3DBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(Convolution3DBackwardFilter, Convolution3DBase, 2, 1); + +public: + 
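    // Editorial note (sketch): the 3D convolution family above follows the
    // usual output-size arithmetic; with dilation 1,
    //   od = (id + 2 * pad_d - fd) / stride_d + 1
    // and likewise for oh and ow. The grad computed below has the 5-D filter
    // shape (oc, ic, fd, fh, fw).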
/** + * \param[in] src (n, ic, id, ih, iw) + * \param[in] diff (n, oc, od, oh, ow) + * \param[out] grad (oc, ic, fd, fh, fw) + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes); +}; + +class LocalShareBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(LocalShareBase, OperatorBase); + DEF_OPR_PARAM(LocalShare); + +protected: + void deduce_layout_fwd(const TensorLayout& src, const TensorLayout& filter, + TensorLayout& dst); + void check_layout_fwd(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst); +}; + +class LocalShareForward : public LocalShareBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(LocalShareForward, LocalShareBase, 2, 1); + +public: + /** + * \param[in] src (N, IC, IH, IW) + * \param[in] filter (G, spatial_groups_h, spatial_groups_w, IC / G, + * FH, FW, OC / G) + * \param[out] dst (N, OC, OH, OW) + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + /** + * \brief deduce layout of the ouput tensor + */ + void deduce_layout(const TensorLayout& src, const TensorLayout& filter, + TensorLayout& dst); + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, size_t workspace_in_bytes); +}; +using LocalShare = LocalShareForward; + +class LocalShareBackwardData + : public LocalShareBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(LocalShareBackwardData, LocalShareBase, 2, 1); + +public: + /** + * \param[in] filter (G, spatial_groups_h, spatial_groups_w, IC / G, + * FH, FW, OC / G) + * \param[in] diff (N, OC, OH, OW) + * \param[out] grad (N, IC, IH, IW) + */ + virtual void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + void deduce_layout(const TensorLayout& filter, const TensorLayout& diff, + TensorLayout& grad); + +protected: + void check_exec(const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_in_bytes); +}; + +class LocalShareBackwardFilter + : public LocalShareBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(LocalShareBackwardFilter, LocalShareBase, 2, 1); + +public: + /** + * \param[in] src (N, IC, IH, IW) + * \param[in] diff (N, OC, OH, OW) + * \param[out] grad (G, spatial_groups_h, spatial_groups_w, IC / G, + * FH, FW, OC / G) + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; + + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_in_bytes); +}; + +class ROIAlignBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(ROIAlignBase, OperatorBase); + DEF_OPR_PARAM(ROIAlign); + +protected: + void 
deduce_layout_fwd(const TensorLayout& src, const TensorLayout& rois, + TensorLayout& dst, TensorLayout& index); + void check_layout_fwd(const TensorLayout& src, const TensorLayout& rois, + const TensorLayout& dst, const TensorLayout& index); +}; + +class ROIAlignForward : public ROIAlignBase { + DEF_OPR_IMPL(ROIAlignForward, ROIAlignBase, 2, 2); + +public: + /** + * \param[in] src (n, c, ih, iw) + * \param[in] rois (m, 5) + * \param[out] dst (m, c, oh, ow) + * \param[out] index (m, c, oh, ow) if mode is MAX, (0) if mode is AVERAGE + * + * Note that rois(, 0) denotes the input image index. We store it as + * a float, but it should be an integer instead. + * + * index is a temporary tensor to facilitate its backward operator. + * It is used to store argmax indicex in MAX mode, and it is not used + * in AVERAGE mode. + */ + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in rois, + _megdnn_tensor_out dst, _megdnn_tensor_out index, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& src, const TensorLayout& rois, + TensorLayout& dst, TensorLayout& index); + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& rois, + const TensorLayout& dst, + const TensorLayout& index) = 0; + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& rois, + const TensorLayout& dst, const TensorLayout& index, + size_t workspace_in_bytes); +}; +using ROIAlign = ROIAlignForward; + +class ROIAlignBackward : public ROIAlignBase { + DEF_OPR_IMPL(ROIAlignBackward, ROIAlignBase, 3, 1); + +public: + /** + * \param[in] diff the backpropagated gradient wrt. dst + * \param[in] rois the `rois' parameter in ROIAlignForward::exec + * \param[in] index the `index' parameter in ROIAlignForward::exec + * \param[out] grad the backpropagated gradient wrt. 
src + */ + virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_in rois, + _megdnn_tensor_in index, _megdnn_tensor_out grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& diff, + const TensorLayout& rois, + const TensorLayout& index, + const TensorLayout& grad) = 0; + +protected: + void check_exec(const TensorLayout& diff, const TensorLayout& rois, + const TensorLayout& index, const TensorLayout& grad, + size_t workspace_in_bytes); +}; + +class DeformableConvBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(DeformableConvBase, OperatorBase); + DEF_OPR_PARAM(Convolution); + +public: + static constexpr size_t MAX_SPATIAL_DIM = 2; + struct CanonizedFilterMeta : Convolution::CanonizedFilterMeta { + uint32_t deformable_group; + }; + +protected: + CanonizedFilterMeta make_canonized_filter_meta( + size_t src_ndim, const TensorLayout& filter, + const TensorLayout& offset) const; + void deduce_layout_fwd(const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& mask, const TensorLayout& offset, + TensorLayout& dst); + void check_layout_fwd(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& mask, const TensorLayout& offset, + const TensorLayout& dst); +}; + +class DeformableConvForward + : public DeformableConvBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(DeformableConvForward, DeformableConvBase, 4, 1); + +public: + /** + * \param[in] im (n, ic, ih, iw) + * \param[in] filter (oc, ic, fh, fw) + * \param[in] offset (dg, 2, fh, fw, oh, ow) + * \param[in] mask (dg, fh, fw, oh, ow) + * \param[out] dst (n, oc, oh, ow) + */ + virtual void exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, + _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + TensorLayout& dst); + virtual size_t get_workspace_in_bytes(const TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst) = 0; + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using DeformableConv = DeformableConvForward; + +/** + * \brief DeformableConvBackwardFilter operator. + * + * Calculating the gradient wrt. convolution filter. 
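 *
 * \par Example (editorial note)
 * Reusing the forward shapes documented above: with im (32, 16, 28, 28), a
 * 3x3 filter, one deformable group and an output of 26x26, offset is
 * (1, 2, 3, 3, 26, 26) (one (dy, dx) pair per kernel tap and output
 * position), mask is (1, 3, 3, 26, 26), out_grad is (32, 64, 26, 26) and
 * the resulting filter_grad has the filter shape (64, 16, 3, 3).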
+ */ +class DeformableConvBackwardFilter + : public DeformableConvBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(DeformableConvBackwardFilter, DeformableConvBase, 4, 1); + +public: + /** + * \param[in] im (oc, ic, fh, fw) + * \param[in] offset (dg, 2, fh, fw, oh, ow) + * \param[in] mask (dg, fh, fw, oh, ow) + * \param[in] out_grad (n, oc, oh, ow) + * \param[out] filter_grad (oc, ic, ih, iw) + */ + virtual void exec(_megdnn_tensor_in im, _megdnn_tensor_in offset, + _megdnn_tensor_in mask, _megdnn_tensor_in out_grad, + _megdnn_tensor_out filter_grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& im, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& out_grad, + const TensorLayout& filter_grad) = 0; + void deduce_layout(const TensorLayout& im, const TensorLayout& offset, + const TensorLayout& mask, const TensorLayout& out_grad, + TensorLayout& filter_grad); + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& im, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& out_grad, + const TensorLayout& filter_grad, + size_t workspace_in_bytes); +}; + +/** + * \brief DeformableConvBackwardData operator. + * + * Calculating the gradient wrt. convolution input data, offset and mask. + */ +class DeformableConvBackwardData + : public DeformableConvBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(DeformableConvBackwardData, DeformableConvBase, 5, 3); + +public: + /** + * \param[in] im (oc, ic, fh, fw) + * \param[in] filter (oc, ic, fh, fw) + * \param[in] offset (dg, 2, fh, fw, oh, ow) + * \param[in] mask (dg, fh, fw, oh, ow) + * \param[in] out_grad (n, oc, oh, ow) + * \param[out] im_grad (n, ic, ih, iw) + * \param[out] offset_grad (dg, 2, fh, fw, oh, ow) + * \param[out] mask_grad (dg, fh, fw, oh, ow) + */ + virtual void exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, + _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_in out_grad, _megdnn_tensor_out im_grad, + _megdnn_tensor_out offset_grad, + _megdnn_tensor_out mask_grad, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad) = 0; + void deduce_layout(const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, TensorLayout& im_grad, + TensorLayout& offset_grad, TensorLayout& mask_grad); + +protected: + CanonizedFilterMeta check_exec( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad, + size_t workspace_in_bytes); +}; + +class DeformablePSROIPoolingBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(DeformablePSROIPoolingBase, OperatorBase); + DEF_OPR_PARAM(DeformablePSROIPooling); + +protected: + void deduce_layout_fwd(const TensorLayout& data, const TensorLayout& trans, + const TensorLayout& rois, TensorLayout& out_data, + TensorLayout& out_count); + + void check_layout_fwd(const TensorLayout& data, const TensorLayout& trans, + const TensorLayout& rois, + const TensorLayout& out_data, + const TensorLayout& out_count, + size_t workspace_in_bytes); +}; + +class DeformablePSROIPoolingForward : 
public DeformablePSROIPoolingBase { + DEF_OPR_IMPL(DeformablePSROIPoolingForward, DeformablePSROIPoolingBase, 3, + 2); + +public: + /** + * \param[in] data (oc, ic, ih, iw) + * \param[in] rois (xx, xx, xx, xx) + * \param[in] trans (oc, ic, fh, fw) + * \param[out] out_data ( n, ic, ih, iw) + * \param[out] out_count ( n, ic, ih, iw) + */ + virtual size_t get_workspace_in_bytes(const TensorLayout& data, + const TensorLayout& rois, + const TensorLayout& trans, + const TensorLayout& out_data, + const TensorLayout& out_count) = 0; + virtual void exec(_megdnn_tensor_in data, _megdnn_tensor_in rois, + _megdnn_tensor_in trans, _megdnn_tensor_out out_data, + _megdnn_tensor_out out_count, + _megdnn_workspace workspace) = 0; + void deduce_layout(const TensorLayout& data, const TensorLayout& rois, + const TensorLayout& trans, TensorLayout& out_data, + TensorLayout& out_count); + void check_exec(const TensorLayout& data, const TensorLayout& rois, + const TensorLayout& trans, const TensorLayout& out_data, + const TensorLayout& out_count, size_t workspace_in_bytes); +}; + +using DeformablePSROIPooling = DeformablePSROIPoolingForward; + +class DeformablePSROIPoolingBackward : public DeformablePSROIPoolingBase { + DEF_OPR_IMPL(DeformablePSROIPoolingBackward, DeformablePSROIPoolingBase, 5, + 2); + +public: + /** + * \param[in] data (oc, ic, ih, iw) + * \param[in] rois (xx, xx, xx, xx) + * \param[in] trans (oc, ic, fh, fw) + * \param[in] out_diff (xx, xx, xx, xx) + * \param[in] out_count (xx, xx, xx, xx) + * \param[out] data_diff ( n, ic, ih, iw) + * \param[out] trans_diff ( n, ic, ih, iw) + */ + virtual void exec(_megdnn_tensor_in data, _megdnn_tensor_in rois, + _megdnn_tensor_in trans, _megdnn_tensor_in out_diff, + _megdnn_tensor_in out_count, _megdnn_tensor_out data_diff, + _megdnn_tensor_out trans_diff, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout& data, + const TensorLayout& rois, + const TensorLayout& trans, + const TensorLayout& out_diff, + const TensorLayout& out_count, + const TensorLayout& data_diff, + const TensorLayout& trans_diff) = 0; + + void check_exec(const TensorLayout& data, const TensorLayout& rois, + const TensorLayout& trans, const TensorLayout& out_diff, + const TensorLayout& out_count, + const TensorLayout& data_diff, + const TensorLayout& trans_diff, size_t workspace_in_bytes); +}; + +class BatchConvBiasForward + : public ConvolutionBase, + public detail::MultiAlgoOpr { + DEF_OPR_IMPL(BatchConvBiasForward, ConvolutionBase, 4, 1); + +public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_in bias, _megdnn_tensor_in z, + _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; + + void deduce_dtype(DType src, DType filter, DType bias, DType z, DType& dst); + void deduce_layout(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + TensorLayout& dst); + + virtual size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst) = 0; + +protected: + CanonizedFilterMeta check_exec(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst, + size_t workspace_in_bytes); +}; +using BatchConvBias = BatchConvBiasForward; + +} // namespace megdnn +#include "megdnn/internal/opr_header_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/oprs/nn_int.h 
b/dnn/include/megdnn/oprs/nn_int.h new file mode 100644 index 00000000..19fe69e1 --- /dev/null +++ b/dnn/include/megdnn/oprs/nn_int.h @@ -0,0 +1,70 @@ +/** + * \file dnn/include/megdnn/oprs/nn_int.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/internal/opr_header_prologue.h" + +namespace megdnn { + +/*! + * \brief element-wise operator that allows input/output vars to have different + * data types + * + * The data types are typically different int types. + */ +class ElemwiseMultiType : public OperatorBase { + DEF_OPR_PARAM(ElemwiseMultiType); + DEF_OPR_IMPL(ElemwiseMultiType, OperatorBase, -1, 1); + + //! check dtype function + using CheckDtypeFunc = thin_function; + //! check the dtype if is_check is true, otherwise setup dtype. + using SetOrCheckDtypeFunc = thin_function; + +public: + using Mode = Param::Mode; + static constexpr size_t MAX_ARITY = 6; + + //! information about a mode + struct ModeTrait { + uint32_t arity = 0; //!< number of inputs needed + CheckDtypeFunc check_inp[MAX_ARITY]; + SetOrCheckDtypeFunc check_out; //!< dtype of output var + bool need_specify_out_dtype = + false; //!< the dtype should be setup externally, otherwise + //!< would be inferred by check_out(dtype, false) + const char* name = nullptr; //!< name of the mode + + //! get trait from a mode; this function is thread safe + static const ModeTrait& from_mode(Mode mode); + }; + + virtual void exec(_megdnn_in const TensorNDArray& src, + _megdnn_tensor_out dst) = 0; + + //! get trait of current mode + const ModeTrait& mode_trait() const { + return ModeTrait::from_mode(m_param.mode); + } + + //! deduce output layout + void deduce_layout(const TensorLayoutArray& src, TensorLayout& dst); + +protected: + //! throw exception if incorrect layout; broadcast input shape to + //! output shape + void check_layout_and_broadcast(const TensorLayoutPtrArray& src, + const TensorLayout& dst); +}; + +} // namespace megdnn + +#include "megdnn/internal/opr_header_epilogue.h" +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/oprs/utils.h b/dnn/include/megdnn/oprs/utils.h new file mode 100644 index 00000000..03957fd2 --- /dev/null +++ b/dnn/include/megdnn/oprs/utils.h @@ -0,0 +1,121 @@ +/** + * \file dnn/include/megdnn/oprs/utils.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/internal/opr_header_prologue.h" + +namespace megdnn { + +//! base class for random number generators +class RNGBase: public OperatorBase { + DEF_OPR_IMPL_CTOR(RNGBase, OperatorBase); + public: + virtual void exec(_megdnn_tensor_out dst, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes(const TensorLayout &dst) = 0; + protected: + void check_exec(const TensorLayout &dst, size_t workspace_in_bytes); +}; + +//! 
sample from uniform distribution on the interval (0, 1] +class UniformRNG: public RNGBase { + DEF_OPR_IMPL(UniformRNG, RNGBase, 0, 1); + DEF_OPR_PARAM(UniformRNG); +}; + +//! sample from gaussian distribution +class GaussianRNG: public RNGBase { + DEF_OPR_IMPL(GaussianRNG, RNGBase, 0, 1); + DEF_OPR_PARAM(GaussianRNG); +}; + +/*! + * \brief sleep for specific time on the computing device; useful for testing + * async problems + */ +class SleepForward: public OperatorBase { + DEF_OPR_IMPL(SleepForward, OperatorBase, 0, 0); + DEF_OPR_PARAM(Sleep); + + public: + virtual void exec() = 0; +}; +using Sleep = SleepForward; + +/*! + * \brief calculating checksum of a tensor + * + * data must be a one-dimensional contiguous tensor with dtype byte + */ +class ChecksumForward: public OperatorBase { + DEF_OPR_PARAM(Empty); + DEF_OPR_IMPL(ChecksumForward, OperatorBase, 0, 1); + + public: + using Result = opr_result::Checksum; + + virtual size_t get_workspace_in_bytes(const TensorLayout &data) = 0; + + virtual Result exec(_megdnn_tensor_in data, + _megdnn_workspace workspace) = 0; + + protected: + void check_exec(const TensorLayout &layout, size_t workspace_in_bytes); +}; +using Checksum = ChecksumForward; + +/*! + * \brief calculating max absolute difference of the two input tensors + * + * src1 and src2 must be a one-dimensional contiguous tensor. + */ +class MaxTensorDiff : public OperatorBase { + DEF_OPR_PARAM(Empty); + DEF_OPR_IMPL(MaxTensorDiff, OperatorBase, 0, 2); + + public: + virtual size_t get_workspace_in_bytes(const TensorLayout& layout1, + const TensorLayout& layout2) = 0; + + virtual float exec(_megdnn_tensor_in src1, _megdnn_tensor_in src2, + _megdnn_workspace workspace) = 0; + + protected: + void check_exec(const TensorLayout& layout1, + const TensorLayout& layout2, size_t workspace_in_bytes); +}; + +/*! + * \brief winograd preprocess opr. + * + * for the detail \see src/fallback/conv_bias/winograd/winograd.h + * + */ +class WinogradFilterPreprocess : public OperatorBase { + DEF_OPR_PARAM(Winograd); + DEF_OPR_IMPL(WinogradFilterPreprocess, OperatorBase, 1, 1); + +public: + virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace) = 0; + + size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&); + + void deduce_layout(const TensorLayout& src, TensorLayout& dst); + +protected: + void check_exec(const TensorLayout& src, const TensorLayout& dst, + size_t workspace_in_bytes); +}; +} // namespace megdnn + +#include "megdnn/internal/opr_header_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/tensor_format.h b/dnn/include/megdnn/tensor_format.h new file mode 100644 index 00000000..46347f3b --- /dev/null +++ b/dnn/include/megdnn/tensor_format.h @@ -0,0 +1,227 @@ +/** + * \file dnn/include/megdnn/tensor_format.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once + +#include "megdnn/basic_types.h" + +#include "megdnn/internal/visibility_prologue.h" +namespace megdnn { + +enum class TensorFormat::Type { + DEFAULT = 0, //!< see DefaultTensorFormat + IMAGE2D_PACK4 = 1, //!< see Image2DPack4TensorFormat +}; + +class TensorFormat::ImplBase { +public: + using Type = TensorFormat::Type; + + virtual size_t init_contiguous_stride(TensorLayout& layout) const = 0; + + virtual bool is_contiguous_spec(const TensorLayout& layout) const = 0; + + virtual TensorLayout collapse_contiguous_spec( + const TensorLayout& layout) const = 0; + + virtual TensorLayout::Span span_spec(const TensorLayout& layout) const = 0; + + //! a human-readable string description of this TensorFormat + virtual std::string to_string() const = 0; + + virtual void serialize_append(std::string& result) const = 0; + + Type type() const { return m_type; } + +protected: + ImplBase(Type type) : m_type{type} {} + ~ImplBase() = default; + + static TensorFormat impl_to_tensor_format(ImplBase* impl) { return {impl}; } + +private: + Type m_type; +}; + +TensorFormat::Type TensorFormat::type() const { + return m_impl->type(); +} + +//! default tensor format that imposes no stride constraints +class DefaultTensorFormat final : public TensorFormat::ImplBase { +public: + static constexpr Type TYPE = Type::DEFAULT; + + DefaultTensorFormat() : ImplBase(TYPE) {} + + size_t init_contiguous_stride(TensorLayout& layout) const override; + + /*! + * \brief A tensor is contiguous if logical offset in row-major of any + * element always equals to its physical offset (i.e. offset considering + * strides). + * + * Empty tensors are not considered to be contiguous. + */ + bool is_contiguous_spec(const TensorLayout& layout) const override; + + TensorLayout collapse_contiguous_spec( + const TensorLayout& layout) const override; + + TensorLayout::Span span_spec(const TensorLayout& layout) const override; + + std::string to_string() const override; + void serialize_append(std::string& result) const override; + + static TensorFormat make(); + static TensorFormat deserialize(const Handle* handle, const void* buf, + size_t size); +}; + +namespace detail { + +/*! + * \brief 2D image with requirement on row stride + * + * \p align_axis is the axis to be aligned, also the first axis of image width. + * More precisely speaking, `stride[align_axis-1] * dtype.size()` must divide \p + * align_size_in_byte. Axes from 0 to align_axis-1 would be considered as the + * height of the image, and other axes are the width. + * + * Empty tensors and negative strides are not allowed. Only contiguous or + * broadcasted cases are allowed. + * + * Note: if `stride[align_axis - 1]` is larger than minimal value, it is still + * considered as contiguous. + */ +class Image2DTensorFormatBase : public TensorFormat::ImplBase { + size_t m_align_axis, m_align_size_in_byte_log2; + +protected: + Image2DTensorFormatBase(Type type, size_t align_axis, + size_t align_size_in_byte); + ~Image2DTensorFormatBase() = default; + +public: + /*! + * \brief get alignment requirement in bytes + * \param div_log2 the result would be divided by `(1 << div_log2)` + */ + size_t align_size_in_byte(size_t div_log2 = 0) const { + return 1 << (m_align_size_in_byte_log2 > div_log2 + ? 
m_align_size_in_byte_log2 - div_log2 + : 0); + } + + size_t align_axis() const { return m_align_axis; } + + size_t init_contiguous_stride(TensorLayout& layout) const override; + + bool is_contiguous_spec(const TensorLayout& layout) const override; + + TensorLayout collapse_contiguous_spec( + const TensorLayout& layout) const override; + + //! span for image must include the padding at the last row + TensorLayout::Span span_spec(const TensorLayout& layout) const override; + + std::string to_string() const override; + + //! raise exception if preconditions violated + virtual void assert_valid(const TensorLayout& layout) const; + + //! modify the align axis and return a new TensorFormat + virtual TensorFormat change_axis(size_t axis) const = 0; + + //! number of dtype elems in each row, considering strides + size_t image_width_elems(const TensorLayout& layout) const; + + //! number of rows + size_t image_height(const TensorLayout& layout) const; + + //! delta of addresses of consecutive rows (in bytes) + size_t image_row_pitch(const TensorLayout& layout) const; + + void serialize_append(std::string& result) const override; +protected: + struct SerializePack { + uint8_t align_axis; + }; +}; + +template +class Image2DPackedTensorFormatBase : public Image2DTensorFormatBase { +protected: + using Image2DTensorFormatBase::Image2DTensorFormatBase; + ~Image2DPackedTensorFormatBase() = default; + +public: + /*! + * \brief image width in logical pixels exclude padding + * + * It is the number of accessible elems (in dtype) divided by PIXEL_SIZE. + * + * \see image_row_pitch() + */ + size_t image_width(const TensorLayout& layout) const; + + void assert_valid(const TensorLayout& layout) const override; +}; +using Image2DPack4TensorFormatBase = Image2DPackedTensorFormatBase<4>; +} // namespace detail + +/*! + * \brief 2D image that requires stride of width to be aligned, and pack 4 elems + * into a pixel + * + * This is used for OpenCL. + */ +class Image2DPack4TensorFormat final + : public detail::Image2DPack4TensorFormatBase { +public: + static constexpr Type TYPE = Type::IMAGE2D_PACK4; + + //! for internal usage or test purposes + static TensorFormat make_raw(size_t align_axis, size_t align_size_in_byte); + + static TensorFormat make(size_t align_axis, const Handle* handle); + + /*! + * \brief deserialize on a handle + * + * Note that the alignment may be different if deserialized on another + * handle + */ + static TensorFormat deserialize(const Handle* handle, const void* buf, + size_t size); + + static bool is_valid_image(const TensorLayout& layout) { + if (layout.format.type() == TYPE) { + layout.format.as_impl().assert_valid( + layout); + return true; + } + return false; + } + + TensorFormat change_axis(size_t axis) const override; + +private: + Image2DPack4TensorFormat(size_t align_axis, size_t align_size_in_byte) + : detail::Image2DPack4TensorFormatBase(TYPE, align_axis, + align_size_in_byte) {} +}; + +} // namespace megdnn + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/tensor_iter.h b/dnn/include/megdnn/tensor_iter.h new file mode 100644 index 00000000..e6d7cd2f --- /dev/null +++ b/dnn/include/megdnn/tensor_iter.h @@ -0,0 +1,199 @@ +/** + * \file dnn/include/megdnn/tensor_iter.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/basic_types.h" +#include "megdnn/dtype.h" +#include "megdnn/internal/visibility_prologue.h" + +namespace megdnn { + +template +class TypeRef { +public: + using dtype = T&; + static T& get(T* _ptr, size_t _offset) { + T& ret = _ptr[_offset]; + return ret; + } +}; + +template <> +class TypeRef { +private: + uint8_t* ptr = nullptr; + size_t offset = 0; + +public: + using dtype = TypeRef; + dt_quint4 val = dt_quint4(0); + + TypeRef(dt_quint4* _ptr, size_t _offset); + + void operator=(const uint8_t _); + void operator=(const dt_quint4& _) { *this = _.as_uint8(); } + void operator=(const TypeRef& _) { *this = _.val.as_uint8(); } + operator dt_quint4() const { return val; } + operator uint8_t() const { return val.as_uint8(); } + + static TypeRef get(dt_quint4* _ptr, size_t _offset) { + return TypeRef(_ptr, _offset); + } +}; + +template <> +class TypeRef { +private: + int8_t* ptr = nullptr; + size_t offset = 0; + +public: + using dtype = TypeRef; + dt_qint4 val = dt_qint4(0); + TypeRef(dt_qint4* _ptr, size_t _offset); + + void operator=(const int8_t _); + void operator=(const dt_qint4& _) { *this = _.as_int8(); } + void operator=(const TypeRef& _) { *this = _.val.as_int8(); } + operator dt_qint4() const { return val; } + operator int8_t() const { return val.as_int8(); } + + static TypeRef get(dt_qint4* _ptr, size_t _offset) { + return TypeRef(_ptr, _offset); + } +}; + +/*! + * \brief helper for iterating on a tensor with arbitrary layout + * \tparam ctype tensor element plain data type + * \tparam valonly whether only value is needed (so logical index does not need + * to be maintained) + */ +template +class TensorIter { + TensorND m_tensor; + +public: + class Iter { + MEGDNN_NORETURN void on_access_idx_valonly_true() const; + + ctype* m_ptr = nullptr; + + TensorLayout m_layout; + + ptrdiff_t m_axis_reset_stride[TensorShape::MAX_NDIM], + m_offset = 0; //!< physical offset in buffer + + //! offset in each axis + size_t m_axis_offset[TensorShape::MAX_NDIM], + m_logical_offset = 0, //!< contiguous logical offset + m_tot_nr_elems = 0; //!< tot elems (max logical offset) + + public: + Iter() { + memset(m_axis_reset_stride, 0, sizeof(m_axis_reset_stride)); + memset(m_axis_offset, 0, sizeof(m_axis_offset)); + } + + /*! + * \brief create an iterator + */ + static Iter make(ctype* ptr, const TensorLayout& layout, size_t offset); + + static Iter make(TensorND& t, size_t offset) { + return make(t.ptr(), t.layout, offset); + } + + //! access element without boundary check + typename TypeRef::dtype operator*() { + return TypeRef::get(m_ptr, m_offset); + }; + + Iter& operator++() { + if ((++m_logical_offset) == m_tot_nr_elems) + return *this; + auto mem_offset = m_offset; + for (int axis = m_layout.ndim - 1;; axis--) { + size_t& ax_offset = ++m_axis_offset[axis]; + if (ax_offset < m_layout.shape[axis]) { + mem_offset += m_layout.stride[axis]; + break; + } else { + ax_offset = 0; + mem_offset -= m_axis_reset_stride[axis]; + } + } + m_offset = mem_offset; + return *this; + } + + //! whether current value valid + bool valid() const { return m_logical_offset < m_tot_nr_elems; } + + //! whether current pos is at end of buffer + bool at_end() const { return m_logical_offset == m_tot_nr_elems; } + + //! 
get logical index; valonly must be false + const size_t* idx() const { + if (valonly) + on_access_idx_valonly_true(); + return m_axis_offset; + } + + /*! + * \brief memory address offset, measured in number of elements + */ + size_t offset() const { return m_offset; } + + /*! + * \brief number of elements from first element + */ + size_t logical_offset() const { return m_logical_offset; } + + bool operator!=(const Iter& rhs) const { + return m_logical_offset != rhs.m_logical_offset; + } + }; + TensorIter() = default; + + TensorIter(const TensorND& tensor) : m_tensor(tensor) {} + + Iter begin() const { + return Iter::make(const_cast(m_tensor), 0); + } + + Iter end() const { + return Iter::make(const_cast(m_tensor), + m_tensor.layout.total_nr_elems()); + } +}; +/*! + * \brief iterate over elements of a tensor; only access tensor value + */ +template +TensorIter tensor_iter_valonly(const TensorND& t) { + return {t}; +} + +/*! + * \brief iterate over elements of a tensor, retaining logical index + */ +template +TensorIter tensor_iter(const TensorND& t) { + return {t}; +} + +} // namespace megdnn + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/include/megdnn/thin/function.h b/dnn/include/megdnn/thin/function.h new file mode 100644 index 00000000..632fd27c --- /dev/null +++ b/dnn/include/megdnn/thin/function.h @@ -0,0 +1,30 @@ +/** + * \file dnn/include/megdnn/thin/function.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "megdnn/internal/visibility_prologue.h" + +namespace megdnn { +template +using thin_function = ::std::function; + +} // namespace megdnn + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/include/megdnn/thin/small_vector.h b/dnn/include/megdnn/thin/small_vector.h new file mode 100644 index 00000000..338d2466 --- /dev/null +++ b/dnn/include/megdnn/thin/small_vector.h @@ -0,0 +1,917 @@ +/** + * \file dnn/include/megdnn/thin/small_vector.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the SmallVector class. +// +//===----------------------------------------------------------------------===// +/** + * \file include/megdnn/thin/small_vector.h + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \brief thin megdnn function + * + * \copyright Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved. + */ +#pragma once + +#include "megdnn/arch.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "megdnn/internal/visibility_prologue.h" + +namespace megdnn { + +class SmallVectorBase { +protected: + void *m_begin_ptr, *m_end_ptr, *m_capacity_ptr; + + MEGDNN_NORETURN static void on_invalid_at(size_t idx, size_t size); + +protected: + SmallVectorBase(void* first_elm, size_t size) + : m_begin_ptr(first_elm), + m_end_ptr(first_elm), + m_capacity_ptr(static_cast(first_elm) + size) {} + + void grow_pod(void* first_elm_ptr, size_t min_sz_in_bytes, + size_t type_size); + +public: + size_t size_in_bytes() const { + return size_t(static_cast(m_end_ptr) - + static_cast(m_begin_ptr)); + } + + size_t capacity_in_bytes() const { + return size_t(static_cast(m_capacity_ptr) - + static_cast(m_begin_ptr)); + } + + bool empty() const { return m_begin_ptr == m_end_ptr; } +}; +template +class SmallVectorTemplateCommon : public SmallVectorBase { +private: + template + friend struct SmallVectorStorage; + + using U = typename std::aligned_storage::type; + + U m_first_elm; + +protected: + SmallVectorTemplateCommon(size_t size) + : SmallVectorBase(&m_first_elm, size) {} + + void grow_pod(size_t min_sz_in_bytes, size_t type_size) { + SmallVectorBase::grow_pod(&m_first_elm, min_sz_in_bytes, type_size); + } + + bool is_small() { + return m_begin_ptr == static_cast(&m_first_elm); + } + + void reset_to_small() { + m_begin_ptr = m_end_ptr = m_capacity_ptr = &m_first_elm; + } + + void set_end(T* p) { m_end_ptr = p; } + +public: + using size_type = size_t; + using difference_type = std::ptrdiff_t; + using value_type = T; + using iterator = T*; + using const_iterator = const T*; + + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + + using reference = T&; + using const_reference = const T&; + using pointer = T*; + using const_pointer = const T*; + + size_t capacity() const { return capacity_ptr() - begin(); } + +protected: + iterator capacity_ptr() { return static_cast(m_capacity_ptr); } + const_iterator capacity_ptr() const { + return static_cast(m_capacity_ptr); + } + +public: + // forwarding iterator creation + iterator begin() { return static_cast(m_begin_ptr); } + const_iterator begin() const { + return static_cast(m_begin_ptr); + } + const_iterator cbegin() const { + return static_cast(m_begin_ptr); + } + + iterator end() { return static_cast(m_end_ptr); } + const_iterator end() const { + return static_cast(m_end_ptr); + } + const_iterator cend() const { + return static_cast(m_end_ptr); + } + + reference at(size_type idx) { + if (idx >= size()) { + on_invalid_at(idx, size()); + } + return begin()[idx]; + } + const_reference at(size_type idx) const { + if (idx >= size()) { + on_invalid_at(idx, size()); + } + return begin()[idx]; + } + + reference operator[](size_type idx) { return begin()[idx]; } + const_reference operator[](size_type idx) const { return begin()[idx]; } + + reference front() { return begin()[0]; } + const_reference front() const { return begin()[0]; } + + reference back() { return rbegin()[0]; } + const_reference back() const { return rbegin()[0]; } + + // reverse iterator creation method. 
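+    // Illustrative usage sketch (editor's example, assuming the SmallVector
+    // alias defined later in this header): reverse iteration walks the
+    // elements back to front.
+    //   megdnn::SmallVector<int, 4> v{1, 2, 3};
+    //   for (auto it = v.rbegin(); it != v.rend(); ++it) { /* yields 3, 2, 1 */ }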
+ reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + pointer data() { return pointer(begin()); } + const_pointer data() const { return const_pointer(begin()); } + + size_type size() const { return end() - begin(); } + size_type max_size() const { + return std::numeric_limits::max() / sizeof(T); + } + + template + in_iter find(in_iter first, in_iter last, const T& value) const { + while (first != last) { + if (*first == value) + return first; + ++first; + } + return last; + } +}; +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t size) : SmallVectorTemplateCommon(size) {} + + static void destroy_range(T* start, T* end) { + while (start != end) { + --end; + end->~T(); + } + } + + template + static void uninitialized_move(It1 first, It1 last, It2 dest) { + std::uninitialized_copy(std::make_move_iterator(first), + std::make_move_iterator(last), dest); + } + + template + static void uninitialized_copy(It1 first, It1 last, It2 dest) { + std::uninitialized_copy(first, last, dest); + } + + void grow(size_t min_sz = 0); + +public: + void push_back(const T& _elm) { + if (megdnn_unlikely(this->m_end_ptr >= this->m_capacity_ptr)) { + T elm = _elm; + this->grow(); + new (static_cast(this->end())) T(std::move(elm)); + } else { + new (static_cast(this->end())) T(_elm); + } + this->set_end(this->end() + 1); + } + + void push_back(T&& elm) { + if (megdnn_unlikely(this->m_end_ptr >= this->m_capacity_ptr)) { + this->grow(); + } + new (static_cast(this->end())) T(std::move(elm)); + this->set_end(this->end() + 1); + } + + void pop_back() { + this->set_end(this->end() - 1); + this->end()->~T(); + } +}; +template +void SmallVectorTemplateBase::grow(size_t min_sz) { + size_t cur_capacity = this->capacity(); + size_t cur_sz = this->size(); + size_t new_capacity = (cur_capacity + 2) * 2; + if (new_capacity < min_sz) { + new_capacity = min_sz; + } + T* elms = static_cast(malloc(new_capacity * sizeof(T))); + + this->uninitialized_move(this->begin(), this->end(), elms); + + this->destroy_range(this->begin(), this->end()); + + if (!this->is_small()) { + free(this->begin()); + } + + this->m_begin_ptr = elms; + this->set_end(elms + cur_sz); + this->m_capacity_ptr = this->begin() + new_capacity; +} + +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t size) : SmallVectorTemplateCommon(size) {} + + static void destroy_range(T*, T*) {} + + template + static void uninitialized_move(It1 first, It1 last, It2 dest) { + uninitialized_copy(first, last, dest); + } + + template + static void uninitialized_copy(It1 first, It1 last, It2 dest) { + std::uninitialized_copy(first, last, dest); + } + + template + static void uninitialized_copy( + T1* first, T1* last, T2* dest, + typename std::enable_if::type, T2>::value>::type* = + nullptr) { + if (first != last) + memcpy(dest, first, (last - first) * sizeof(T)); + } + + void grow(size_t min_sz = 0) { + this->grow_pod(min_sz * sizeof(T), sizeof(T)); + } + +public: + void push_back(const T& _elm) { + if (megdnn_unlikely(this->m_end_ptr >= this->m_capacity_ptr)) { + T elm = _elm; + this->grow(); + memcpy(this->end(), &elm, sizeof(T)); + } else { + memcpy(this->end(), &_elm, sizeof(T)); + } + 
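+        // The element bytes have been memcpy'd into place; advancing the end
+        // pointer makes the new element visible.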
this->set_end(this->end() + 1); + } + + void pop_back() { this->set_end(this->end() - 1); } +}; + +/*! + * \brief the implementation class of SmallVector + * + * SmallVector can be converted to SmallVectorImpl to erase N + */ +template +class SmallVectorImpl + : public SmallVectorTemplateBase::value> { + using SuperClass = SmallVectorTemplateBase::value>; + +public: + using iterator = typename SuperClass::iterator; + using const_iterator = typename SuperClass::const_iterator; + using size_type = typename SuperClass::size_type; + +protected: + explicit SmallVectorImpl(unsigned n) + : SmallVectorTemplateBase::value>(n * sizeof(T)) { + } + +public: + SmallVectorImpl(const SmallVectorImpl&) = delete; + + ~SmallVectorImpl() { + this->destroy_range(this->begin(), this->end()); + + if (!this->is_small()) + free(this->begin()); + } + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->m_end_ptr = this->m_begin_ptr; + } + + void resize(size_type n) { + if (n < this->size()) { + this->destroy_range(this->begin() + n, this->end()); + this->set_end(this->begin() + n); + } else if (n > this->size()) { + if (this->capacity() < n) + this->grow(n); + for (auto it = this->end(), end = this->begin() + n; it != end; + ++it) + new (&*it) T(); + this->set_end(this->begin() + n); + } + } + + void resize(size_type n, const T& _nv) { + T nv = _nv; + if (n < this->size()) { + this->destroy_range(this->begin() + n, this->end()); + this->set_end(this->begin() + n); + } else if (n > this->size()) { + if (this->capacity() < n) + this->grow(n); + std::uninitialized_fill(this->end(), this->begin() + n, nv); + this->set_end(this->begin() + n); + } + } + + void reserve(size_type n) { + if (this->capacity() < n) { + this->grow(n); + } + } + + T pop_back_val() { + T result = std::move(this->back()); + this->pop_back(); + return result; + } + + void swap(SmallVectorImpl& rhs); + + /// Add the specified range to the end of the SmallVector. + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void append(in_iter in_start, in_iter in_end) { + size_type num_inputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (num_inputs > size_type(this->capacity_ptr() - this->end())) + this->grow(this->size() + num_inputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->set_end(this->end() + num_inputs); + } + + /// Add the specified range to the end of the SmallVector. + void append(size_type num_inputs, const T& _elm) { + T elm = _elm; + // Grow allocated space if needed. + if (num_inputs > size_type(this->capacity_ptr() - this->end())) + this->grow(this->size() + num_inputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), num_inputs, elm); + this->set_end(this->end() + num_inputs); + } + + void append(std::initializer_list init_list) { + append(init_list.begin(), init_list.end()); + } + + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) variants. 
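+
+    // Illustrative usage sketch (editor's example): assign() drops the current
+    // contents and refills the vector.
+    //   megdnn::SmallVector<int, 4> v{1, 2, 3};
+    //   v.assign(2, 9);             // v is now {9, 9}
+    //   v.assign({4, 5, 6, 7, 8});  // five elements overflow the four inline
+    //                               // slots, so storage moves to the heap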
+ + void assign(size_type num_elms, const T& _elm) { + T elm = _elm; + clear(); + if (this->capacity() < num_elms) + this->grow(num_elms); + this->set_end(this->begin() + num_elms); + std::uninitialized_fill(this->begin(), this->end(), elm); + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void assign(in_iter in_start, in_iter in_end) { + clear(); + append(in_start, in_end); + } + + void assign(std::initializer_list init_list) { + clear(); + append(init_list); + } + + iterator erase(const_iterator cit) { + // Just cast away constness because this is a non-const member function. + iterator it = const_cast(cit); + iterator n = it; + // Shift all elms down one. + std::move(it + 1, this->end(), it); + // Drop the last elm. + this->pop_back(); + return (n); + } + + iterator erase(const_iterator c_first, const_iterator c_last) { + // Just cast away constness because this is a non-const member function. + iterator first = const_cast(c_first); + iterator last = const_cast(c_last); + iterator n = first; + // Shift all elms down. + iterator it = std::move(last, this->end(), first); + // Drop the last elms. + this->destroy_range(it, this->end()); + this->set_end(it); + return (n); + } + + iterator insert(iterator it, T&& elm) { + if (it == this->end()) { // Important special case for empty vector. + this->push_back(std::move(elm)); + return this->end() - 1; + } + + if (this->m_end_ptr >= this->m_capacity_ptr) { + size_t elm_idx = it - this->begin(); + this->grow(); + it = this->begin() + elm_idx; + } + + new (static_cast(this->end())) T(std::move(this->back())); + // Push everything else over. + std::move_backward(it, this->end() - 1, this->end()); + this->set_end(this->end() + 1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T* elm_ptr = &elm; + if (it <= elm_ptr && elm_ptr < this->m_end_ptr) + ++elm_ptr; + + *it = std::move(*elm_ptr); + return it; + } + + iterator insert(iterator it, const T& _elm) { + if (it == this->end()) { // Important special case for empty vector. + this->push_back(_elm); + return this->end() - 1; + } + T elm = _elm; + if (this->m_end_ptr >= this->m_capacity_ptr) { + size_t elm_idx = it - this->begin(); + this->grow(); + it = this->begin() + elm_idx; + } + new (static_cast(this->end())) T(std::move(this->back())); + // Push everything else over. + std::move_backward(it, this->end() - 1, this->end()); + this->set_end(this->end() + 1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + const T* elm_ptr = &elm; + if (it <= elm_ptr && elm_ptr < this->m_end_ptr) + ++elm_ptr; + + *it = *elm_ptr; + return it; + } + + iterator insert(iterator it, size_type num_to_insert, const T& _elm) { + // Convert iterator to elm# to avoid invalidating iterator + // when we reserve() + size_t elm_idx = it - this->begin(); + + if (it == this->end()) { // Important special case for empty vector. + append(num_to_insert, _elm); + return this->begin() + elm_idx; + } + + T elm = _elm; + + // Ensure there is enough space. + reserve(this->size() + num_to_insert); + + // Uninvalidate the iterator. + it = this->begin() + elm_idx; + + // If there are more elements between the insertion point and + // the end of the range than there are being inserted, + // we can use a simple approach to insertion. + // Since we already reserved space, we know that this won't + // reallocate the vector. 
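+        // Fast path: at least num_to_insert elements lie between the insertion
+        // point and end(), so the slots past the old end() are filled by moving
+        // existing elements, and the inserted values are then written onto
+        // already-constructed storage with plain assignment.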
+ if (size_t(this->end() - it) >= num_to_insert) { + T* old_end = this->end(); + append(std::move_iterator(this->end() - num_to_insert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(it, old_end - num_to_insert, old_end); + + std::fill_n(it, num_to_insert, elm); + return it; + } + + // Otherwise, we're inserting more elements than exist already, + // and we're not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* old_end = this->end(); + this->set_end(this->end() + num_to_insert); + size_t num_overwritten = old_end - it; + this->uninitialized_move(it, old_end, this->end() - num_overwritten); + + // Replace the overwritten part. + std::fill_n(it, num_overwritten, elm); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(old_end, num_to_insert - num_overwritten, + elm); + return it; + } + + template < + typename IterType, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + iterator insert(iterator it, IterType from, IterType to) { + // Convert iterator to elm# to avoid invalidating iterator + // when we reserve() + size_t elm_idx = it - this->begin(); + + if (it == this->end()) { // Important special case for empty vector. + append(from, to); + return this->begin() + elm_idx; + } + + size_t num_to_insert = std::distance(from, to); + + // Ensure there is enough space. + reserve(this->size() + num_to_insert); + + // Uninvalidate the iterator. + it = this->begin() + elm_idx; + + // If there are more elements between the insertion point and + // the end of the range than there are being inserted, + // we can use a simple approach to insertion. + // Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end() - it) >= num_to_insert) { + T* old_end = this->end(); + append(std::move_iterator(this->end() - num_to_insert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(it, old_end - num_to_insert, old_end); + + std::copy(from, to, it); + return it; + } + + // Otherwise, we're inserting more elements than exist already, + // and we're not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* old_end = this->end(); + this->set_end(this->end() + num_to_insert); + size_t num_overwritten = old_end - it; + this->uninitialized_move(it, old_end, this->end() - num_overwritten); + + // Replace the overwritten part. + for (T* iter = it; num_overwritten > 0; --num_overwritten) { + *iter = *from; + ++iter; + ++from; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(from, to, old_end); + return it; + } + + void insert(iterator it, std::initializer_list init_list) { + insert(it, init_list.begin(), init_list.end()); + } + + template + void emplace_back(ArgTypes&&... 
args) { + if (megdnn_unlikely(this->m_end_ptr >= this->m_capacity_ptr)) { + this->grow(); + } + new (static_cast(this->end())) + T(std::forward(args)...); + this->set_end(this->end() + 1); + } + + SmallVectorImpl& operator=(const SmallVectorImpl& rhs); + + SmallVectorImpl& operator=(SmallVectorImpl&& rhs); + + bool operator==(const SmallVectorImpl& rhs) const { + if (this->size() != rhs.size()) + return false; + return std::equal(this->begin(), this->end(), rhs.begin()); + } + + bool operator!=(const SmallVectorImpl& rhs) const { + return !(*this == rhs); + } + + bool operator<(const SmallVectorImpl& rhs) const { + return std::lexicographical_compare(this->begin(), this->end(), + rhs.begin(), rhs.end()); + } +}; + +template +void SmallVectorImpl::swap(SmallVectorImpl& rhs) { + if (this == &rhs) + return; + + // We can only avoid copying elements if neither vector is small. + if (!this->is_small() && !rhs.is_small()) { + std::swap(this->m_begin_ptr, rhs.m_begin_ptr); + std::swap(this->m_end_ptr, rhs.m_end_ptr); + std::swap(this->m_capacity_ptr, rhs.m_capacity_ptr); + return; + } + if (rhs.size() > this->capacity()) + this->grow(rhs.size()); + if (this->size() > rhs.capacity()) + rhs.grow(this->size()); + + // Swap the shared elements. + size_t num_shared = this->size(); + if (num_shared > rhs.size()) + num_shared = rhs.size(); + for (size_type i = 0; i != num_shared; ++i) + std::swap((*this)[i], rhs[i]); + + // Copy over the extra elms. + if (this->size() > rhs.size()) { + size_t elm_diff = this->size() - rhs.size(); + this->uninitialized_move(this->begin() + num_shared, this->end(), + rhs.end()); + rhs.set_end(rhs.end() + elm_diff); + this->destroy_range(this->begin() + num_shared, this->end()); + this->set_end(this->begin() + num_shared); + } else if (rhs.size() > this->size()) { + size_t elm_diff = rhs.size() - this->size(); + this->uninitialized_move(rhs.begin() + num_shared, rhs.end(), + this->end()); + this->set_end(this->end() + elm_diff); + this->destroy_range(rhs.begin() + num_shared, rhs.end()); + rhs.set_end(rhs.begin() + num_shared); + } +} + +template +SmallVectorImpl& SmallVectorImpl::operator=( + const SmallVectorImpl& rhs) { + if (this == &rhs) + return *this; + size_t rhs_sz = rhs.size(); + size_t cur_sz = this->size(); + if (cur_sz >= rhs_sz) { + iterator new_end; + if (rhs_sz) { + new_end = std::copy(rhs.begin(), rhs.end(), this->begin()); + } else { + new_end = this->begin(); + } + this->destroy_range(new_end, this->end()); + this->set_end(new_end); + return *this; + } + if (this->capacity() < rhs_sz) { + // save time for no copy when growing + this->destroy_range(this->begin(), this->end()); + this->set_end(this->begin()); + cur_sz = 0; + this->grow(rhs_sz); + } else if (cur_sz) { + std::copy(rhs.begin(), rhs.begin() + cur_sz, this->begin()); + } + std::uninitialized_copy(rhs.begin() + cur_sz, rhs.end(), + this->begin() + cur_sz); + this->set_end(this->begin() + rhs_sz); + return *this; +} + +template +SmallVectorImpl& SmallVectorImpl::operator=(SmallVectorImpl&& rhs) { + // avoid self assignment + if (this == &rhs) + return *this; + + // copy ptr when rhs is small + if (!rhs.is_small()) { + this->destroy_range(this->begin(), this->end()); + if (!this->is_small()) + free(this->begin()); + this->m_begin_ptr = rhs.m_begin_ptr; + this->m_end_ptr = rhs.m_end_ptr; + this->m_capacity_ptr = rhs.m_capacity_ptr; + rhs.reset_to_small(); + return *this; + } + + size_t rhs_sz = rhs.size(); + size_t cur_sz = this->size(); + if (cur_sz >= rhs_sz) { + iterator new_end = this->begin(); 
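+        // Here rhs uses its inline buffer, so its elements are moved one by
+        // one into our existing storage and any surplus elements of ours are
+        // destroyed below.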
+ if (rhs_sz) { + new_end = std::move(rhs.begin(), rhs.end(), new_end); + } + this->destroy_range(new_end, this->end()); + this->set_end(new_end); + rhs.clear(); + return *this; + } + if (this->capacity() < rhs_sz) { + this->destroy_range(this->begin(), this->end()); + this->set_end(this->begin()); + cur_sz = 0; + this->grow(rhs_sz); + } else if (cur_sz) { + std::move(rhs.begin(), rhs.begin() + cur_sz, this->begin()); + } + + this->uninitialized_move(rhs.begin() + cur_sz, rhs.end(), + this->begin() + cur_sz); + + this->set_end(this->begin() + rhs_sz); + + rhs.clear(); + return *this; +} +template +struct SmallVectorStorage { + typename SmallVectorTemplateCommon::U inline_elms[N - 1]; +}; +template +struct SmallVectorStorage {}; +template +struct SmallVectorStorage {}; + +/*! + * \brief This is a 'vector' (really, a variable-sized array), optimized for the + * case when the array is small. + * + * It contains some number of elements in-place, + * which allows it to avoid heap allocation when the actual number of elements + * is below that threshold. This allows normal "small" cases to be fast without + * losing generality for large inputs. + * + * Note that this does not attempt to be exception safe. + * + * SmallVector& can be converted to SmallVectorImpl& to erase the + * template param \p N; this is useful for function params. + * + * \tparam T emelment type + * \tparam N number of elements to be stored in the class object + */ +template +class SmallVector : public SmallVectorImpl { + SmallVectorStorage m_storage; + +public: + SmallVector() : SmallVectorImpl(N) {} + + explicit SmallVector(size_t size, const T& value = T()) + : SmallVectorImpl(N) { + this->assign(size, value); + } + + template < + typename IterType, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + SmallVector(IterType first, IterType last) : SmallVectorImpl(N) { + this->append(first, last); + } + + SmallVector(std::initializer_list init_list) : SmallVectorImpl(N) { + this->assign(init_list); + } + + SmallVector(const SmallVector& rhs) : SmallVectorImpl(N) { + if (!rhs.empty()) + SmallVectorImpl::operator=(rhs); + } + + ~SmallVector() {} + + const SmallVector& operator=(const SmallVector& rhs) { + SmallVectorImpl::operator=(rhs); + return *this; + } + + SmallVector(SmallVector&& rhs) : SmallVectorImpl(N) { + if (!rhs.empty()) + SmallVectorImpl::operator=(std::move(rhs)); + } + + SmallVector(SmallVectorImpl&& rhs) : SmallVectorImpl(N) { + if (!rhs.empty()) + SmallVectorImpl::operator=(std::move(rhs)); + } + + const SmallVector& operator=(SmallVector&& rhs) { + SmallVectorImpl::operator=(std::move(rhs)); + return *this; + } + + const SmallVector& operator=(SmallVectorImpl&& rhs) { + SmallVectorImpl::operator=(std::move(rhs)); + return *this; + } + + const SmallVector& operator=(std::initializer_list init_list) { + this->assign(init_list); + return *this; + } +}; + +template +static inline size_t capacity_in_bytes(const SmallVector& vec) { + return vec.capacity_in_bytes(); +} + +template +inline typename SmallVectorImpl::const_iterator find( + const SmallVectorImpl& vec, const T& value) { + return vec.find(vec.begin(), vec.end(), value); +} + +} // end namespace megdnn + +#include "megdnn/internal/visibility_epilogue.h" + +namespace std { + +/// Implement std::swap in terms of SmallVector swap. +template +inline void swap(megdnn::SmallVectorImpl& lhs, + megdnn::SmallVectorImpl& rhs) { + lhs.swap(rhs); +} + +/// Implement std::swap in terms of SmallVector swap. 
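+/// A minimal usage sketch (editor's example):
+///   megdnn::SmallVector<int, 2> a{1, 2}, b{3, 4, 5};
+///   std::swap(a, b);  // dispatches to SmallVectorImpl::swap()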
+template +inline void swap(megdnn::SmallVector& lhs, + megdnn::SmallVector& rhs) { + lhs.swap(rhs); +} +} // end namespace std + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/include/megdnn/version.h b/dnn/include/megdnn/version.h new file mode 100644 index 00000000..bd800993 --- /dev/null +++ b/dnn/include/megdnn/version.h @@ -0,0 +1,30 @@ +/** + * \file dnn/include/megdnn/version.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#define MEGDNN_MAJOR 9 +#define MEGDNN_MINOR 3 +#define MEGDNN_PATCH 0 + +#include "megdnn/internal/visibility_prologue.h" + +namespace megdnn { + struct Version { + int major, minor, patch; + }; + + //! get megdnn version of the binary + Version get_version(); +} + +#include "megdnn/internal/visibility_epilogue.h" + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/scripts/Makefile b/dnn/scripts/Makefile new file mode 100644 index 00000000..f21cd594 --- /dev/null +++ b/dnn/scripts/Makefile @@ -0,0 +1,45 @@ +PARAM_DEFS := ../include/megdnn/opr_param_defs.h \ + ../include/megdnn/opr_param_json.h \ + ../src/common/opr_param_defs_enumv.cuh \ + ../src/common/elemwise/each_mode.inl + +ELEMWISE_IMPL := ../src/cuda/cond_take/kimpl \ + ../src/cuda/elemwise/special_kimpl \ + ../src/cuda/elemwise/kimpl \ + ../src/naive/elemwise/kimpl \ + ../src/cuda/elemwise_multi_type/kimpl + +CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl + +all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} + +../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py + ./$^ $@ + +../src/cuda/cond_take/kimpl: gen_cond_take_kern_impls.py + ./$^ --type cuda $@ + +../src/cuda/elemwise/special_kimpl: gen_elemwise_special_kern_impls.py + ./$^ --type cuda $@ + + +../src/cuda/elemwise/kimpl: gen_elemwise_kern_impls.py + ./$^ --type cuda $@ + + +../src/%/elemwise/kimpl: gen_elemwise_kern_impls.py + ./$^ $@ + +../src/cuda/elemwise_multi_type/kimpl: gen_elemwise_multi_type_kern_impls.py + ./$^ --type cuda $@ + +../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py + ./$^ --type dp4a $@ + +../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py + ./$^ --type imma $@ + +../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py + ./$^ --type dp4a $@ + +.PHONY: all diff --git a/dnn/scripts/gen_cond_take_kern_impls.py b/dnn/scripts/gen_cond_take_kern_impls.py new file mode 100755 index 00000000..e06add1b --- /dev/null +++ b/dnn/scripts/gen_cond_take_kern_impls.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import argparse +from gen_elemwise_utils import DTYPES + +def main(): + parser = argparse.ArgumentParser( + description='generate elemwise impl files', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--type', type=str, choices=['cuda'], + default='cuda', + help='generate cuda cond take kernel file') + parser.add_argument('output', help='output directory') + args = parser.parse_args() + + if not os.path.isdir(args.output): + os.makedirs(args.output) + + assert args.type =='cuda' + cpp_ext = 'cu' + + for 
dtype in DTYPES.keys(): + fname = '{}.{}'.format(dtype, cpp_ext) + fname = os.path.join(args.output, fname) + with open(fname, 'w') as fout: + w = lambda s: print(s, file=fout) + + w('// generated by gen_cond_take_kern_impls.py') + w('#include "../kern.inl"') + w('') + if dtype == 'dt_float16': + w('#if !MEGDNN_DISABLE_FLOAT16') + w('namespace megdnn {') + w('namespace cuda {') + w('namespace cond_take {') + w('') + + w('inst_genidx(::megdnn::dtype::{})'.format(DTYPES[dtype][0])) + w('#undef inst_genidx') + w('') + w('inst_copy(::megdnn::dtype::{})'.format(DTYPES[dtype][0])) + w('#undef inst_copy') + w('#undef inst_copy_') + + w('') + w('} // cond_take') + w('} // cuda') + w('} // megdnn') + if dtype == 'dt_float16': + w('#endif') + + print('generated {}'.format(fname)) + + os.utime(args.output) + +if __name__ == '__main__': + main() diff --git a/dnn/scripts/gen_cuda_batch_conv_bias_kern_impls.py b/dnn/scripts/gen_cuda_batch_conv_bias_kern_impls.py new file mode 100755 index 00000000..2d71b02e --- /dev/null +++ b/dnn/scripts/gen_cuda_batch_conv_bias_kern_impls.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import argparse +import itertools + +PREFIXES = {"dp4a": [("batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4", True), ("batch_conv_bias_int8_gemm_ncdiv4hw4", False), ("batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128", False)]} + +ACTIVATIONS = {1: ("IDENTITY", "_id"), + 2: ("RELU", "_relu"), + 3: ("H_SWISH", "_hswish")} + +BIASES = {1: ("PerElementBiasVisitor", "_per_elem"), + 2: ("PerChannelBiasVisitor", "_per_chan")} + +SUFFIXES = {"dp4a": [""], + "imma": [""]} + +def main(): + parser = argparse.ArgumentParser( + description='generate cuda batch conv bias (dp4a/imma) kern impl files', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--type', type=str, choices=['dp4a', + 'imma'], + default='dp4a', help='generate cuda conv bias kernel file') + parser.add_argument('output', help='output directory') + args = parser.parse_args() + + if not os.path.isdir(args.output): + os.makedirs(args.output) + + + inst = ''' +template void megdnn::cuda::batch_conv_bias::do_PREFIXSUFFIX>>( + const int8_t* d_src, + const int8_t* d_filter, WORKSPACE + BIAS bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream);''' + + for prefix in PREFIXES[args.type]: + for suffix in SUFFIXES[args.type]: + for _, act in ACTIVATIONS.items(): + has_workspace = prefix[1] + bias = BIASES[2] + fname = "{}{}{}{}.cu".format(prefix[0], suffix, bias[1], act[1]) + fname = os.path.join(args.output, fname) + with open(fname, "w") as fout: + w = lambda s: print(s, file=fout) + w('// generated by gen_batch_cuda_conv_bias_kern_impls.py') + cur_inst = inst.replace("PREFIX", prefix[0]).replace("SUFFIX", suffix).replace("BIAS", bias[0]).replace("ACTIVATION", act[0]) + if has_workspace: + cur_inst = cur_inst.replace("WORKSPACE", "\nint* d_workspace, ") + else: + cur_inst = cur_inst.replace("WORKSPACE", "") + w('#include "../{}{}.cuinl"'.format(prefix[0], suffix)) + w(cur_inst) + + print('generated {}'.format(fname)) + os.utime(args.output) + +if __name__ == '__main__': + main() diff --git a/dnn/scripts/gen_cuda_conv_bias_kern_impls.py b/dnn/scripts/gen_cuda_conv_bias_kern_impls.py new file mode 100755 index 00000000..b2065f81 --- /dev/null +++ b/dnn/scripts/gen_cuda_conv_bias_kern_impls.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import argparse +import itertools + +PREFIXES = 
{"dp4a": "conv_bias_int8_implicit_gemm_cdiv4hwn4", "imma": "conv_bias_int8_implicit_gemm"} + +ACTIVATIONS = {1: ("IDENTITY", "_id"), + 2: ("RELU", "_relu"), + 3: ("H_SWISH", "_hswish")} + +BIASES = {1: ("PerElementBiasVisitor", "_per_elem"), + 2: ("PerChannelBiasVisitor", "_per_chan")} + +SUFFIXES = {"dp4a": ["", "_ld_64bit", "_ld_64bit_unroll_width", "_unroll_width"], + "imma": ["_imma16x16x16_cdiv4hwn4", "_imma8x32x16_cdiv4hwn4", "_imma32x8x16_cdiv4hwn4", + "_imma16x16x16_cdiv4hwn4_reorder_filter", "_imma8x32x16_cdiv4hwn4_reorder_filter", "_imma32x8x16_cdiv4hwn4_reorder_filter", + "_imma16x16x16_cdiv4hwn4_unroll_width", "_imma8x32x16_cdiv4hwn4_unroll_width", "_imma32x8x16_cdiv4hwn4_unroll_width"]} + +def main(): + parser = argparse.ArgumentParser( + description='generate cuda conv bias (dp4a/imma) kern impl files', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--type', type=str, choices=['dp4a', + 'imma'], + default='dp4a', help='generate cuda conv bias kernel file') + parser.add_argument('output', help='output directory') + args = parser.parse_args() + + if not os.path.isdir(args.output): + os.makedirs(args.output) + + + inst = ''' +template void megdnn::cuda::conv_bias_int8::do_PREFIXSUFFIX>>( + const int8_t* d_src, + const int8_t* d_filter, + BIAS bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream);''' + + for suffix in SUFFIXES[args.type]: + for _, act in ACTIVATIONS.items(): + prefix = PREFIXES[args.type] + bias = BIASES[2] + fname = "{}{}{}{}.cu".format(prefix, suffix, bias[1], act[1]) + fname = os.path.join(args.output, fname) + with open(fname, "w") as fout: + w = lambda s: print(s, file=fout) + w('// generated by gen_cuda_conv_bias_kern_impls.py') + cur_inst = inst.replace("PREFIX", prefix).replace("SUFFIX", suffix).replace("BIAS", bias[0]).replace("ACTIVATION", act[0]) + w('#include "../{}{}.cuinl"'.format(prefix, suffix)) + w(cur_inst) + + print('generated {}'.format(fname)) + os.utime(args.output) + +if __name__ == '__main__': + main() diff --git a/dnn/scripts/gen_elemwise_each_mode.py b/dnn/scripts/gen_elemwise_each_mode.py new file mode 100755 index 00000000..d7fc1beb --- /dev/null +++ b/dnn/scripts/gen_elemwise_each_mode.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import argparse + +from gen_elemwise_utils import ARITIES, MODES + +def main(): + parser = argparse.ArgumentParser( + description='generate elemwise each mode', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument('output', help='output directory') + args = parser.parse_args() + + + with open(args.output, 'w') as fout: + w = lambda s: print(s, file=fout) + w('// generated by gen_elemwise_each_mode.py') + keys = list(MODES.keys()) + keys.sort() + for (anum, ctype) in keys: + w('#define MEGDNN_FOREACH_ELEMWISE_MODE_{}_{}(cb) \\'.format( + ARITIES[anum], ctype)) + for mode in MODES[(anum, ctype)]: + w(' MEGDNN_ELEMWISE_MODE_ENABLE({}, cb) \\'.format(mode)) + w('') + + print('generated each_mode.inl') + os.utime(args.output) + +if __name__ == '__main__': + main() diff --git a/dnn/scripts/gen_elemwise_kern_impls.py b/dnn/scripts/gen_elemwise_kern_impls.py new file mode 100755 index 00000000..30972567 --- /dev/null +++ b/dnn/scripts/gen_elemwise_kern_impls.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import argparse +import itertools +from gen_elemwise_utils import ARITIES, DTYPES, MODES + +def main(): + parser = 
argparse.ArgumentParser( + description='generate elemwise impl files', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--type', type=str, choices=['cuda', + 'cpp'], + default='cpp', help='generate cuda/hip kernel file') + parser.add_argument('output', help='output directory') + args = parser.parse_args() + + if not os.path.isdir(args.output): + os.makedirs(args.output) + + if args.type == 'cuda': + cpp_ext = 'cu' + else: + assert args.type == 'cpp' + cpp_ext = 'cpp' + + for anum, ctype in itertools.product(ARITIES.keys(), DTYPES.keys()): + for mode in MODES[(anum, DTYPES[ctype][1])]: + formode = 'MEGDNN_ELEMWISE_MODE_ENABLE({}, cb)'.format(mode) + fname = '{}_{}.{}'.format(mode, ctype, cpp_ext) + fname = os.path.join(args.output, fname) + with open(fname, 'w') as fout: + w = lambda s: print(s, file=fout) + w('// generated by gen_elemwise_kern_impls.py') + + if ctype == 'dt_float16': + w('#if !MEGDNN_DISABLE_FLOAT16') + + w('#define KERN_IMPL_MODE(cb) {}'.format(formode)) + w('#define KERN_IMPL_ARITY {}'.format(anum)) + w('#define KERN_IMPL_CTYPE {}'.format(ctype)) + w('#include "../kern_impl.inl"') + + if ctype == 'dt_float16': + w('#endif') + + print('generated {}'.format(fname)) + + os.utime(args.output) + +if __name__ == '__main__': + main() diff --git a/dnn/scripts/gen_elemwise_multi_type_kern_impls.py b/dnn/scripts/gen_elemwise_multi_type_kern_impls.py new file mode 100755 index 00000000..0aca3cfd --- /dev/null +++ b/dnn/scripts/gen_elemwise_multi_type_kern_impls.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import argparse +import itertools +from gen_elemwise_multi_type_utils import SUPPORT_DTYPES, MODES, SUPPORT_QINT32_DTYPES, QINT32_MODES + +def generate(modes, support_dtypes, output, cpp_ext): + for anum, ctype in itertools.product(modes.keys(), support_dtypes): + print('{} : {}'.format(anum, ctype)) + src_ctype = ctype[0] + dst_ctype = ctype[1] + for mode in modes[anum]: + formode = 'MEGDNN_ELEMWISE_MODE_ENABLE({}, cb)'.format(mode) + fname = '{}_{}_{}.{}'.format(mode, src_ctype, dst_ctype, cpp_ext) + fname = os.path.join(output, fname) + with open(fname, 'w') as fout: + w = lambda s: print(s, file=fout) + w('// generated by gen_elemwise_multi_type_kern_impls.py') + + w('#define KERN_IMPL_MODE(cb) {}'.format(formode)) + w('#define KERN_IMPL_ARITY {}'.format(anum)) + w('#define KERN_IMPL_STYPE {}'.format(src_ctype)) + w('#define KERN_IMPL_DTYPE {}'.format(dst_ctype)) + w('#include "../kern_impl.inl"') + + print('generated {}'.format(fname)) + + +def main(): + parser = argparse.ArgumentParser( + description='generate elemwise impl files', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--type', type=str, choices=['cuda'], + default='cuda', help='generate cuda kernel file') + parser.add_argument('output', help='output directory') + args = parser.parse_args() + + if not os.path.isdir(args.output): + os.makedirs(args.output) + + assert args.type == 'cuda' + if args.type == 'cuda': + cpp_ext = 'cu' + + generate(MODES, SUPPORT_DTYPES, args.output, cpp_ext) + generate(QINT32_MODES, SUPPORT_QINT32_DTYPES, args.output, cpp_ext) + os.utime(args.output) + +if __name__ == '__main__': + main() diff --git a/dnn/scripts/gen_elemwise_multi_type_utils.py b/dnn/scripts/gen_elemwise_multi_type_utils.py new file mode 100755 index 00000000..7279c61f --- /dev/null +++ b/dnn/scripts/gen_elemwise_multi_type_utils.py @@ -0,0 +1,23 @@ +# As cuda currently do not support quint8, so we just ignore it. 
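+# The tables below drive gen_elemwise_multi_type_kern_impls.py: every arity ->
+# mode-list entry is crossed with the (src, dst) dtype pairs, and one .cu file
+# is emitted per combination. Illustrative example (editor's sketch):
+#   mode 'FUSE_ADD_RELU' (arity 2) with ('dt_qint32', 'dt_qint8') becomes
+#   FUSE_ADD_RELU_dt_qint32_dt_qint8.cu, which #defines KERN_IMPL_MODE,
+#   KERN_IMPL_ARITY, KERN_IMPL_STYPE and KERN_IMPL_DTYPE and then includes
+#   ../kern_impl.inl.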
+SUPPORT_DTYPES = [('dt_qint8', 'dt_qint8')] +SUPPORT_QINT32_DTYPES = [('dt_qint32', 'dt_qint8'), ('dt_qint8', 'dt_qint32')] + +MODES = { + 1: ['RELU', 'ABS', 'NEGATE', 'ACOS', 'ASIN', 'CEIL', 'COS', + 'EXP', 'EXPM1', 'FLOOR', 'LOG', 'LOG1P', 'SIGMOID', 'SIN', + 'TANH', 'FAST_TANH', 'ROUND', 'ERF', 'ERFINV', 'ERFC', + 'ERFCINV', 'H_SWISH'], + 2: ['ABS_GRAD', 'ADD', 'FLOOR_DIV', 'MAX', 'MIN', 'MOD', 'MUL', + 'SIGMOID_GRAD', 'SUB', 'SWITCH_GT0', 'TANH_GRAD', 'LT', + 'LEQ', 'EQ', 'FUSE_ADD_RELU', 'TRUE_DIV', 'POW', + 'LOG_SUM_EXP', 'FUSE_ADD_TANH', 'FAST_TANH_GRAD', + 'FUSE_ADD_SIGMOID', 'ATAN2', 'H_SWISH_GRAD', + 'FUSE_ADD_H_SWISH'], + 3: ['COND_LEQ_MOV', 'FUSE_MUL_ADD3'], +} + +QINT32_MODES = { + 1: ['RELU', 'SIGMOID', 'TANH', 'FAST_TANH', 'H_SWISH'], + 2: ['ADD', 'FUSE_ADD_RELU', 'FUSE_ADD_SIGMOID', + 'FUSE_ADD_TANH', 'FUSE_ADD_H_SWISH'] +} diff --git a/dnn/scripts/gen_elemwise_special_kern_impls.py b/dnn/scripts/gen_elemwise_special_kern_impls.py new file mode 100755 index 00000000..a9c868ae --- /dev/null +++ b/dnn/scripts/gen_elemwise_special_kern_impls.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import argparse +from gen_elemwise_utils import DTYPES + +def main(): + parser = argparse.ArgumentParser( + description='generate elemwise impl files', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--type', type=str, choices=[ + 'cuda', + ], + default='cuda', + help='generate cuda/hip elemwise special kernel file') + parser.add_argument('output', help='output directory') + args = parser.parse_args() + + if not os.path.isdir(args.output): + os.makedirs(args.output) + + if args.type == 'cuda': + cpp_ext = 'cu' + + for dtype in DTYPES.keys(): + fname = 'special_{}.{}'.format(dtype, cpp_ext) + fname = os.path.join(args.output, fname) + with open(fname, 'w') as fout: + w = lambda s: print(s, file=fout) + + w('// generated by gen_elemwise_special_kern_impls.py') + if dtype == 'dt_float16': + w('#if !MEGDNN_DISABLE_FLOAT16') + w('#include "../special_kerns.inl"') + w('INST(::megdnn::dtype::{})'.format(DTYPES[dtype][0])) + w('#undef INST') + w('}') + w('}') + if dtype == 'dt_float16': + w('#endif') + + print('generated {}'.format(fname)) + + os.utime(args.output) + +if __name__ == '__main__': + main() diff --git a/dnn/scripts/gen_elemwise_utils.py b/dnn/scripts/gen_elemwise_utils.py new file mode 100755 index 00000000..3a3b04cb --- /dev/null +++ b/dnn/scripts/gen_elemwise_utils.py @@ -0,0 +1,30 @@ + +ARITIES = {1: 'UNARY', 2: 'BINARY', 3: 'TERNARY'} + +DTYPES = {'dt_int32': ('Int32', 'INT'), + 'dt_uint8': ('Uint8', 'INT'), + 'dt_int8': ('Int8', 'INT'), + 'dt_int16': ('Int16', 'INT'), + 'dt_float32': ('Float32', 'FLOAT'), + 'dt_float16': ('Float16', 'FLOAT') + } + +MODES = { + (1, 'INT'): ['RELU', 'ABS', 'NEGATE'], + (2, 'INT'): ['ABS_GRAD', 'ADD', 'FLOOR_DIV', 'MAX', 'MIN', 'MOD', 'MUL', + 'SIGMOID_GRAD', 'SUB', 'SWITCH_GT0', 'TANH_GRAD', 'LT', 'LEQ', + 'EQ', 'FUSE_ADD_RELU', 'SHL', 'SHR', 'RMULH'], + (3, 'INT'): ['COND_LEQ_MOV'], + + (1, 'FLOAT'): ['RELU', 'ABS', 'NEGATE', 'ACOS', 'ASIN', 'CEIL', 'COS', + 'EXP', 'EXPM1', 'FLOOR', 'LOG', 'LOG1P', 'SIGMOID', 'SIN', + 'TANH', 'FAST_TANH', 'ROUND', 'ERF', 'ERFINV', 'ERFC', + 'ERFCINV', 'H_SWISH'], + (2, 'FLOAT'): ['ABS_GRAD', 'ADD', 'FLOOR_DIV', 'MAX', 'MIN', 'MOD', 'MUL', + 'SIGMOID_GRAD', 'SUB', 'SWITCH_GT0', 'TANH_GRAD', 'LT', + 'LEQ', 'EQ', 'FUSE_ADD_RELU', 'TRUE_DIV', 'POW', + 'LOG_SUM_EXP', 'FUSE_ADD_TANH', 'FAST_TANH_GRAD', + 'FUSE_ADD_SIGMOID', 'ATAN2', 'H_SWISH_GRAD', + 
'FUSE_ADD_H_SWISH'], + (3, 'FLOAT'): ['COND_LEQ_MOV', 'FUSE_MUL_ADD3'], +} diff --git a/dnn/scripts/gen_flatbuffers_converter.py b/dnn/scripts/gen_flatbuffers_converter.py new file mode 100755 index 00000000..45e806fe --- /dev/null +++ b/dnn/scripts/gen_flatbuffers_converter.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import collections +import textwrap +import os +import hashlib +import struct +import io + +from gen_param_defs import member_defs, ParamDef, IndentWriterBase + +class ConverterWriter(IndentWriterBase): + _skip_current_param = False + _last_param = None + _param_fields = None + _fb_fields = [] + + def __call__(self, fout, defs): + super().__call__(fout) + self._write("// %s", self._get_header()) + self._write('#include ') + self._write("namespace mgb {") + self._write("namespace serialization {") + self._write("namespace fbs {") + self._process(defs) + self._write("} // namespace fbs") + self._write("} // namespace serialization") + self._write("} // namespace mgb") + + def _on_param_begin(self, p): + self._last_param = p + self._param_fields = [] + self._fb_fields = ["builder"] + if p.is_legacy: + self._skip_current_param = True + return + self._write("template<>\nstruct ParamConverter {", + p.name, indent=1) + self._write("using MegDNNType = megdnn::param::%s;", p.name) + self._write("using FlatBufferType = fbs::param::%s;\n", p.name) + + def _on_param_end(self, p): + if self._skip_current_param: + self._skip_current_param = False + return + self._write("static MegDNNType to_param(const FlatBufferType* fb) {", + indent=1) + line = 'return {' + line += ', '.join(self._param_fields) + line += '};' + self._write(line) + self._write("}\n", indent=-1) + + self._write( + "static flatbuffers::Offset to_flatbuffer(flatbuffers::FlatBufferBuilder& builder, const MegDNNType& param) {", + indent=1) + line = 'return fbs::param::Create{}('.format(str(p.name)) + line += ', '.join(self._fb_fields) + line += ');' + self._write(line) + self._write('}', indent=-1) + + self._write("};\n", indent=-1) + + def _on_member_enum(self, e): + p = self._last_param + key = str(p.name) + str(e.name) + if self._skip_current_param: + return + self._param_fields.append( + "static_cast(fb->{}())".format( + str(p.name), str(e.name), e.name_field)) + self._fb_fields.append("static_cast(param.{})".format( + key, e.name_field)) + + def _on_member_field(self, f): + if self._skip_current_param: + return + if f.dtype.cname == 'DTypeEnum': + self._param_fields.append( + "intl::convert_dtype_to_megdnn(fb->{}())".format(f.name)) + self._fb_fields.append( + "intl::convert_dtype_to_fbs(param.{})".format(f.name)) + else: + self._param_fields.append("fb->{}()".format(f.name)) + self._fb_fields.append("param.{}".format(f.name)) + + def _on_const_field(self, f): + pass + + def _on_member_enum_alias(self, e): + if self._skip_current_param: + return + enum_name = e.src_class + e.src_name + self._param_fields.append( + "static_cast(fb->{}())".format( + e.src_class, e.src_name, e.name_field)) + self._fb_fields.append("static_cast(param.{})".format( + enum_name, e.name_field)) + + +def main(): + parser = argparse.ArgumentParser( + 'generate convert functions between FlatBuffers type and MegBrain type') + parser.add_argument('input') + parser.add_argument('output') + args = parser.parse_args() + + with open(args.input) as fin: + inputs = fin.read() + exec(inputs, {'pdef': ParamDef, 'Doc': member_defs.Doc}) + input_hash = hashlib.sha256() + 
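+    # The SHA-256 digest of the raw definition source is passed to the writer
+    # via set_input_hash() below, presumably so the generated converter header
+    # records which parameter definition file produced it.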
input_hash.update(inputs.encode(encoding='UTF-8')) + input_hash = input_hash.hexdigest() + + writer = ConverterWriter() + with open(args.output, 'w') as fout: + writer.set_input_hash(input_hash)(fout, ParamDef.all_param_defs) + +if __name__ == "__main__": + main() diff --git a/dnn/scripts/gen_flatbuffers_schema.py b/dnn/scripts/gen_flatbuffers_schema.py new file mode 100755 index 00000000..f66040f4 --- /dev/null +++ b/dnn/scripts/gen_flatbuffers_schema.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import collections +import textwrap +import os +import hashlib +import struct +import io + +from gen_param_defs import member_defs, ParamDef, IndentWriterBase + +def _cname_to_fbname(cname): + return { + "uint32_t": "uint", + "uint64_t": "ulong", + "int32_t": "int", + "float": "float", + "double": "double", + "DTypeEnum": "DTypeEnum", + "bool": "bool", + }[cname] + +def scramble_enum_member_name(name): + if name in ("MIN", "MAX"): + return name + "_" + return name + +class FlatBuffersWriter(IndentWriterBase): + _skip_current_param = False + _last_param = None + _enums = None + _used_enum = None + _cur_const_val = {} + + def __call__(self, fout, defs): + param_io = io.StringIO() + super().__call__(param_io) + self._used_enum = set() + self._enums = {} + self._process(defs) + super().__call__(fout) + self._write("// %s", self._get_header()) + self._write('include "dtype.fbs";') + self._write("namespace mgb.serialization.fbs.param;\n") + self._write_enums() + self._write(param_io.getvalue()) + + def _write_enums(self): + for (p, e) in sorted(self._used_enum): + name = p + e + e = self._enums[(p, e)] + self._write_doc(e.name) + self._write("enum %s%s : uint {", p, e.name, indent=1) + for member in e.members: + self._write_doc(member) + self._write("%s,", scramble_enum_member_name(str(member))) + self._write("}\n", indent=-1) + + def _write_doc(self, doc): + if not isinstance(doc, member_defs.Doc) or not doc.doc: return + doc_lines = [] + if doc.no_reformat: + doc_lines = doc.raw_lines + else: + doc = doc.doc.replace('\n', ' ') + text_width = 80 - len(self._cur_indent) - 4 + doc_lines = textwrap.wrap(doc, text_width) + for line in doc_lines: + self._write("/// " + line) + + def _on_param_begin(self, p): + self._last_param = p + self._cur_const_val = {} + if p.is_legacy: + self._skip_current_param = True + return + self._write_doc(p.name) + self._write("table %s {", p.name, indent=1) + + def _on_param_end(self, p): + if self._skip_current_param: + self._skip_current_param = False + return + self._write("}\n", indent=-1) + + def _on_member_enum(self, e): + p = self._last_param + key = str(p.name), str(e.name) + self._enums[key] = e + if self._skip_current_param: + return + self._write_doc(e.name) + self._used_enum.add(key) + self._write("%s:%s%s = %s;", e.name_field, p.name, e.name, + scramble_enum_member_name(str(e.members[e.default]))) + + def _resolve_const(self, v): + while v in self._cur_const_val: + v = self._cur_const_val[v] + return v + + def _on_member_field(self, f): + if self._skip_current_param: + return + self._write_doc(f.name) + self._write("%s:%s = %s;", f.name, _cname_to_fbname(f.dtype.cname), + self._get_fb_default(self._resolve_const(f.default))) + + def _on_const_field(self, f): + self._cur_const_val[str(f.name)] = str(f.default) + + def _on_member_enum_alias(self, e): + if self._skip_current_param: + return + self._used_enum.add((e.src_class, e.src_name)) + enum_name = e.src_class + e.src_name + self._write( + "%s:%s = %s;", e.name_field, 
enum_name, + scramble_enum_member_name(str(e.src_enum.members[e.get_default()]))) + + def _get_fb_default(self, cppdefault): + if not isinstance(cppdefault, str): + return cppdefault + + d = cppdefault + if d.endswith('f'): # 1.f + return d[:-1] + if d.endswith('ull'): + return d[:-3] + if d.startswith("DTypeEnum::"): + return d[11:] + return d + + +def main(): + parser = argparse.ArgumentParser( + 'generate FlatBuffers schema of operator param from description file') + parser.add_argument('input') + parser.add_argument('output') + args = parser.parse_args() + + with open(args.input) as fin: + inputs = fin.read() + exec(inputs, {'pdef': ParamDef, 'Doc': member_defs.Doc}) + input_hash = hashlib.sha256() + input_hash.update(inputs.encode(encoding='UTF-8')) + input_hash = input_hash.hexdigest() + + writer = FlatBuffersWriter() + with open(args.output, 'w') as fout: + writer.set_input_hash(input_hash)(fout, ParamDef.all_param_defs) + +if __name__ == "__main__": + main() diff --git a/dnn/scripts/gen_heuristic/gen_heuristic.py b/dnn/scripts/gen_heuristic/gen_heuristic.py new file mode 100755 index 00000000..f5579e65 --- /dev/null +++ b/dnn/scripts/gen_heuristic/gen_heuristic.py @@ -0,0 +1,160 @@ +#! /usr/local/env python3 + +import pickle +import numpy as np +import os +import argparse +import re +import collections + +def define_template(**kwargs): + template = ''' + float cuda{cuda_arch}_{conv_type}_time_pred[{out_dim}] = {{0.0f}}; + float cuda{cuda_arch}_{conv_type}_mask[{out_dim}] = {{0.0f}}; + float cuda{cuda_arch}_{conv_type}_hidden_units[{hidden_num}] = {{0.0f}}; + const static size_t cuda{cuda_arch}_{conv_type}_layers_dim[{layer_num}] = {{{layers_dim}}}; + const static float cuda{cuda_arch}_{conv_type}_matrices[{matrices_dim}] = {{{matrices}}}; + const static float cuda{cuda_arch}_{conv_type}_biases[{biases_dim}] = {{{biases}}}; + const static float cuda{cuda_arch}_{conv_type}_alpha[{out_dim}] = {{{alpha}}}; + const static float cuda{cuda_arch}_{conv_type}_beta[{out_dim}] = {{{beta}}}; + ''' + return template.format(**kwargs) + +def cudnn_slt_template(**kwargs): + template = ("#if CUDNN_MAJOR == {cudnn_major} && CUDNN_MINOR == {cudnn_minor}\n" + + " {define_cmd}\n" + + " {select_cmd}\n" + + " return true;\n" + + "#endif\n" + ) + return template.format(**kwargs) + +def select_template(**kwargs): + template = \ + '''if (conv_type == ConvolutionType::{conv_type} && cuda_major == {cuda_major} && + cuda_minor == {cuda_minor}) {{ + *layer_num_p = {layer_num}; + *hidden_units_p = cuda{cuda_arch}_{conv_type}_hidden_units; + *layers_dim_p = cuda{cuda_arch}_{conv_type}_layers_dim; + *matrices_p = cuda{cuda_arch}_{conv_type}_matrices; + *biases_p = cuda{cuda_arch}_{conv_type}_biases; + *alpha_p = cuda{cuda_arch}_{conv_type}_alpha; + *beta_p = cuda{cuda_arch}_{conv_type}_beta; + *time_pred_p = cuda{cuda_arch}_{conv_type}_time_pred; + *mask_p = cuda{cuda_arch}_{conv_type}_mask; + }} else ''' + return template.format(**kwargs) + + +def main(): + fill_src() + + +def fill_src(): + home = os.path.dirname(__file__) + matrix_files = os.listdir(os.path.join(home, "params")) + gen_list = collections.defaultdict(list) + cudnn_slt_cmd = "" + if len(matrix_files) == 0: + print("Warning: no param files detected.") + for fpath in matrix_files: + cudnn_version = re.findall('cudnn([\d.]+)',fpath)[0] + gen_list[cudnn_version].append(fpath) + for cudnn in gen_list: + select_cmd = ("{\n" + + " " * 8 + "return false;\n" + + " " * 4 + "}") + define_cmd = "" + cudnn_major, cudnn_minor = cudnn.split('.') + for fpath in 
gen_list[cudnn]: + cuda_arch = fpath.split("-")[1].replace(".", "_") + print('cudnn_version: {}, cuda_arch: {}'.format(cudnn,cuda_arch)) + conv_type = fpath.split("-")[2].split(".")[0] + with open(os.path.join(home, "params/{}".format(fpath)), "rb") as pobj: + params = pickle.load(pobj) + crt_define_cmd, crt_select_cmd = gen_cmds( + cuda_arch, conv_type, params) + select_cmd = crt_select_cmd + select_cmd + define_cmd = crt_define_cmd + define_cmd + + cudnn_slt_cmd += cudnn_slt_template(cudnn_major=cudnn_major, + cudnn_minor=cudnn_minor, + select_cmd=select_cmd, + define_cmd=define_cmd) + + #select_cmd = select_cmd + with open(os.path.join(home, "get_params.template"), "r") as srcf: + src = srcf.read() + dst = src.replace("{cudnn_select}", cudnn_slt_cmd) + MegDNN_path = os.path.join(home, "../..") + with open(os.path.join(MegDNN_path, + "src/cuda/convolution/get_params.cpp"), "w") as dstf: + dstf.write(dst) + + +def gen_cmds(cuda_arch, conv_type, params): + cuda_major, cuda_minor = cuda_arch.split("_") + alphastr = format_array(params['alpha']).rstrip()[:-1] + betastr = format_array(params['beta']).rstrip()[:-1] + W_list = params['W'] + b_list = params['b'] + Wstr = '' + bstr = '' + layer_num = str(len(b_list) + 1) + layers_dim = [W_list[0].shape[1]] + matrices_dim = 0 + biases_dim = 0 + for W in W_list: + Wstr += format_array(W) + matrices_dim += W.shape[0] * W.shape[1] + for b in b_list: + bstr += format_array(b) + layers_dim.append(b.shape[0]) + biases_dim += b.shape[0] + Wstr = Wstr.rstrip()[:-1] + bstr = bstr.rstrip()[:-1] + + hidden_num = sum(layers_dim[1:-1]) + out_dim = layers_dim[-1] + layers_dim_str = format_array(np.array(layers_dim)).rstrip()[:-1] + + select_cmd = select_template(conv_type=conv_type.upper(), cuda_major=cuda_major, + cuda_minor=cuda_minor, layer_num=layer_num, + cuda_arch=cuda_arch) + define_cmd = define_template(cuda_arch=cuda_arch, conv_type=conv_type.upper(), + hidden_num=hidden_num, + layer_num=layer_num, out_dim=out_dim, + layers_dim=layers_dim_str, + matrices_dim=matrices_dim, matrices=Wstr, + biases_dim=biases_dim, biases=bstr, + alpha=alphastr, beta=betastr) + return (define_cmd, select_cmd) + + +def format_array(array): + flat_array = np.squeeze(array.reshape(1, -1)) + array_str = "" + ind = 0 + if flat_array.dtype == "int": + for ind in range(len(flat_array)): + array_str += str(flat_array[ind]) + ", " + else: + for ind in range(len(flat_array)): + if ind % 4 == 0: + array_str += "\n" + " " * 12 + ele = flat_array[ind] + if abs(ele) < 1.0e-37: + array_str += "0.0, " + else: + array_str += "{:.6e}, ".format(ele) + return array_str + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate cuDNN heuristic code by neural network into" + " {MEGDNN_ROOT}/src/cuda/convolution/get_params.cpp," + " using parameter value from pickle files in" + " {MEGDNN_ROOT}/scripts/gen_heuristic/params/") + args = parser.parse_args() + main() diff --git a/dnn/scripts/gen_heuristic/get_params.template b/dnn/scripts/gen_heuristic/get_params.template new file mode 100644 index 00000000..7abbb8fc --- /dev/null +++ b/dnn/scripts/gen_heuristic/get_params.template @@ -0,0 +1,31 @@ +#include "src/cuda/convolution/cudnn_heuristic.h" +#include "megdnn.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +bool convolution::heuristic_params_available( + int cuda_major, int cuda_minor, size_t* layer_num_p, + const size_t** layers_dim_p, const float** matrices_p, + const float** biases_p, const float** alpha_p, const float** 
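# Illustrative sketch of what gen_heuristic.py above turns each pickled MLP
# weight matrix into: format_array emits a C initializer string, four floats
# per line in scientific notation, flushing near-zero values (|x| < 1e-37)
# to "0.0"; gen_cmds then strips the trailing ", " with .rstrip()[:-1].
# The helper below only mirrors the float branch for a standalone check.
import numpy as np

def _demo_format_floats(array):
    out = ""
    for ind, ele in enumerate(np.squeeze(array.reshape(1, -1))):
        if ind % 4 == 0:
            out += "\n" + " " * 12
        out += "0.0, " if abs(ele) < 1.0e-37 else "{:.6e}, ".format(ele)
    return out

print(_demo_format_floats(np.array([1.5, 0.0, -2.25e-40, 3.0])))
# ->
#             1.500000e+00, 0.0, 0.0, 3.000000e+00,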
beta_p, + const ConvolutionType& conv_type, float** hidden_units_p, + float** time_pred_p, float** mask_p) { + MEGDNN_MARK_USED_VAR(cuda_major); + MEGDNN_MARK_USED_VAR(cuda_minor); + MEGDNN_MARK_USED_VAR(layer_num_p); + MEGDNN_MARK_USED_VAR(layers_dim_p); + MEGDNN_MARK_USED_VAR(matrices_p); + MEGDNN_MARK_USED_VAR(biases_p); + MEGDNN_MARK_USED_VAR(alpha_p); + MEGDNN_MARK_USED_VAR(beta_p); + MEGDNN_MARK_USED_VAR(conv_type); + MEGDNN_MARK_USED_VAR(hidden_units_p); + MEGDNN_MARK_USED_VAR(time_pred_p); + MEGDNN_MARK_USED_VAR(mask_p); + +{cudnn_select} + return false; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/scripts/gen_heuristic/params/cudnn5.1-5_2-backward_data.pickle b/dnn/scripts/gen_heuristic/params/cudnn5.1-5_2-backward_data.pickle new file mode 100644 index 00000000..c3c11468 Binary files /dev/null and b/dnn/scripts/gen_heuristic/params/cudnn5.1-5_2-backward_data.pickle differ diff --git a/dnn/scripts/gen_heuristic/params/cudnn5.1-5_2-backward_filter.pickle b/dnn/scripts/gen_heuristic/params/cudnn5.1-5_2-backward_filter.pickle new file mode 100644 index 00000000..8d4e28fb Binary files /dev/null and b/dnn/scripts/gen_heuristic/params/cudnn5.1-5_2-backward_filter.pickle differ diff --git a/dnn/scripts/gen_heuristic/params/cudnn5.1-5_2-forward.pickle b/dnn/scripts/gen_heuristic/params/cudnn5.1-5_2-forward.pickle new file mode 100644 index 00000000..c4f88a60 Binary files /dev/null and b/dnn/scripts/gen_heuristic/params/cudnn5.1-5_2-forward.pickle differ diff --git a/dnn/scripts/gen_heuristic/params/cudnn6.0-5_2-backward_data.pickle b/dnn/scripts/gen_heuristic/params/cudnn6.0-5_2-backward_data.pickle new file mode 100644 index 00000000..bce0618a Binary files /dev/null and b/dnn/scripts/gen_heuristic/params/cudnn6.0-5_2-backward_data.pickle differ diff --git a/dnn/scripts/gen_heuristic/params/cudnn6.0-5_2-backward_filter.pickle b/dnn/scripts/gen_heuristic/params/cudnn6.0-5_2-backward_filter.pickle new file mode 100644 index 00000000..098cfb83 Binary files /dev/null and b/dnn/scripts/gen_heuristic/params/cudnn6.0-5_2-backward_filter.pickle differ diff --git a/dnn/scripts/gen_heuristic/params/cudnn6.0-5_2-forward.pickle b/dnn/scripts/gen_heuristic/params/cudnn6.0-5_2-forward.pickle new file mode 100644 index 00000000..4b95ba98 Binary files /dev/null and b/dnn/scripts/gen_heuristic/params/cudnn6.0-5_2-forward.pickle differ diff --git a/dnn/scripts/gen_param_defs.py b/dnn/scripts/gen_param_defs.py new file mode 100755 index 00000000..ca388f53 --- /dev/null +++ b/dnn/scripts/gen_param_defs.py @@ -0,0 +1,808 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import collections +import textwrap +import os +import hashlib +import struct + +class member_defs: + """contain classes to define members of an opr param""" + + Dtype = collections.namedtuple('Dtype', ['cname', 'pycvt', 'pyfmt', + 'cppjson', 'cname_attr']) + Dtype.__new__.__defaults__ = ('', ) + uint32 = Dtype('uint32_t', 'int', 'I', 'NumberInt') + uint64 = Dtype('uint64_t', 'int', 'Q', 'NumberInt', + 'alignas(sizeof(uint64_t)) ') + int32 = Dtype('int32_t', 'int', 'i', 'NumberInt') + float32 = Dtype('float', 'float', 'f', 'Number') + float64 = Dtype('double', 'float', 'd', 'Number') + dtype = Dtype('DTypeEnum', '_as_dtype_num', 'I', 'Number') + bool = Dtype('bool', 'bool', '?', 'Bool') + + class Base: + pass + + + class Doc: + """wrap an identifier to associate document + + note: if the doc starts with a linebreak, it would not be reforamtted. 
+ """ + __slots__ = ['id', 'doc'] + + def __init__(self, id_, doc): + assert isinstance(id_, str) and isinstance(doc, str), (id_, doc) + self.id = id_ + self.doc = doc + + @property + def no_reformat(self): + """whether reformat is disallowed for this doc string""" + return self.doc.startswith('\n') + + @property + def raw_lines(self): + """the doc lines when ``no_format`` is true""" + ret = self.doc.split('\n') + assert not ret[0] + return ret[1:] + + @classmethod + def make(cls, v): + """make doc object from str or doc""" + if isinstance(v, cls): + return v + assert isinstance(v, str) + return cls(v, '') + + def __str__(self): + return self.id + + def __eq__(self, rhs): + if isinstance(rhs, str): + return self.id == rhs + return (isinstance(rhs, Doc) and + (self.id, self.doc) == (rhs.id, rhs.doc)) + + + class Enum(Base): + """define an enum; the result would contain both an enum class def and its + corresponding data field + + :param default: index of default member value + + :attr name_field: name of the data field of this enum in the param + struct + :attr member_alias: list of (member, alias) pairs + """ + __slots__ = ['name', 'name_field', 'members', 'default', + 'member_alias'] + + all_enums = {} + """(param_name, name) => enum""" + + def __init__(self, param_name, name, name_field, members, default, + member_alias): + name = member_defs.Doc.make(name) + assert name.id[0].isupper() + members = tuple(map(member_defs.Doc.make, members)) + if isinstance(default, str): + if default not in name_field: + raise ValueError( + "Default value '{}' does not exist.".format(default)) + default = name_field.index(default) + assert isinstance(default, int) + self.name = name + self.name_field = self.get_name_field(name.id, name_field) + self.members = members + self.default = default + + self.all_enums[(param_name, name.id)] = self + + assert isinstance(member_alias, list) + self.member_alias = member_alias + + @classmethod + def get_name_field(cls, name, name_field): + if name_field is None: + name_field = name[0].lower() + name[1:] + assert isinstance(name_field, str) + return name_field + + class Field(Base): + """define a normal data field""" + __slots__ = ['name', 'dtype', 'default'] + + def __init__(self, name, dtype, default): + assert isinstance(dtype, member_defs.Dtype) + self.name = member_defs.Doc.make(name) + self.dtype = dtype + self.default = default + + class Const(Base): + """define a const data field""" + __slots__ = ['name', 'dtype', 'default'] + + def __init__(self, name, dtype, default): + assert isinstance(dtype, member_defs.Dtype) + self.name = member_defs.Doc.make(name) + self.dtype = dtype + self.default = default + + class EnumAlias(Base): + """alias of enum type from another param""" + __slots__ = ['name', 'name_field', 'src_class', 'src_name', 'default'] + + def __init__(self, name, name_field, src_class, src_name, default): + self.name = name + self.name_field = member_defs.Enum.get_name_field(name, name_field) + self.src_class = src_class + if src_name is None: + src_name = name + self.src_name = src_name + self.default = default + + @property + def src_enum(self): + """source Enum class""" + return member_defs.Enum.all_enums[(self.src_class, self.src_name)] + + def get_default(self): + """get default index; fallback to src index if default is not + set""" + if self.default is None: + return self.src_enum.default + return self.default + + +class ParamDef: + """""" + __all_tags = set() + all_param_defs = [] + + __slots__ = ['name', 'members', 'tag', 'is_legacy'] + + def 
__init__(self, name, doc='', *, version=0, is_legacy=False): + self.members = [] + self.all_param_defs.append(self) + h = hashlib.sha256(name.encode('utf-8')) + if version: + h.update(struct.pack(' 0: + self._indent() + + +class PyWriter(IndentWriterBase): + FieldDef = collections.namedtuple( + 'FieldDef', ['name', 'cvt', 'fmt', 'default', 'type', 'doc']) + # see _on_param_end() for the use of those fields + + _cur_param_name = None + _cur_fields = None + _cur_struct_fmt = None + + _enum_member2num = None + + def __call__(self, fout, defs): + super().__call__(fout) + self._enum_member2num = [] + self._write('# %s', self._get_header()) + self._write('import struct') + self._write('from . import enum36 as enum') + self._write( + 'class _ParamDefBase:\n' + ' def serialize(self):\n' + ' tag = struct.pack("I", type(self).TAG)\n' + ' pdata = [getattr(self, i) for i in self.__slots__]\n' + ' for idx, v in enumerate(pdata):\n' + ' if isinstance(v, _EnumBase):\n' + ' pdata[idx] = _enum_member2num[id(v)]\n' + ' return tag + self._packer.pack(*pdata)\n' + '\n' + ) + self._write( + 'class _EnumBase(enum.Enum):\n' + ' @classmethod\n' + ' def __normalize(cls, val):\n' + ' if isinstance(val, str):\n' + ' if not hasattr(cls, "__member_upper_dict__"):\n' + ' cls.__member_upper_dict__ = {k.upper(): v\n' + ' for k, v in cls.__members__.items()}\n' + ' val = cls.__member_upper_dict__.get(val.upper(),val)\n' + ' return val\n' + ' @classmethod\n' + ' def convert(cls, val):\n' + ' val = cls.__normalize(val)\n' + ' if isinstance(val, cls):\n' + ' return val\n' + ' return cls(val)\n' + ' @classmethod\n' + ' def _missing_(cls, value):\n' + ' vnorm = cls.__normalize(value)\n' + ' if vnorm is not value:\n' + ' return cls(vnorm)\n' + ' return super()._missing_(value)\n' + '\n' + ) + self._write( + 'def _as_dtype_num(dtype):\n' + ' import megengine._internal.mgb as m\n' + ' return m._get_dtype_num(dtype)\n' + '\n' + ) + self._write( + ''' +def _as_serialized_dtype(dtype): + import megengine._internal.mgb as m + return m._get_serialized_dtype(dtype) +''' + ) + self._process(defs) + self._write( + ''' +class SerializedDType(_ParamDefBase): + TAG = FakeSerializedDType.TAG + __slots__ = ['dtype'] + class IdentityPacker: + def pack(self, *args): + assert all([isinstance(x, bytes) for x in args]) + return b''.join(args) + _packer = IdentityPacker() + def __init__(self, dtype): + """ + :type dtype: :class:`np.dtype` compatible + """ + self.dtype = _as_serialized_dtype(dtype) +''' + ) + self._write('_enum_member2num = {\n %s}', + ',\n '.join(self._enum_member2num)) + + def _write_doc(self, doc): + assert isinstance(doc, member_defs.Doc) + if not doc.doc: + return + if doc.no_reformat: + self._write('"""') + for i in doc.raw_lines: + self._write(i) + self._write('"""') + return + + doc = doc.doc.replace('\n', ' ') + textwidth = 80 - len(self._cur_indent) + self._write('"""') + for i in textwrap.wrap(doc, textwidth): + self._write(i) + self._write('"""') + + + def _on_param_begin(self, p): + self._cur_param_name = str(p.name) + self._cur_fields = [] + self._cur_enum_names = [] + self._write('class %s(_ParamDefBase):', p.name, indent=1) + self._write_doc(p.name) + self._write('TAG = %d', p.tag) + + def _on_param_end(self, p): + # gen slots and packer + self._write('__slots__ = [%s]', ', '.join( + map('"{.name}"'.format, self._cur_fields))) + struct_fmt = ''.join(i.fmt for i in self._cur_fields) + if not struct_fmt: + struct_fmt = 'x' + else: + # add padding at end + max_t = max(struct_fmt, key=struct.calcsize) + struct_fmt += 
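# Sketch of what the generated _EnumBase.convert accepts (standalone stand-in
# built on the stdlib enum; the real generated module uses the bundled enum36,
# and _DemoMode here is purely hypothetical):
import enum

class _DemoMode(enum.Enum):
    CROSS_CORRELATION = "CROSS_CORRELATION"
    CONVOLUTION = "CONVOLUTION"

    @classmethod
    def convert(cls, val):
        # same idea as _EnumBase: accept a member, a case-insensitive member
        # name, or a raw value
        if isinstance(val, cls):
            return val
        if isinstance(val, str):
            val = {k.upper(): v for k, v in cls.__members__.items()}.get(val.upper(), val)
            if isinstance(val, cls):
                return val
        return cls(val)

assert _DemoMode.convert("convolution") is _DemoMode.CONVOLUTION
assert _DemoMode.convert(_DemoMode.CROSS_CORRELATION) is _DemoMode.CROSS_CORRELATION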
'0{}'.format(max_t) + self._write('_packer = struct.Struct("%s")', struct_fmt) + + # gen __init__ signature + self._write('def __init__(%s):', + ', '.join(['self'] + + list('{}={}'.format(i.name, i.default) + for i in self._cur_fields)), + indent=1) + # gen __init__ doc + self._write('"""') + for i in self._cur_fields: + self._write(':type {}: :class:`.{}`'.format(i.name, i.type)) + if i.doc: + self._write(':param {}: {}'.format(i.name, i.doc)) + self._write('"""') + + # gen cvt in __init__ + for i in self._cur_fields: + self._write('self.%s = %s', i.name, i.cvt) + + self._unindent() + self._unindent() + self._write('') + + def _on_member_enum(self, e): + qualname = '{}.{}'.format(self._cur_param_name, e.name) + + self._write('class %s(_EnumBase):', e.name, indent=1) + self._write_doc(e.name) + + for idx, emem in enumerate(e.members): + self._write('%s = "%s"', emem, emem) + self._write_doc(emem) + self._enum_member2num.append('id({}.{}):{}'.format( + qualname, emem, idx)) + + for emem, emem_alis in e.member_alias: + self._write('%s = %s', emem_alis, emem) + + self._unindent() + self._write('') + + self._cur_fields.append(self.FieldDef( + name=e.name_field, + cvt='{}.convert({})'.format(qualname, e.name_field), + fmt='I', + default="'{}'".format(e.members[e.default]), + type=qualname, + doc=None)) + + def _on_member_enum_alias(self, e): + self._write('%s = %s.%s', e.name, e.src_class, e.src_name) + s = e.src_enum + qualname = '{}.{}'.format(e.src_class, e.src_name) + self._cur_fields.append(self.FieldDef( + name=e.name_field, + cvt='{}.convert({})'.format(qualname, e.name_field), + fmt='I', + default="'{}'".format(s.members[e.get_default()]), + type=qualname, + doc=None)) + + def _get_py_default(self, cppdefault): + if not isinstance(cppdefault, str): + return cppdefault + + d = cppdefault + if d.endswith('f'): # 1.f + return d[:-1] + if d.endswith('ull'): + return d[:-3] + if d == 'false': + return 'False' + if d == 'true': + return 'True' + if d.startswith('DTypeEnum::'): + return '"{}"'.format(d.split(':')[2].lower()) + return d + + def _on_member_field(self, f): + d = self._get_py_default(f.default) + + self._cur_fields.append(self.FieldDef( + name=f.name, + cvt='{}({})'.format(f.dtype.pycvt, f.name), + fmt=f.dtype.pyfmt, + default=d, + type=f.dtype.pycvt, + doc=f.name.doc + )) + + def _on_const_field(self, f): + d = self._get_py_default(f.default) + self._write_doc(f.name) + self._write('%s = %s', f.name, d) + + + +class CPPWriter(IndentWriterBase): + _param_namespace = 'param' + + _ctor_args = None + """list of (text in func param, var name); func param name must be var name + appended by an underscore""" + _non_static_members = None + + def __call__(self, fout, defs): + super().__call__(fout) + self._write('// %s', self._get_header()) + self._write('#pragma once') + self._write('#include "megdnn/dtype.h"') + self._write('#include ') + if self._param_namespace == 'param': + self._write('#include ') + self._write('namespace megdnn {') + self._write('namespace %s {', self._param_namespace) + self._process(defs) + self._write('} // namespace megdnn') + self._write('} // namespace %s', self._param_namespace) + self._write('// vim: syntax=cpp.doxygen') + + def _write_doc(self, doc): + assert isinstance(doc, member_defs.Doc) + if not doc.doc: + return + + if doc.no_reformat: + self._write('/*') + for i in doc.raw_lines: + self._write('* ' + i) + self._write('*/') + return + + doc = doc.doc.replace('\n', ' ') + textwidth = 80 - len(self._cur_indent) - 4 + if len(doc) <= textwidth: + 
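# Why _on_param_end above appends a trailing zero-count code to the struct
# format (sketch): with native alignment, '0<largest-code>' pads the packed
# size up to a multiple of that type's alignment, so the Python-side packer
# matches sizeof() of the corresponding unpacked C++ param struct on typical
# 64-bit ABIs.  Example with one float64 field ('d') and one bool ('?'):
import struct

fields_fmt = 'd?'
max_t = max(fields_fmt, key=struct.calcsize)            # -> 'd' (8 bytes)
assert struct.calcsize(fields_fmt) == 9                 # no trailing padding
assert struct.calcsize(fields_fmt + '0' + max_t) == 16  # == sizeof(struct { double; bool; })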
self._write('//! ' + doc) + return + + self._write('/*!') + for i in textwrap.wrap(doc, textwidth): + self._write(' * ' + i) + self._write(' */') + + def _on_param_begin(self, p): + self._write_doc(p.name) + self._write('struct %s {', p.name, indent=1) + self._write('static MEGDNN_CONSTEXPR uint32_t TAG = %du;', p.tag) + self._ctor_args = [] + self._non_static_members = [] + + def _add_ctor_args(self, typename, default, varname): + self._ctor_args.append(( + '{} {}_={}'.format(typename, varname, default), + varname)) + + def _on_param_end(self, p): + ''' + MegDNN param structures are not packed and we need to initialize the structure + paddings to zero or it would break MegBrain hash system. We do memset(0) in default + ctor and use a trick, wrapping non-static members in a anonymous union which would + copy the object representation in its default copy/move ctor, for copy/move ctor. + > The implicitly-defined copy/move constructor for a non-union class X performs + > a memberwise copy/move of its bases and members. [class.copy.ctor 14] + > The implicitly-defined copy/move constructor for a union X copies the object + > representation (6.9) of X. [class.copy.ctor 15] + ''' + if self._non_static_members: + self._write('union { struct {') + for i in self._non_static_members: + if isinstance(i, member_defs.Field): + self._write_doc(i.name) + self._write('%s%s %s;', i.dtype.cname_attr, i.dtype.cname, i.name) + else: + assert isinstance(i, (member_defs.Enum, member_defs.EnumAlias)) + self._write('%s %s;', i.name, i.name_field) + self._write('}; };') + if self._ctor_args: + pdefs, varnames = zip(*self._ctor_args) + self._write('%s(%s) {', p.name, ', '.join(pdefs), indent=1) + self._write('memset(this, 0, sizeof(*this));') + for var in varnames: + self._write('this->%s = %s_;', var, var) + self._write('}', indent=-1) + self._write('};\n', indent=-1) + + def _on_member_enum(self, e): + self._write_doc(e.name) + self._write('enum class %s: uint32_t {', e.name, indent=1) + for idx, i in enumerate(e.members): + self._write_doc(i) + v = '{} = {}'.format(i, idx) + if i is not e.members[-1] or e.member_alias: + v += ',' + self._write(v) + for mem, alias in e.member_alias: + self._write('%s = %s,', alias, mem) + self._write('};', indent=-1) + self._non_static_members.append(e) + self._write('static MEGDNN_CONSTEXPR uint32_t %s_NR_MEMBER = %d;', + str(e.name).upper(), len(e.members)) + self._add_ctor_args(e.name, + '{}::{}'.format(e.name, e.members[e.default]), + e.name_field) + + def _on_member_enum_alias(self, e): + s = e.src_enum + self._write('using %s = %s::%s;', e.name, e.src_class, e.src_name) + self._non_static_members.append(e) + self._write('static MEGDNN_CONSTEXPR uint32_t %s_NR_MEMBER = %d;', + str(e.name).upper(), len(s.members)) + self._add_ctor_args(e.name, + '{}::{}'.format(e.name, + s.members[e.get_default()]), + e.name_field) + + def _on_member_field(self, f): + self._non_static_members.append(f) + self._add_ctor_args(f.dtype.cname, f.default, f.name) + + def _on_const_field(self, f): + self._write_doc(f.name) + if 'int' in f.dtype.cname: + self._write('static constexpr %s%s %s = %s;', f.dtype.cname_attr, f.dtype.cname, f.name, f.default) + else: + self._write('static const %s%s %s = %s;', f.dtype.cname_attr, f.dtype.cname, f.name, f.default) + + + +class CPPEnumValueWriter(CPPWriter): + _param_namespace = 'param_enumv' + + def _on_member_enum(self, e): + self._write_doc(e.name) + self._write('struct %s {', e.name, indent=1) + for idx, val in enumerate(e.members): + self._write_doc(val) 
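# Rough shape of the C++ that CPPWriter above emits for a small param
# (illustration only: 'DemoParam' and its members are hypothetical, the tag
# value is made up, and the explanatory comments plus whitespace are added
# here -- they are not produced by the writer):
_demo_generated_cpp = '''
struct DemoParam {
    static MEGDNN_CONSTEXPR uint32_t TAG = 42u;
    enum class Mode: uint32_t {
        A = 0,
        B = 1
    };
    static MEGDNN_CONSTEXPR uint32_t MODE_NR_MEMBER = 2;
    // non-static members sit in an anonymous union so the implicit copy/move
    // ctors copy the whole object representation, padding bytes included
    union { struct {
        Mode mode;
        uint32_t pad_h;
    }; };
    DemoParam(Mode mode_=Mode::A, uint32_t pad_h_=0) {
        memset(this, 0, sizeof(*this));   // zero padding for MegBrain hashing
        this->mode = mode_;
        this->pad_h = pad_h_;
    }
};
'''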
+ self._write('static const uint32_t %s = %d;', val, idx) + for mem, alias in e.member_alias: + self._write('static const uint32_t %s = %s;', alias, mem) + self._write('};', indent=-1) + + + def _on_member_enum_alias(self, e): + s = e.src_enum + self._write('typedef %s::%s %s;', e.src_class, e.src_name, e.name) + + def _on_member_field(self, f): + pass + + def _on_const_field(self, f): + pass + + +class CPPEnumItemWriter(WriterBase): + _class_name = None + _enum_name = None + _enable = False + + def __init__(self, enum_def): + self._class_name, self._enum_name = enum_def.split(':') + + def __call__(self, fout, defs): + super().__call__(fout) + self._process(defs) + + def _on_param_begin(self, p): + self._enable = p.name == self._class_name + + def _on_member_enum(self, e): + if self._enable and e.name == self._enum_name: + for i in e.members: + self._fout.write('{}\n'.format(i)) + +class CPPParamJsonFuncWriter(IndentWriterBase): + _param_namespace = 'param' + _param_name = None + _items = None + def _write_json_item(self, json_cls, field): + cls2ctype = { + 'NumberInt': 'int64_t', + 'Number': 'double', + 'Bool': 'bool', + } + self._items.append('{"%s", json::%s::make(static_cast<%s>(p.%s))},' % ( + field, json_cls, cls2ctype[json_cls], field)) + + + def __call__(self, fout, defs): + super().__call__(fout) + self._write('// %s', self._get_header()) + self._write('// this file can only be included in ' + 'megbrain/src/plugin/impl/opr_footprint.cpp\n' + '// please do not include it directly') + self._write('#include "megdnn/opr_param_defs.h"') + self._write('#pragma once') + self._write('using namespace megdnn;') + self._write('namespace mgb {') + self._write('namespace opr {') + self._write('template') + self._write('std::shared_ptr opr_param_to_json(const OprParam ¶m);') + self._process(defs) + self._write('} // namespace opr') + self._write('} // namespace mgb') + self._write('\n// vim: syntax=cpp.doxygen') + + def _on_param_begin(self, p): + self._write('template<>', indent=0) + self._write( + 'std::shared_ptr opr_param_to_json(const param::%s &p) {', + p.name, indent=1) + self._param_name = 'param::{}'.format(p.name) + self._items = [] + + def _on_param_end(self, p): + self._write('return json::Object::make({', indent=1) + for i in self._items: + self._write(i, indent=0) + self._write('});', indent=-1) + self._write('}', indent=-1) + + def _on_member_enum(self, e): + self._write('auto %s2str = [](const %s::%s arg) -> std::string {', + e.name, self._param_name, e.name, indent=1) + self._write('switch (arg) {', indent=1) + enum2str = [] + if isinstance(e, member_defs.EnumAlias): + members = e.src_enum.members + else: + members = e.members + for idx, i in enumerate(members): + self._write('case %s::%s::%s: return "%s";', + self._param_name, e.name, i, i, indent=0) + self._write('default: mgb_throw(MegBrainError, "Invalid %s::%s:%%d", static_cast(arg));', + self._param_name, e.name, indent=0) + self._write('}', indent=-1) + self._write('};', indent=-1) + self._items.append('{"%s", json::String::make(%s2str(p.%s))},' % ( + e.name_field, e.name, e.name_field)) + + def _on_member_enum_alias(self, e): + self._on_member_enum(e) + + def _on_member_field(self, f): + self._write_json_item(f.dtype.cppjson, f.name) + + def _on_const_field(self, f): + pass + + +def main(): + parser = argparse.ArgumentParser( + 'generate opr param defs from description file') + parser.add_argument('--enumv', action='store_true', + help='generate c++03 compatible code which only ' + 'contains enum values') + 
parser.add_argument('-t', '--type', choices=['c++', 'py'], default='c++', + help='output type') + parser.add_argument('--write-enum-items', + help='write enum item names to output file; argument ' + 'should be given in the CLASS:ENUM format') + parser.add_argument('--write-cppjson', + help='generate megbrain json serialization implemention' + 'cpp file') + parser.add_argument('input') + parser.add_argument('output') + args = parser.parse_args() + + with open(args.input) as fin: + inputs = fin.read() + exec(inputs, {'pdef': ParamDef, 'Doc': member_defs.Doc}) + input_hash = hashlib.sha256() + input_hash.update(inputs.encode(encoding='UTF-8')) + input_hash = input_hash.hexdigest() + + if args.type == 'py': + writer = PyWriter() + else: + assert args.type == 'c++' + if args.enumv: + writer = CPPEnumValueWriter() + elif args.write_enum_items: + writer = CPPEnumItemWriter(args.write_enum_items) + else: + writer = CPPWriter() + with open(args.output, 'w') as fout: + writer.set_input_hash(input_hash)(fout, ParamDef.all_param_defs) + + if args.write_cppjson: + writer = CPPParamJsonFuncWriter() + with open(args.write_cppjson, 'w') as fout: + writer.set_input_hash(input_hash)(fout, ParamDef.all_param_defs) + +if __name__ == '__main__': + main() diff --git a/dnn/scripts/opr_param_defs.py b/dnn/scripts/opr_param_defs.py new file mode 100644 index 00000000..fa5a90a4 --- /dev/null +++ b/dnn/scripts/opr_param_defs.py @@ -0,0 +1,919 @@ +pdef('Empty') + +pdef('Axis').add_fields('int32', 'axis', 0) + +(pdef('Convolution', version=0, is_legacy=True). + add_enum('Mode', 'CROSS_CORRELATION', 'CONVOLUTION'). + add_fields( + 'uint32', + Doc('pad_h', 'padding on one side on the first dimension'), 0, + Doc('pad_w', 'padding on one side on the second dimension'), 0, + Doc('stride_h', 'kernel stride on the first dimension'), 1, + Doc('stride_w', 'kernel stride on the second dimension'), 1, + Doc('dilate_h', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('dilate_w', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1 + ). + add_enum('DataType', + Doc('FLOAT', 'input/output both float32/float16'), + 'INT8x8x16', + 'INT8x8x32', + Doc('FLOAT_IO16xC32', 'input/output both float16, the internal ' + 'compute is float32'), + Doc('QUINT8x8x32', 'input QuantizedAsymm8, output QuantizedS32'), + Doc('INT8x8xX', 'input int8, output specified by tensor DType'), + Doc('QUINT4x4x32', 'input QuantizedAsymm4, output QuantizedS32'), + name_field='data_type'). + add_enum('Sparse', + Doc('DENSE', 'dense convolution: filter shape should be ' + '[oc, ic, spatial...] if format is NCHW, ' + '[oc, spatial..., ic] if format is NHWC'), + Doc('GROUP', 'group convolution: filter shape should be ' + '[group, oc_per_group, ic_per_group, spatial...] if format is NCHW, ' + '[group, oc_per_group, spatial..., ic_per_group] if format is NHWC') + ). + add_enum(Doc('Format', 'convolution data/filter/output format; see ' + ':class:`RelayoutFormat` for more details'), + 'NCHW', 'NHWC', 'NHWCD4', 'NCHW4', 'NCHW8', 'NCHW32', 'NCHW88', + Doc('NCHW_WINOGRAD', 'NCHW layout with weights tranformed by winograd'), + Doc('NCHW88_WINOGRAD', 'NCHW88 layout with weights tranformed by winograd'), + Doc('CHWN4', 'CHWN4 is currently only used on Nvidia platform for fast implementation ' + 'of convolution using CUDA/SASS. The channels are splitted to groups of 4 channels.')) + ) + +(pdef('Convolution', version=1). + add_enum_alias('Mode', 'ConvolutionV0'). 
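# Illustration of the pdef DSL that the rest of this file uses (kept commented
# out so it does not register a real param; 'DemoOp' and its members are
# hypothetical).  Each chained call records enums/fields that
# gen_param_defs.py and gen_flatbuffers_schema.py later turn into C++, Python
# and FlatBuffers definitions:
#
# (pdef('DemoOp', 'toy example').
#  add_enum('Mode', Doc('FAST', 'low-precision path'), 'ACCURATE').
#  add_enum_alias('Format', 'ConvolutionV0').
#  add_fields('uint32', Doc('window', 'sliding window size'), 3).
#  add_fields('bool', 'inplace', 'false'))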
+ add_fields( + 'uint32', + Doc('pad_h', 'padding on one side on the first dimension'), 0, + Doc('pad_w', 'padding on one side on the second dimension'), 0, + Doc('stride_h', 'kernel stride on the first dimension'), 1, + Doc('stride_w', 'kernel stride on the second dimension'), 1, + Doc('dilate_h', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('dilate_w', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1 + ). + add_enum_alias('Sparse', 'ConvolutionV0'). + add_enum_alias('Format', 'ConvolutionV0'). + add_enum(Doc('ComputeMode', 'Specifies special computation modes, e.g. ' + 'different combinations of intermediate result ' + 'data types.'), + Doc('DEFAULT', 'No special requirements on the precision of ' + 'intermediate results.'), + Doc('FLOAT32', 'Use Float32 accumulator and intermediate result. ' + 'Only supported when input and output is Float16.'), + name_field='compute_mode') + ) + +(pdef('MaskPropagate'). + add_fields( + 'uint32', + Doc('pad_h', 'padding on one side on the first dimension'), 0, + Doc('pad_w', 'padding on one side on the second dimension'), 0, + Doc('stride_h', 'kernel stride on the first dimension'), 1, + Doc('stride_w', 'kernel stride on the second dimension'), 1, + Doc('kernel_h', 'kernel height'), 1, + Doc('kernel_w', 'kernel width'), 1, + Doc('dilate_h', 'dilate height'), 1, + Doc('dilate_w', 'dilate width'), 1) + ) + +(pdef('ConvPooling'). + add_enum('Method', 'WITH_TEXTURE_OBJ', 'WITH_SHARED_MEM'). + add_enum_alias('ConvMode', 'ConvolutionV0', 'Mode'). + add_enum('PoolMode', 'AVERAGE', 'MAX'). + add_enum('NonlineMode', 'IDENTITY', 'RELU', 'SIGMOID'). + add_fields('uint32', 'pool_shape_h', 1, 'pool_shape_w', 1, 'pool_stride_h', 1, 'pool_stride_w', 1, \ + 'pool_pad_h', 0, 'pool_pad_w', 0, 'conv_stride_h', 1, 'conv_stride_w', 1, 'conv_pad_h', 0, 'conv_pad_w', 0)) + +(pdef('ConvBias', 'legacy conv_bias', version=0, is_legacy=True). + add_enum('NonlineMode', 'IDENTITY', 'RELU', 'SIGMOID', 'H_SWISH'). + add_enum_alias('Mode', 'ConvolutionV0'). + add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 1, 'stride_w', 1)) + +(pdef('ConvBias', 'active(conv(x, w) + bias)', version=1, is_legacy=True). + add_enum_alias('NonlineMode', 'ConvBiasV0'). + add_enum_alias('Mode', 'ConvolutionV0'). + add_enum_alias('DataType', 'ConvolutionV0', name_field='data_type'). + add_enum_alias('Sparse', 'ConvolutionV0'). + add_enum_alias('Format', 'ConvolutionV0'). + add_fields( + 'uint32', + Doc('pad_h', 'padding on one side on the first dimension'), 0, + Doc('pad_w', 'padding on one side on the second dimension'), 0, + Doc('stride_h', 'kernel stride on the first dimension'), 1, + Doc('stride_w', 'kernel stride on the second dimension'), 1, + Doc('dilate_h', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('dilate_w', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1) + ) + +(pdef('ConvBias', 'active(conv(x, w) + bias)', version=2, is_legacy=True). + add_enum_alias('NonlineMode', 'ConvBiasV0'). + add_enum_alias('Mode', 'ConvolutionV0'). + add_enum_alias('Sparse', 'ConvolutionV0'). + add_enum_alias('Format', 'ConvolutionV0'). 
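# The two members of the Convolution ``Mode`` enum above differ only in
# whether the kernel is flipped: CROSS_CORRELATION slides the filter as-is,
# CONVOLUTION flips it in both spatial dimensions.  Quick numpy/scipy check
# (illustration only; the MegDNN kernels do not go through scipy):
import numpy as np
from scipy.signal import convolve2d, correlate2d

x = np.arange(16.0).reshape(4, 4)
w = np.array([[1.0, 2.0], [3.0, 4.0]])
assert np.allclose(convolve2d(x, w, mode='valid'),               # CONVOLUTION
                   correlate2d(x, w[::-1, ::-1], mode='valid'))  # flipped CROSS_CORRELATION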
+ add_fields( + 'uint32', + Doc('pad_h', 'padding on one side on the first dimension'), 0, + Doc('pad_w', 'padding on one side on the second dimension'), 0, + Doc('stride_h', 'kernel stride on the first dimension'), 1, + Doc('stride_w', 'kernel stride on the second dimension'), 1, + Doc('dilate_h', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('dilate_w', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1). + add_enum_alias('ComputeMode', 'Convolution', name_field='compute_mode') + ) + +(pdef('ConvBias', 'active(conv(x, w) + bias)', version=3). + add_enum_alias('NonlineMode', 'ConvBiasV0'). + add_enum_alias('Mode', 'ConvolutionV0'). + add_enum_alias('Sparse', 'ConvolutionV0'). + add_enum_alias('Format', 'ConvolutionV0'). + add_fields( + 'uint32', + Doc('pad_h', 'padding on one side on the first dimension'), 0, + Doc('pad_w', 'padding on one side on the second dimension'), 0, + Doc('stride_h', 'kernel stride on the first dimension'), 1, + Doc('stride_w', 'kernel stride on the second dimension'), 1, + Doc('dilate_h', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('dilate_w', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('output_block_size', 'detail meaning \see winograd in conv bias'), 0). + add_enum_alias('ComputeMode', 'Convolution', name_field='compute_mode') + ) + +(pdef('SeparableConv'). + add_enum_alias('Mode', 'ConvolutionV0'). + add_enum('BorderMode', 'BORDER_REPLICATE', 'BORDER_REFLECT', + 'BORDER_REFLECT_101','BORDER_WRAP', + 'BORDER_CONSTANT', 'BORDER_TRANSPARENT','BORDER_ISOLATED'). + add_fields('bool', 'is_symm_kernel', 'true'). + add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 1, 'stride_w', 1, + 'ksize_h', 3, 'ksize_w', 3, 'anchor_h', 1, 'anchor_w', 1)) + +(pdef('Images2Neibs'). + add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 1, 'stride_w', 1, + 'window_h', 3, 'window_w', 3)) + +(pdef('Pooling'). + add_enum( + 'Mode', + Doc('MAX', 'maximum value inside pooling window'), + Doc('AVERAGE', + 'arithmetic mean of all values inside pooling window. Padding values ' + 'are taken into account and are viewed as zero'), + Doc('AVERAGE_COUNT_EXCLUDE_PADDING', + 'arithmetic mean of all values inside pooling window. No padding is' + 'used.') + ). + add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 2, 'stride_w', 2, + 'window_h', 2, 'window_w', 2). + add_enum_alias('Format', 'ConvolutionV0') + ) + +(pdef('LRN', + 'see ImageNet Classification with Deep Convolutional Neural Networks for' + ' meaning of the fields'). + add_fields('uint32', Doc('n', 'must be odd'), 5). + add_fields('float32', 'k', '2.f', 'alpha', '1e-4f', 'beta', '0.75f') +) + +(pdef('BN'). + add_enum( + 'ParamDim', + Doc('DIM_11HW', 'Dim of params (Sigma, Mu) is 1 x 1 x H x W'), + Doc('DIM_1CHW', 'Dim of params (Sigma, Mu) is 1 x C x H x W'), + Doc('DIM_1C11', 'Dim of params (Sigma, Mu) is 1 x C x 1 x 1'), + name_field='param_dim' + ). + add_enum( + 'FwdMode', + Doc('TRAINING', 'Training phase.'), + Doc('INFERENCE', 'Inference phase.'), + name_field='fwd_mode' + ). + add_fields('float64', 'epsilon', '1e-4f'). + add_fields('float64', 'avg_factor', '1.f'). + add_fields('float32', 'scale', '1.f'). + add_fields('float32', 'bias', '0.f') +) + +(pdef('ROIPooling'). 
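# The two average modes of Pooling above differ only in the denominator used
# for windows that overlap the zero padding (numpy sketch, not the MegDNN
# kernel).  A 2x2 window at the corner of a padded 1x1 feature map:
import numpy as np

padded = np.pad(np.array([[5.0]]), 1)       # pad_h = pad_w = 1
window = padded[0:2, 0:2]                   # covers three padding zeros and the 5
avg_including_padding = window.sum() / 4    # AVERAGE                       -> 1.25
avg_excluding_padding = window.sum() / 1    # AVERAGE_COUNT_EXCLUDE_PADDING -> 5.0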
+ add_enum( + 'Mode', + Doc('MAX', 'maximum value inside pooling window; pooling result would ' + 'be 0 if pooling window is empty'), + Doc('AVERAGE', + 'arithmetic mean of all values inside pooling window; pooling result ' + 'would be 0 if pooling window is empty') + ). + add_fields('float32', 'scale', '1.f')) + +INTERP_MODES = ['NEAREST', 'LINEAR', 'AREA', 'CUBIC', 'LANCZOS4'] +BORDER_MODES = [Doc('REPLICATE', 'aaaaaa|abcdefgh|hhhhhhh'), + Doc('REFLECT', 'fedcba|abcdefgh|hgfedcb'), + Doc('REFLECT_101', 'gfedcb|abcdefgh|gfedcba'), + Doc('WRAP', 'cdefgh|abcdefgh|abcdefg'), + Doc('CONSTANT', 'iiiiii|abcdefgh|iiiiiii'), + Doc('TRANSPARENT', ''), + Doc('ISOLATED', '')] +(pdef('WarpPerspective', version=1). + add_enum('InterpolationMode', *INTERP_MODES, + name_field='imode', default=1, + member_alias=[(i, 'INTER_{}'.format(i)) for i in INTERP_MODES] + ). + add_enum('BorderMode', *BORDER_MODES, + name_field='bmode', + member_alias=[(i, 'BORDER_{}'.format(i)) for i in BORDER_MODES] + ). + add_enum_alias('Format', 'ConvolutionV0'). + add_fields('float32', Doc('border_val', 'used for CONSTANT bmode'), '.0f')) + +pdef('SpatialTfGridGenerator').add_enum('Mode', 'AFFINE') +pdef('SpatialTfSampler').add_enum('Mode', 'BILINEAR') + +pdef('AddUpdate').add_fields( + 'float32', 'alpha', '1.f', 'beta', '1.f', 'bias', '0.f') + +pdef('Elemwise').add_enum( + 'Mode', + Doc('RELU', 'unary: max(x, 0)'), + Doc('ABS', 'unary: abs(x)'), + Doc('ACOS', 'unary: acos(x)'), + Doc('ASIN', 'unary: asin(x)'), + Doc('CEIL', 'unary: ceil(x)'), + Doc('COS', 'unary: cos(x)'), + Doc('EXP', 'unary: exp(x)'), + Doc('EXPM1', 'unary: numerically stable exp(x)-1'), + Doc('FLOOR', 'unary: floor(x)'), + Doc('LOG', 'unary: natural logarithm, log(x)'), + Doc('LOG1P', 'unary: numerically stable log(x+1)'), + Doc('NEGATE', 'unary: -x'), + Doc('SIGMOID', 'unary: 1/(1+exp(-x))'), + Doc('SIN', 'unary: sin(x)'), + Doc('TANH', 'unary: tanh(x)'), + + Doc('ABS_GRAD', 'binary: x > 0 ? y : -y'), + Doc('ADD', 'binary: x + y'), + Doc('FLOOR_DIV', 'binary: floor(x / y)'), + Doc('MAX', 'binary: max(x, y)'), + Doc('MIN', 'binary: min(x, y)'), + Doc('MOD', 'binary: x % y or fmodf(x, y)'), + Doc('MUL', 'binary: x * y'), + Doc('POW', 'binary: pow(x, y)'), + Doc('SIGMOID_GRAD', 'binary: x * (1 - x) * y'), + Doc('SUB', 'binary: x - y'), + Doc('SWITCH_GT0', 'binary: (x > 0) * y'), + Doc('TANH_GRAD', 'binary: (1 - x * x) * y'), + Doc('TRUE_DIV', 'binary: x / y'), + Doc('LOG_SUM_EXP', 'binary: numerically stable log(exp(x) + exp(y))'), + + Doc('LT', 'binary: x < y'), + Doc('LEQ', 'binary: x <= y'), + Doc('EQ', 'binary: x == y'), + + Doc('SHL', 'bitwise binary: x << y. ' + 'Note that result is undefined if y < 0 or y >= bitwidth. Logical ' + 'shift is performed for unsigned intergers, and arithmetic shift for ' + 'signed ones.'), + Doc('SHR', 'bitwise binary: x >> y; see SHL mode for more details'), + + Doc('COND_LEQ_MOV', 'ternary: x <= y ? z : 0'), + Doc('FUSE_MUL_ADD3', + 'compute ``a * b + c`` where c must either have same layout as ' + 'a or b, or be a scalar'), + + Doc('FUSE_MUL_ADD4', + 'compute ``a * A + b * B`` where a and b must have equal layout, ' + 'and A and B must have equal layout. 
In the inputs ``b`` and ``B`` ' + 'can be swapped'), + + Doc('FUSE_ADD_RELU', 'binary: max(x+y, 0)'), + Doc('FUSE_ADD_SIGMOID', 'binary: 1/(1+exp(-(x+y)))'), + Doc('FUSE_ADD_TANH', 'binary: tanh(x+y)'), + Doc('FAST_TANH', 'unary: rational approximation of tanh(x)'), + Doc('FAST_TANH_GRAD', 'binary: grad of the rational approximation of tanh(x)'), + + Doc('ROUND', 'unary: round(x), the nearest integer value to x, rounding ' + 'halfway cases away from zero. Float only.'), + Doc('RMULH', 'binary: rounded higher l bits of x * y, where l is the bit ' + 'length of x.'), + + Doc('ATAN2','binary: atan2(y,x)'), + Doc('ERF', 'unary: erf(x)'), + Doc('ERFINV', 'unary: inverse function of erf(x)'), + Doc('ERFC', 'unary: erfc(x)'), + Doc('ERFCINV', 'unary: inverse function of erfc(x)'), + Doc('H_SWISH', 'unary: x * clip(x + 3, 0, 6) / 6'), + Doc('H_SWISH_GRAD', 'binary: x < -3 ? 0 : (x > 3 ? y : (2 * x + 3) / 6 * y)'), + Doc('FUSE_ADD_H_SWISH', 'binary: hswish(x+y)') +) + +pdef('ElemwiseMultiType').add_enum( + 'Mode', + Doc('FUSE_MUL_ADD3_INT16x32x32x32', + 'compute ``a * b + c`` requiring that ``a`` be int16 and ``b`` and ' + '``c`` int32, and the result is int32. This mode is optimized for ' + 'the channel-broadacsted case, i.e. ``a`` has shape (A, B, C) and ' + '``b`` and ``c`` have shape (1, C, 1)'), + Doc('FUSE_MUL_ADD3_IXxF32xF32xI8', + 'compuate ``a * b + c`` where the inputs ``a`` is an integer type ' + '``b`` and ``c`` are both ``float32``, the result is ' + '``int8``. This is currently only optimized for ``(1, x)`` ' + 'broadcast for ``b`` and ``c``. Computation is carried in floating ' + 'points and results are rounded towards zero with saturated cast to ' + 'int.'), + Doc('ROUND_SHR_SATURATE_IXxI8xI8', + 'Compute ``a >> b``, round the result according to lower ``b`` bits ' + 'of ``a``` and make a saturating conversion to int8. Where ``a`` should' + ' be an integer tensor and ``b`` should be an int8 scalar.'), + Doc('FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT16x16x16x8', + 'Fused operation of an int16 elemwise add, an int16 rounding multiply ' + 'high and an int16 to int8 rounding right shift with saturation.'), + Doc('FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT32x32x32x8', + 'Fused operation of an int32 elemwise add, an int32 rounding multiply ' + 'high and an int32 to int8 rounding right shift with saturation.'), + Doc('ROUND_SHR_SATURATE_IXxI8xI16', + 'Compute ``a >> b``, round the result according to lower ``b`` bits of ' + '``a``` and make a saturating conversion to int16. Where ``a`` should' + ' be an integer tensor and ``b`` should be an int8 scalar.'), + Doc('QADD', 'Fused elemwise add two quantized int8 with specified' + 'output quantized dtype'), + Doc('QFUSE_ADD_RELU', 'Fused elemwise add two quantized int8 followed' + ' by ReLU and typecvt to specified dtype'), + Doc('QMUL', 'Fused elemwise multiply two quantized int8 with specified' + 'output quantized dtype'), + Doc('QMIN', 'Fused elemwise min two quantized int8 with specified' + 'output quantized dtype'), + Doc('QMAX', 'quantized: max(x, y), with specified output quantized dtype'), + Doc('QSUB', 'quantized: x - y'), + Doc('QTRUE_DIV', 'quantized: x / y'), + Doc('QFUSE_ADD_SIGMOID', 'quantized: sigmoid(x + y)'), + Doc('QFUSE_ADD_TANH', 'quantized: tanh(x + y)'), + Doc('QRELU', 'quantized: x > 0 ? x : 0'), + Doc('QABS', 'quantized: x > 0 ? 
x : -x'), + Doc('QSIGMOID', 'quantized: sigmoid(x)'), + Doc('QEXP', 'quantized: exp(x)'), + Doc('QTANH', 'quantized: tanh(x)'), + Doc('QFUSE_MUL_ADD3', 'quantized: x * y + z'), + Doc('QFAST_TANH', 'quantized: fast_tanh(x)'), + Doc('QNEGATE', 'quantized: -x'), + Doc('QACOS', 'quantized: acos(x)'), + Doc('QASIN', 'quantized: asin(x)'), + Doc('QCEIL', 'quantized: ceil(x)'), + Doc('QCOS', 'quantized: cos(x)'), + Doc('QEXPM1', 'quantized: expm1(x)'), + Doc('QFLOOR', 'quantized: floor(x)'), + Doc('QLOG', 'quantized: log(x)'), + Doc('QLOG1P', 'quantized: log1p(x)'), + Doc('QSIN', 'quantized: sin(x)'), + Doc('QROUND', 'quantized: round(x)'), + Doc('QERF', 'quantized: erf(x)'), + Doc('QERFINV', 'quantized: erfinv(x)'), + Doc('QERFC', 'quantized: erfc(x)'), + Doc('QERFCINV', 'quantized: erfcinv(x)'), + Doc('QABS_GRAD', 'quantized: abs_grad'), + Doc('QFLOOR_DIV', 'quantized floor_div'), + Doc('QMOD', 'quantized mod'), + Doc('QSIGMOID_GRAD', 'quantized sigmoid_grad'), + Doc('QSWITCH_GT0', 'quantized switch_gt0'), + Doc('QTANH_GRAD', 'quantized tanh_grad'), + Doc('QLT', 'quantized lt'), + Doc('QLEQ', 'quantized leq'), + Doc('QEQ', 'quantized eq'), + Doc('QPOW', 'quantized pow'), + Doc('QLOG_SUM_EXP', 'quantized log_sum_exp'), + Doc('QFAST_TANH_GRAD', 'quantized fast_tanh_grad'), + Doc('QATAN2', 'quantized atan2'), + Doc('QCOND_LEQ_MOV', 'quantized cond_leq_mov'), + Doc('QH_SWISH', 'quantized h_swish'), + Doc('QFUSE_ADD_H_SWISH', 'quantized h_swish(x+y)'), + Doc('QH_SWISH_GRAD', 'quantized h_swish_grad') +) + +pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0) + +(pdef('MatrixMul', version=0, is_legacy=True). + add_fields('bool', 'transposeA', 'false', 'transposeB', 'false'). + add_enum('DataType', + Doc('FLOAT', 'input/output both float32/float16'), + 'INT8x8x16', + 'INT8x8x32', + Doc('FLOAT_IO16xC32', 'input/output both float16, the internal compute is ' + 'float32'), + Doc('QUINT8x8x32', 'input QuantizedAsymm8, output QuantizedS32'), + Doc('QUINT4x4x32', 'input QuantizedAsymm4, output QuantizedS32'), + name_field='data_type')) + +(pdef('MatrixMul', version=1, is_legacy=True). + add_fields('bool', 'transposeA', 'false', 'transposeB', 'false'). + add_enum(Doc('ComputeMode', 'Specifies special computation modes, e.g. ' + 'different combinations of intermediate result ' + 'data types.'), + Doc('DEFAULT', 'No special requirements on the precision of ' + 'intermediate results.'), + Doc('FLOAT32', 'Use Float32 accumulator and intermediate result. ' + 'Only supported when input and output is Float16.'), + name_field='compute_mode')) + +(pdef('MatrixMul', version=2). + add_fields('bool', 'transposeA', 'false', 'transposeB', 'false'). + add_enum_alias('ComputeMode', 'MatrixMulV1', name_field='compute_mode'). + add_enum('Format', + Doc('DEFAULT', 'Normal matrix mul: (M, K) x (K, N) = (M, N)'), + Doc('MK4', 'Split 4 from M and K, better for neon compute:' + '(M/4, K/4, 4(k), 4(m)) x (K/4, N, 4(k)). if transposeA the ' + 'layout is (K/4, M/4, 4(k), 4(m)) x (K/4, N, 4(k))'), + Doc('MK8', 'Split 8 from M and K, better for neon compute:' + '(M/8, K/8, 8(k), 8(m)) x (K/8, N, 8(k)). if transposeA the ' + 'layout is (K/8, M/8, 8(k), 8(m)) x (K/8, N, 8(k))')) + ) + +(pdef('Winograd', 'winograd param used in convbias'). + add_fields( + 'uint32', + Doc('output_block_size', 'output block size, detail meaning see winograd ' + 'in convbias, equals to the meaning of m in F(m, r)'), 0). + add_enum_alias('Format', 'MatrixMul') + ) + +(pdef('SVD'). 
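# One reading of the MK4 format documented above (numpy sketch; the index
# order is inferred from the "(M/4, K/4, 4(k), 4(m))" notation, taking k as
# the second packed index and m as the innermost one):
import numpy as np

M, K = 8, 16
a = np.arange(M * K, dtype=np.float32).reshape(M, K)              # plain (M, K)
a_mk4 = a.reshape(M // 4, 4, K // 4, 4).transpose(0, 2, 3, 1)     # (M/4, K/4, 4(k), 4(m))
assert a_mk4[1, 2, 3, 0] == a[1 * 4 + 0, 2 * 4 + 3]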
+ add_fields('bool', + Doc('full_matrices', + 'Whether to compute the full-sized u and v or only the leading' + ' min(m, n) singular vectors. Ignored if compute_uv is ' + 'false.'), + 'false', + Doc('compute_uv', + 'Whether the left (u) and right (v) singular vectors will be ' + 'computed and outputted.'), + 'true')) + +(pdef('Reduce', 'legacy reduce', version=0, is_legacy=True). + add_enum('Mode', + 'SUM', + Doc('SUM_SQR', 'sum of x * x for each element x'), + 'PRODUCT', 'MIN', 'MAX'). + add_fields('int32', + Doc('axis', + 'axis along which reduction is performed; if -1 is given, ' + 'reduce to given target shape (only used in megbrain)'), + -1)) + +(pdef('Reduce', 'reduce along given axis', version=1, is_legacy=True). + add_enum('Mode', + 'SUM', + Doc('SUM_SQR', 'sum of x * x for each element x'), + 'PRODUCT', 'MIN', 'MAX', 'MEAN'). + add_fields('int32', + Doc('axis', + 'axis along which reduction is performed; if -1 is given, ' + 'reduce to given target shape (only used in megbrain)'), + -1). + add_enum('DataType', + Doc('DEFAULT', +''' +input/output are the same data type, and the internal computation type would be chosen by the input/output dtypes and the reduction mode. +Currently, ```DEFAULT``` mode means: + ++--------------------+-----------------------------------+-------------------+ +| Input/Output DType | Mode | Computation DType | ++====================+===================================+===================+ +| FLOAT32 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | FLOAT32 | ++--------------------+-----------------------------------+-------------------+ +| FLOAT16 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | FLOAT16 | ++--------------------+-----------------------------------+-------------------+ +| INT32 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | INT32 | ++--------------------+-----------------------------------+-------------------+ +| INT8 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | INT8 | ++--------------------+-----------------------------------+-------------------+ +| QuantizedS8 | MIN/MAX | QuantizedS8 | ++--------------------+-----------------------------------+-------------------+ +| QuantizedS8 | MEAN/SUM | QuantizedS32 | ++--------------------+-----------------------------------+-------------------+ +| Quantized8Asymm | MIN/MAX | Quantized8Asymm | ++--------------------+-----------------------------------+-------------------+ +| Quantized8Asymm | MEAN/SUM | QuantizedS32 | ++--------------------+-----------------------------------+-------------------+ + +''' +), + Doc('FLOAT_IO16xC32', 'Deprecated. This was replaced by ' + 'FLOAT_O16xC32, and input\'s dtype decided by actual input tensor.'), + Doc('FLOAT_O32xC32', 'compute/output both are float32'), + Doc('FLOAT_O16xC32', 'compute are float32, output float16'), + Doc('QUINT_I8xO32', 'input quint8, compute and output are qint32'), + Doc('QINT_I8xO32', 'input qint8, compute and output are qint32'), + name_field='data_type')) + +(pdef('Reduce', 'reduce along given axis', version=2). + add_enum('Mode', + 'SUM', + Doc('SUM_SQR', 'sum of x * x for each element x'), + 'PRODUCT', 'MIN', 'MAX', 'MEAN'). + add_fields('int32', + Doc('axis', + 'axis along which reduction is performed; if INT_MAX is given, ' + 'reduce to given target shape (only used in megbrain)'), + (1<<31)-1). + add_enum('DataType', + Doc('DEFAULT', +''' +input/output are the same data type, and the internal computation type would be chosen by the input/output dtypes and the reduction mode. 
+Currently, ```DEFAULT``` mode means: + ++--------------------+-----------------------------------+-------------------+ +| Input/Output DType | Mode | Computation DType | ++====================+===================================+===================+ +| FLOAT32 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | FLOAT32 | ++--------------------+-----------------------------------+-------------------+ +| FLOAT16 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | FLOAT16 | ++--------------------+-----------------------------------+-------------------+ +| INT32 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | INT32 | ++--------------------+-----------------------------------+-------------------+ +| INT8 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | INT8 | ++--------------------+-----------------------------------+-------------------+ +| QuantizedS8 | MIN/MAX | QuantizedS8 | ++--------------------+-----------------------------------+-------------------+ +| QuantizedS8 | MEAN/SUM | QuantizedS32 | ++--------------------+-----------------------------------+-------------------+ +| Quantized8Asymm | MIN/MAX | Quantized8Asymm | ++--------------------+-----------------------------------+-------------------+ +| Quantized8Asymm | MEAN/SUM | QuantizedS32 | ++--------------------+-----------------------------------+-------------------+ + +''' +), + Doc('FLOAT_IO16xC32', 'Deprecated. This was replaced by ' + 'FLOAT_O16xC32, and input\'s dtype decided by actual input tensor.'), + Doc('FLOAT_O32xC32', 'compute/output both are float32'), + Doc('FLOAT_O16xC32', 'compute are float32, output float16'), + Doc('QUINT_I8xO32', 'input quint8, compute and output are qint32'), + Doc('QINT_I8xO32', 'input qint8, compute and output are qint32'), + name_field='data_type')) + +(pdef('Cumsum', 'calculate accumulated sum along given axis', version=0, is_legacy=True). + add_fields('int32', + Doc('axis', + 'axis along which cumsum is performed'), + -1). + add_fields('bool', + Doc('exclusive', + 'whether the current element is taken into account'), + 'true'). + add_fields('bool', + Doc('reverse', + 'whether the cumsum is forward or backward'), + 'false')) + +(pdef('Cumsum', 'calculate accumulated sum along given axis', version=1). + add_fields('int32', + Doc('axis', + 'axis along which cumsum is performed, default with INT_MAX'), + (1<<31)-1). + add_fields('bool', + Doc('exclusive', + 'whether the current element is taken into account'), + 'true'). + add_fields('bool', + Doc('reverse', + 'whether the cumsum is forward or backward'), + 'false')) + +(pdef('CondTake'). + add_enum('Mode', + Doc('EQ', 'take if ``abs(data-val)=eps``'), + Doc('LT', 'take if ``dataval``'), + Doc('GEQ', 'take if ``data>=val``')). + add_fields('float32', + Doc('val', 'the value to be compared with; note that for integer ' + 'data, val is also converted to int'), 0). + add_fields('float32', Doc('eps', 'used for float equality comparison'), + 1e-6)) + + +pdef('Argsort').add_enum('Order', 'ASCENDING', 'DESCENDING') + +(pdef('IndexingRemap'). + add_fields('bool', + Doc('is_non_overlapping', + 'Whether no two dst element maps to the same src element. ' + 'Enabling this option can accelerate gradient operator since' + ' atomic adding operations could be avoided.'), + 'false')) + +pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) + +(pdef('Linspace'). + add_fields('bool', + Doc('endpoint', + 'Whether stop is included in the generated tensor'), + 'true')) + +(pdef('LinspaceFull'). + add_fields('float64', + Doc('start', 'The first val.'), + 0). 
+ add_fields('float64', + Doc('stop', 'The last val.'), + 1). + add_fields('bool', + Doc('endpoint', + 'Whether stop is included in the generated tensor'), + 'true')) + +(pdef('Eye'). + add_fields( + 'int32', + Doc('k', 'Index of the diagonal: 0 (the default) refers to the main ' + 'diagonal, a positive value refers to an upper diagonal, and a ' + 'negative value to a lower diagonal.'), + 0). + add_fields( + 'dtype', Doc('dtype', 'data type of output value'), + 'DTypeEnum::Float32')) + +pdef('UniformRNG').add_fields('uint64', 'seed', 0) + +(pdef('GaussianRNG'). + add_fields('uint64', 'seed', 0). + add_fields('float32', 'mean', 0, 'std', 1)) + +(pdef('Flip'). + add_fields('bool', 'vertical', 'false', 'horizontal', 'false')) + +(pdef('Rotate') + .add_fields('bool', 'clockwise', 'true')) + +(pdef('ROICopy') + .add_fields('uint32', 'row_from', 0, 'row_to', 0, 'col_from', 0, 'col_to', 0)) + +(pdef('CvtColor') + .add_enum('Mode', 'RGB2GRAY', 'RGB2YUV', 'YUV2RGB', 'GRAY2RGB', 'RGBA2RGB', + 'RGBA2BGR', 'RGBA2GRAY', 'RGB2BGR', 'BGR2GRAY', 'BGR2RGB', + Doc('YUV2GRAY_NV21', 'For historical reasons, referred to as YCC by opencv'), + 'YUV2RGB_NV21', 'YUV2BGR_NV21', 'YUV2GRAY_NV12', 'YUV2RGB_NV12', + 'YUV2BGR_NV12', 'YUV2GRAY_YV12', 'YUV2RGB_YV12', 'YUV2BGR_YV12', + 'YUV2GRAY_YU12', 'YUV2RGB_YU12', 'YUV2BGR_YU12', + 'YCrCb2RGB', 'YCrCb2BGR', + Doc('BT601_YUV2RGB_NV21', 'BT601 yuv format, referred to as YUV by opencv'), + 'BT601_YUV2BGR_NV21', 'BT601_YUV2RGB_NV12', 'BT601_YUV2BGR_NV12', + 'BT601_YUV2RGB_YV12', 'BT601_YUV2BGR_YV12', 'BT601_YUV2RGB_YU12', + 'BT601_YUV2BGR_YU12', + member_alias=[('YUV2GRAY_NV21', 'BT601_YUV2GRAY_NV21'), + ('YUV2GRAY_NV12', 'BT601_YUV2GRAY_NV12'), + ('YUV2GRAY_YV12', 'BT601_YUV2GRAY_YV12'), + ('YUV2GRAY_YU12', 'BT601_YUV2GRAY_YU12')], + name_field = 'mode')) + +(pdef('WarpAffine', version=0, is_legacy=True) + .add_enum_alias('InterpolationMode', 'WarpPerspective', name_field='imode') + .add_enum_alias('BorderMode', 'WarpPerspective', name_field='border_mode') + .add_fields('float32', Doc('border_val', 'used for CONSTANT bmode'), '.0f')) + +(pdef('WarpAffine', version=1) + .add_enum_alias('InterpolationMode', 'WarpPerspective', name_field='imode') + .add_enum_alias('BorderMode', 'WarpPerspective', name_field='border_mode') + .add_fields('float32', Doc('border_val', 'used for CONSTANT bmode'), '.0f') + .add_enum_alias('Format', 'ConvolutionV0', default=1)) + +(pdef('GaussianBlur') + .add_enum_alias('BorderMode', 'WarpPerspective', name_field='border_mode') + .add_fields('uint32', 'kernel_height', 0, 'kernel_width', 0) + .add_fields('float32','sigma_x', '0.f', 'sigma_y', '0.f')) + +(pdef('Resize', version=0, is_legacy=True) + .add_enum_alias('InterpolationMode', 'WarpPerspective', name_field='imode')) + +(pdef('Resize', version=1) + .add_enum_alias('InterpolationMode', 'WarpPerspective', name_field='imode') + .add_enum_alias('Format', 'ConvolutionV0', default=1)) + +(pdef('Convolution3D'). + add_enum('Mode', 'CROSS_CORRELATION', 'CONVOLUTION'). + add_fields( + 'uint32', + Doc('pad_d', 'padding on one side on the first dimension'), 0, + Doc('pad_h', 'padding on one side on the second dimension'), 0, + Doc('pad_w', 'padding on one side on the third dimension'), 0, + Doc('stride_d', 'kernel stride on the first dimension'), 1, + Doc('stride_h', 'kernel stride on the second dimension'), 1, + Doc('stride_w', 'kernel stride on the third dimension'), 1, + Doc('dilate_d', 'dilation (i.e. 
size of each zero-padded kernel block) ' + 'on the first dimension'), 1, + Doc('dilate_h', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('dilate_w', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the third dimension'), 1 + ). + add_enum('Sparse', + Doc('DENSE', 'dense convolution: filter shape should be ' + '[oc, ic, spatial...] if format is NCDHW, ' + '[oc, spatial..., ic] if format is NDHWC'), + Doc('GROUP', 'group convolution: filter shape should be ' + '[group, oc_per_group, ic_per_group, spatial...] if format is NCDHW, ' + '[group, oc_per_group, spatial..., ic_per_group] if format is NDHWC') + ). + add_enum('DataType', + Doc('FLOAT', 'input/output both float32/float16'), + Doc('FLOAT_IO16xC32', 'input/output both float16, the internal ' + 'compute is float32'), + name_field='data_type'). + add_enum('Format', 'NCDHW', 'NDHWC') + ) + +(pdef('Conv3DBias'). + add_enum('NonlineMode', 'IDENTITY', 'RELU', 'SIGMOID'). + add_enum_alias('Mode', 'Convolution3D'). + add_fields('uint32', 'pad_d', 0, 'pad_h', 0, 'pad_w', 0, + 'stride_d', 1, 'stride_h', 1, 'stride_w', 0)) + +(pdef('SeparableConv3D'). + add_enum_alias('Mode', 'Convolution3D'). + add_enum('BorderMode', 'BORDER_REPLICATE', 'BORDER_REFLECT', + 'BORDER_REFLECT_101','BORDER_WRAP', + 'BORDER_CONSTANT', 'BORDER_TRANSPARENT','BORDER_ISOLATED'). + add_fields('bool', 'is_symm_kernel', 'true'). + add_fields('uint32', 'pad_d', 0, 'pad_h', 0, 'pad_w', 0, + 'stride_d', 0, 'stride_h', 1, 'stride_w', 1, + 'ksize_d', 0, 'ksize_h', 3, 'ksize_w', 3, + 'anchor_d', 0, 'anchor_h', 1, 'anchor_w', 1)) + +(pdef('TopK'). + add_enum( + 'Mode', + Doc('KTH_ONLY', "only the value of the k'th element would be computed"), + Doc('VALUE_IDX_NOSORT', + 'all the top-k values and corresponding indices would be computed; ' + 'no order is guaranteed'), + Doc('VALUE_IDX_SORTED', + 'all the top-k values and corresponding indices sorted')) + ) + +RELAYOUT_FORMAT_MODE_DOC = """ +Relayout mode. + +**Naming conventions** + +1. ``A_B`` means change from layout format ``A`` to ``B``. +2. ``INTER_WEIGHT_xx`` means relayout the weight for faster processing by + :attr:`Convolution.Format.NHWCD4` convolutions. +3. A suffix of ``I`` means ``Image2DPack4TensorFormat`` tensor format is used + for faster processing on GPUs. 
+ +**Layout definitions** + +* ``NCHW`` layout: ``{N, C, H, W}`` +* ``NHWC`` layout: ``{N, H, W, C}`` +* ``NHWCD4`` layout: ``{N, H, (C + 3) / 4, W, 4}`` +* ``NHWCD4I`` layout: with ``align_axis = 2`` +* ``NCHW4`` layout: ``{N, C/4, H, W, 4}`` +* ``NCHW88`` layout: ``{N, C/8, H, W, 8}`` +* ``CHWN4`` layout: ``{C/4, H, W, N, 4}`` + +**Float weight transformation definitions** + ++---------------+---------------------------------+--------------------+--------------------------------------+------+ +| Sparsity Type | Input Layout | Input Req | Output Layout | Axis | ++===============+=================================+====================+======================================+======+ +| DENSE | ``{OC, IC, FH, FW}`` | ``OC % 4 == 0`` | ``{OC/4, FH, FW, IC, 4}`` | 3 | ++---------------+---------------------------------+--------------------+--------------------------------------+------+ +| GROUP | ``{GROUP, OCPG, ICPG, FH, FW}`` | ``OCPG % 4 == 0`` | ``{GROUP, OCPG/4, FH, FW, ICPG, 4}`` | 4 | +| | | ``ICPG % 4 == 0`` | | | ++---------------+---------------------------------+--------------------+--------------------------------------+------+ +| CHAN | ``{GROUP, 1, 1, FH, FW}`` | ``GROUP % 4 == 0`` | ``{GROUP / 4, 1, FH ,FW, 4}`` | 1 | ++---------------+---------------------------------+--------------------+--------------------------------------+------+ + +**Float weight transformation nchw88 definitions** + ++---------------+---------------------------------+--------------------+--------------------------------------+ +| Sparsity Type | Input Layout | Input Req | Output Layout | ++===============+=================================+====================+======================================+ +| DENSE | ``{OC, IC, FH, FW}`` | ``OC % 8 == 0`` |``{OC/8, IC/8 ,FH, FW, 8(IC), 8(OC)}``| +| | | ``IC % 8 == 0`` | | ++---------------+---------------------------------+--------------------+--------------------------------------+ +| GROUP | ``{GROUP, OCPG, ICPG, FH, FW}`` | ``OCPG % 8 == 0`` | ``{GROUP, OCPG/8, ICPG/8 FH, FW, | +| | | ``ICPG % 8 == 0`` | 8(ICPG), 8(OCPG)} `` | ++---------------+---------------------------------+--------------------+--------------------------------------+ +| CHAN | ``{GROUP, 1, 1, FH, FW}`` | ``GROUP % 8 == 0`` | ``{GROUP / 8, 1, FH ,FW, 8}`` | ++---------------+---------------------------------+--------------------+--------------------------------------+ + +**Int8(DOT) weight transformation definitions** + ++---------------+---------------------------------+--------------------+------------------------------------------+------+ +| Sparsity Type | Input Layout | Input Req | Output Layout | Axis | ++===============+=================================+====================+==========================================+======+ +| DENSE | ``{OC, IC, FH, FW}`` | ``OC % 4 == 0`` | ``{OC/4, FH, FW, IC/4, 4, 4}` | 3 | ++---------------+---------------------------------+--------------------+------------------------------------------+------+ +| GROUP | ``{GROUP, OCPG, ICPG, FH, FW}`` | ``OCPG % 4 == 0`` | ``{GROUP, OCPG/4, FH, FW, ICPG/4, 4, 4}``| 4 | +| | | ``ICPG % 4 == 0`` | | | ++---------------+---------------------------------+--------------------+------------------------------------------+------+ + +Note: the axis column means the corresponding ``align_axis`` for image format +when the ``I`` suffix is present. + +""" +(pdef('RelayoutFormat', 'Change the tensor layout format'). 
+ add_enum( + Doc('Mode', RELAYOUT_FORMAT_MODE_DOC), + 'NHWC_NHWCD4', + 'NHWCD4_NHWC', + 'NHWC_NHWCD4I', + 'NCHW_NHWCD4', + 'NCHW_NHWCD4I', + 'NHWCD4I_NCHW', + 'NHWCD4_NCHW', + 'INTER_WEIGHT_DENSE', + 'INTER_WEIGHT_DENSEI', + 'INTER_WEIGHT_GROUP', + 'INTER_WEIGHT_GROUPI', + 'INTER_WEIGHT_CHAN', + 'INTER_WEIGHT_CHANI', + 'INTER_WEIGHT_DENSEI_DOT', + 'INTER_WEIGHT_GROUPI_DOT', + 'NCHW4_CHWN4', + 'CHWN4_NCHW4', + 'NCHW_NCHW88_CONV_DENSE_WEIGHT', + 'NCHW_NCHW88_CONV_CHAN_WEIGHT', + 'NCHW_NCHW88_CONV_GROUP_WEIGHT', + 'NCHW_NCHW88', + 'NCHW88_NCHW') + ) + + +(pdef('SeparableFilter'). + add_enum_alias('Format', 'ConvolutionV0'). + add_enum_alias('BorderMode', 'WarpPerspective'). + add_fields('bool', 'is_symm_kernel', 'true'). + add_fields('uint32', 'ksize_h', 3, 'ksize_w', 3, 'anchor_h', 1, 'anchor_w', 1)) + +(pdef('LocalShare', 'Local share convolution'). + add_enum_alias('Mode', 'ConvolutionV0'). + add_fields( + 'uint32', + Doc('pad_h', 'padding on one side on the first dimension'), 0, + Doc('pad_w', 'padding on one side on the second dimension'), 0, + Doc('stride_h', 'kernel stride on the first dimension'), 1, + Doc('stride_w', 'kernel stride on the second dimension'), 1, + Doc('dilate_h', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('dilate_w', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('spatial_groups_h', 'spatial groups on the first dimension'), 1, + Doc('spatial_groups_w', 'spatial groups on the second dimension'), 1 + ). + add_enum_alias('Sparse', 'ConvolutionV0'). + add_enum_alias('Format', 'ConvolutionV0'). + add_enum_alias('ComputeMode', 'Convolution') + ) + +(pdef('ROIAlign'). + add_enum('Mode', 'MAX', 'AVERAGE', name_field='mode'). + add_enum_alias('Format', 'ConvolutionV0'). + add_fields('float32', 'spatial_scale', '1.0'). + add_fields('float32', 'offset', '0.0'). + add_fields('uint32', + 'pooled_height', '1', + 'pooled_width', '1', + 'sample_height', '2', + 'sample_width', '2') + ) +(pdef('DeformablePSROIPooling'). + add_fields('bool', 'no_trans', 'true'). + add_fields('float32', 'spatial_scale', 1, + 'trans_std', 1). + add_fields('uint32', + Doc('pooled_h', 'height of pooling output'), 1, + Doc('pooled_w', 'width of pooling output'), 1, + Doc('part_size', 'size of each deformable part'), 1, + Doc('sample_per_part', 'sample count of each bbox'), 1)) + +(pdef('BatchConvBias', 'Batch convolution (unshare weights on the batch dimension)'). + add_enum_alias('NonlineMode', 'ConvBiasV0'). + add_enum_alias('Mode', 'ConvolutionV0'). + add_fields( + 'uint32', + Doc('pad_h', 'padding on one side on the first dimension'), 0, + Doc('pad_w', 'padding on one side on the second dimension'), 0, + Doc('stride_h', 'kernel stride on the first dimension'), 1, + Doc('stride_w', 'kernel stride on the second dimension'), 1, + Doc('dilate_h', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + Doc('dilate_w', 'dilation (i.e. size of each zero-padded kernel block) ' + 'on the second dimension'), 1, + ). + add_enum_alias('Sparse', 'ConvolutionV0'). + add_enum_alias('Format', 'ConvolutionV0'). 
+ add_enum_alias('ComputeMode', 'Convolution', name_field="compute_mode") + ) + + diff --git a/dnn/src/CMakeLists.txt b/dnn/src/CMakeLists.txt new file mode 100644 index 00000000..2defb17e --- /dev/null +++ b/dnn/src/CMakeLists.txt @@ -0,0 +1,59 @@ + +set(LIBMEGDNN_DEF) +file(GLOB_RECURSE SOURCES common/*.cpp naive/*.cpp) + +if(NOT ${MGE_ARCH} STREQUAL "naive") + file(GLOB_RECURSE SOURCES_ fallback/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + if(${MGE_ARCH} STREQUAL "fallback") + message(WARNING "build only with fallback") + elseif(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") + file(GLOB_RECURSE SOURCES_ x86/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + if(NOT MSVC) + file(GLOB_RECURSE SOURCES_ x86/*.S) + set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) + list(APPEND SOURCES ${SOURCES_}) + endif() + endif() +endif() + +if(MGE_WITH_CUDA) + file(GLOB_RECURSE SOURCES_ cuda/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + + file(GLOB_RECURSE CUSOURCES cuda/*.cu) + list(APPEND SOURCES ${CUSOURCES}) + list(APPEND LIBMEGDNN_DEF -DMEGDNN_WITH_CUDA=1) +endif() + + + +add_definitions(${LIBMEGDNN_DEF}) +add_library(megdnn EXCLUDE_FROM_ALL STATIC ${SOURCES}) + +target_link_libraries(megdnn opr_param_defs) +target_include_directories(megdnn PUBLIC ${PROJECT_SOURCE_DIR}/dnn/include) +target_include_directories(megdnn PRIVATE ${PROJECT_SOURCE_DIR}/dnn ${PROJECT_SOURCE_DIR}/third_party/midout/src) + +install(DIRECTORY ${PROJECT_SOURCE_DIR}/dnn/include DESTINATION . FILES_MATCHING PATTERN "*.h*") + +if(CXX_SUPPORT_WCLASS_MEMACCESS) + if(MGE_WITH_CUDA) + target_compile_options(megdnn PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" + "$<$>:-Wno-class-memaccess>") + else() + target_compile_options(megdnn PRIVATE "-Wno-class-memaccess") + endif() +endif() +target_compile_definitions(megdnn INTERFACE ${LIBMEGDNN_DEF}) + +if(MGE_WITH_MKLDNN AND ${MGE_ARCH} STREQUAL "x86_64") + target_link_libraries(megdnn libmkl_dnn) +endif() +target_link_libraries(megdnn ${MGE_CUDA_LIBS}) +target_link_libraries(megdnn ${MGE_BLAS_LIBS}) +if(CMAKE_THREAD_LIBS_INIT) + target_link_libraries(megdnn Threads::Threads) +endif() + diff --git a/dnn/src/common/add_update.cpp b/dnn/src/common/add_update.cpp new file mode 100644 index 00000000..d7fc6efc --- /dev/null +++ b/dnn/src/common/add_update.cpp @@ -0,0 +1,54 @@ +/** + * \file dnn/src/common/add_update.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/oprs.h" + +#include "src/common/add_update_helper.h" +#include "src/common/utils.h" + +namespace megdnn { + +void AddUpdateForward::check_exec(const TensorLayout& dst, + const TensorLayout& delta) { + // delta can not be broadcasted to dst if dst.total_nr_elems() < + // delta.total_nr_elems() + megdnn_assert(dst.dtype == delta.dtype && + dst.total_nr_elems() >= delta.total_nr_elems() && + dst.is_non_overlapping_strong()); + if (dst.dtype.category() == DTypeCategory::INT) { + auto check_fv = [](float fv) { + int iv = fv; + megdnn_assert( + float(iv) == fv && float(iv + 1) == fv + 1.f && + float(iv - 1) == fv - 1.f, + "bad arg value in AddUpdate: dtype is int, but value is %g " + "which can not be precisely converted to int", + fv); + }; + check_fv(m_param.alpha); + check_fv(m_param.beta); + check_fv(m_param.bias); + } +} + +ElemwiseOpParamN<2> AddUpdateForwardHelper::make_param( + _megdnn_tensor_inout dst, _megdnn_tensor_in delta) { + ElemwiseOpParamN<2> src; + src[0] = dst; + src[1] = delta; + src[1].layout = src[1].layout.broadcast(dst.layout); + src.init_from_given_tensor(); + + return src; +} +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/add_update_helper.h b/dnn/src/common/add_update_helper.h new file mode 100644 index 00000000..63157b07 --- /dev/null +++ b/dnn/src/common/add_update_helper.h @@ -0,0 +1,28 @@ +/** + * \file dnn/src/common/add_update_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs.h" + +#include "src/common/elemwise_helper.cuh" + +namespace megdnn { + +class AddUpdateForwardHelper : public AddUpdateForward { + using AddUpdateForward::AddUpdateForward; + +protected: + ElemwiseOpParamN<2> make_param(_megdnn_tensor_inout dst, + _megdnn_tensor_in delta); +}; + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/algo_chooser.h b/dnn/src/common/algo_chooser.h new file mode 100644 index 00000000..49d449f0 --- /dev/null +++ b/dnn/src/common/algo_chooser.h @@ -0,0 +1,150 @@ +/** + * \file dnn/src/common/algo_chooser.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include +#include +#include + +#include "utils.h" + +namespace megdnn { + +/*! + * \brief get user-configured algorithm, or heuristic algorithm + */ +template +typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&... args) { + typename Opr::Algorithm* ret; + if (auto set = opr->execution_policy().algorithm) { + ret = set; + } else { + ret = opr->get_algorithm_heuristic(std::forward(args)..., + std::numeric_limits::max(), + false); + } + return static_cast(ret); +} + +/*! 
+ * \brief get all algorithms from algo_pack() that is available for current size + */ +template +std::vector get_all_algorithms( + const typename Opr::AlgoBase::SizeArgs& args) { + std::vector ret; + ret.reserve(Opr::algo_pack().all_algos.size()); + for (auto i : Opr::algo_pack().all_algos) { + if (i->is_available(args)) { + ret.push_back(i); + } + } + megdnn_assert(!ret.empty(), "no conv algorithm for %s", + args.to_string().c_str()); + return ret; +} + +/*! + * \brief a helper function to get a reproducible algorithm. If require a + * reproducible algorithm, and the given algorithm is reproducible, return the + * given algorithm. Otherwise return nullptr + */ +template +typename Opr::Algorithm* get_reproducible_algo(typename Opr::AlgoBase* algo, + bool reproducible) { + if (reproducible) { + if (algo->is_reproducible()) { + return algo; + } + } else { + return algo; + } + return nullptr; +} + +template +typename Opr::Algorithm* get_reproducible_algo( + const std::vector& algos, + const typename Opr::AlgoBase::SizeArgs& args, + size_t workspace_limit_in_bytes, const char* name) { + size_t min_workspace_limit_in_bytes = std::numeric_limits::max(); + bool available_but_limited_by_workspace = false; + bool available_but_not_reproducible = false; + for (auto i : algos) { + if (i->is_available_reproducible(args, true, + workspace_limit_in_bytes)) { + return i; + } + if (i->is_available_reproducible(args)) { + if (i->get_workspace_in_bytes(args) > workspace_limit_in_bytes) { + available_but_limited_by_workspace = true; + min_workspace_limit_in_bytes = + std::min(min_workspace_limit_in_bytes, + i->get_workspace_in_bytes(args)); + } + } + if (i->is_available(args)) { + if (!i->is_reproducible()) + available_but_not_reproducible = true; + } + } + + MEGDNN_MARK_USED_VAR(name); + if (available_but_limited_by_workspace) { + megdnn_throw(megdnn_mangle(ssprintf( + "no reproducible %s algorithm: %s workspace limit %zu is " + "less than mini workspace limit %zu", + name, args.to_string().c_str(), workspace_limit_in_bytes, + min_workspace_limit_in_bytes))); + } else if (available_but_not_reproducible) { + megdnn_throw( + megdnn_mangle(ssprintf("no reproducible %s algorithm", name))); + } else { + megdnn_throw(megdnn_mangle(ssprintf("no usable %s algorithm", name))); + } +} + +template +typename Opr::Algorithm* get_usable_algo( + const std::vector& algos, + const typename Opr::AlgoBase::SizeArgs& args, + size_t workspace_limit_in_bytes, const char* name) { + size_t min_workspace_limit_in_bytes = std::numeric_limits::max(); + bool available_but_limited_by_workspace = false; + for (auto i : algos) { + if (i->is_available_wk(args, workspace_limit_in_bytes)) { + return i; + } + if (i->is_available(args)) { + available_but_limited_by_workspace = true; + min_workspace_limit_in_bytes = + std::min(min_workspace_limit_in_bytes, + i->get_workspace_in_bytes(args)); + } + } + + MEGDNN_MARK_USED_VAR(name); + if (available_but_limited_by_workspace) { + megdnn_throw(megdnn_mangle(ssprintf( + "no usable %s algorithm: %s workspace limit %zu is " + "less than mini workspace limit %zu", + name, args.to_string().c_str(), workspace_limit_in_bytes, + min_workspace_limit_in_bytes))); + } else { + megdnn_throw(megdnn_mangle(ssprintf("no usable %s algorithm", name))); + } +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/argmxx/base_impl.cpp b/dnn/src/common/argmxx/base_impl.cpp new file mode 100644 index 00000000..8f41fc02 --- /dev/null +++ b/dnn/src/common/argmxx/base_impl.cpp @@ -0,0 +1,78 
@@ +/** + * \file dnn/src/common/argmxx/base_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void ArgmxxBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &dst) +{ + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(dst); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(dst); + megdnn_assert(src.ndim > 0_z, "%s", errmsg().c_str()); + megdnn_assert(src.ndim == dst.ndim, "%s", errmsg().c_str()); + megdnn_assert(param().axis < static_cast(src.ndim), "%s", + errmsg().c_str()); + for (size_t i = 0; i < src.ndim; ++i) { + if (i != static_cast(param().axis)) { + megdnn_assert_eq_size_t(src.shape[i], dst.shape[i]); + } else { + megdnn_assert_eq_size_t(dst.shape[i], 1_z); + } + } + megdnn_assert(dst.dtype == dtype::Int32()); +} + +void ArgmaxForward::deduce_layout(const TensorLayout &src, + TensorLayout &dst) +{ + dst = src; + dst.shape[param().axis] = 1; + dst.dtype = dtype::Int32(); + dst.init_contiguous_stride(); +} + +void ArgmaxForward::check_exec(const TensorLayout &src, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void ArgminForward::deduce_layout(const TensorLayout &src, + TensorLayout &dst) +{ + dst = src; + dst.shape[param().axis] = 1; + dst.dtype = dtype::Int32(); + dst.init_contiguous_stride(); +} + +void ArgminForward::check_exec(const TensorLayout &src, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/argmxx_helper.h b/dnn/src/common/argmxx_helper.h new file mode 100644 index 00000000..87c69be3 --- /dev/null +++ b/dnn/src/common/argmxx_helper.h @@ -0,0 +1,89 @@ +/** + * \file dnn/src/common/argmxx_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once +#include "megdnn/dtype.h" + +#if MEGDNN_CC_HOST +#include "megdnn/basic_types.h" +#endif + +namespace megdnn { +namespace argmxx { + +template +struct ArgmxxOp { + struct wtype { + stype_ key; + dt_int32 val; + MEGDNN_HOST MEGDNN_DEVICE wtype() + {} + MEGDNN_HOST MEGDNN_DEVICE wtype(stype_ key, dt_int32 val): + key(key), val(val) + {} + MEGDNN_HOST MEGDNN_DEVICE wtype(wtype &rhs): + key(rhs.key), + val(rhs.val) + {} + MEGDNN_HOST MEGDNN_DEVICE wtype(volatile wtype &rhs): + key(rhs.key), + val(rhs.val) + {} + MEGDNN_HOST MEGDNN_DEVICE wtype(const wtype &rhs): + key(rhs.key), + val(rhs.val) + {} + MEGDNN_HOST MEGDNN_DEVICE wtype(const volatile wtype &rhs): + key(rhs.key), + val(rhs.val) + {} + MEGDNN_HOST MEGDNN_DEVICE volatile wtype &operator=(const wtype &rhs) volatile + { + this->key = rhs.key; + this->val = rhs.val; + return *this; + } + }; + MEGDNN_HOST MEGDNN_DEVICE + ArgmxxOp(stype_ *src, dt_int32 *dst, uint32_t A, uint32_t B, uint32_t C): + src(src), dst(dst), A(A), B(B), C(C), + INIT(wtype(is_max ? DTypeTrait::min() : + DTypeTrait::max(), -1)) + { + } + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) + { + wtype res; + res.key = src[idx]; + res.val = idx / C % B; + return res; + } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) + { + dst[idx] = val.val; + } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) + { + if (is_max) { + if (lhs.key > rhs.key) return lhs; else return rhs; + } else { + if (lhs.key < rhs.key) return lhs; else return rhs; + } + } + stype_ *src; + dt_int32 *dst; + uint32_t A, B, C; + const wtype INIT; +}; + +} // namespace argmxx +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/argsort.cpp b/dnn/src/common/argsort.cpp new file mode 100644 index 00000000..5f135484 --- /dev/null +++ b/dnn/src/common/argsort.cpp @@ -0,0 +1,68 @@ +/** + * \file dnn/src/common/argsort.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/oprs/general.h" + +#include "src/common/utils.h" + +using namespace megdnn; + +void ArgsortForward::deduce_layout(const TensorLayout& src, TensorLayout& dst, + TensorLayout& indices) { + megdnn_assert(src.ndim == 2 && src.is_contiguous(), + "invalid src layout: %s", src.to_string().c_str()); + dst = src; + indices = src; + indices.dtype = dtype::Int32(); +} + +void ArgsortForward::check_exec(const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& indices, + size_t workspace_in_bytes) { + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(dst) + ", " + + megdnn_layout_msg(indices); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert(src.ndim == 2_z, "%s", errmsg().c_str()); + megdnn_assert_eq_layout(src, dst); + megdnn_assert_eq_shape(src, indices); + megdnn_assert_contiguous(indices); + + megdnn_assert(src.dtype == dst.dtype); + megdnn_assert(indices.dtype == dtype::Int32()); + + auto required_workspace_in_bytes = + get_workspace_in_bytes(src, dst, indices); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void ArgsortBackward::check_exec(const TensorLayout& diff, + const TensorLayout& indices, + const TensorLayout& grad, + size_t workspace_in_bytes) { + megdnn_assert(diff.eq_shape(indices) && diff.dtype == grad.dtype && + indices.dtype == dtype::Int32{} && + diff.is_contiguous() && indices.is_contiguous() && + grad.is_contiguous() && diff.ndim == 2 && + grad.ndim == 2 && diff[0] == grad[0] && + diff[1] <= grad[1], + "invalid layouts: diff=%s indices=%s grad=%s", + diff.to_string().c_str(), indices.to_string().c_str(), + grad.to_string().c_str()); + auto required_workspace_in_bytes = + get_workspace_in_bytes(diff, indices, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/asm_common_defs.h b/dnn/src/common/asm_common_defs.h new file mode 100644 index 00000000..f6c76647 --- /dev/null +++ b/dnn/src/common/asm_common_defs.h @@ -0,0 +1,29 @@ +/** + * \file dnn/src/common/asm_common_defs.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#if defined(__WIN32__) || defined(__APPLE__) +# define cdecl(s) _##s +#else +# define cdecl(s) s +#endif + +#if !defined(__APPLE__) +#define hidden_sym(s) .hidden cdecl(s) +#else +#define hidden_sym(s) .private_extern cdecl(s) +#endif + +#if defined(__linux__) && defined(__ELF__) && (defined(__arm__) || defined(__aarch64__)) +.pushsection .note.GNU-stack,"",%progbits +.popsection +#endif + diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp new file mode 100644 index 00000000..e9b90d0e --- /dev/null +++ b/dnn/src/common/basic_types.cpp @@ -0,0 +1,510 @@ +/** + * \file dnn/src/common/basic_types.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/basic_types.h" +#include "megdnn/tensor_format.h" + +#include "src/common/utils.h" + +#include +#include +#include +#include +#include +#include + +using namespace megdnn; + +/* ===================== ErrorHandler ===================== */ +namespace { +class DefaultErrorHandler final : public ErrorHandler { + void do_on_megdnn_error(const std::string& msg) override { + megdnn_ignore(msg); +#if MEGDNN_ENABLE_EXCEPTIONS + throw std::runtime_error{msg}; +#else + megdnn_trap(); +#endif + } +}; +} // namespace +ErrorHandler* ErrorHandler::sm_inst; + +ErrorHandler* ErrorHandler::inst() { + static std::mutex mtx; + static DefaultErrorHandler default_handler; + if (megdnn_unlikely(!sm_inst)) { + std::lock_guard lg{mtx}; + if (!sm_inst) { + sm_inst = &default_handler; + } + } + return sm_inst; +} + +void ErrorHandler::on_megdnn_error(const std::string& msg) { + inst()->do_on_megdnn_error(msg); + + // gcc seems to fail to recognize the noreturn attr of + // do_on_tensor_reshape_error; explicitly mark this function as noreturn + // here + megdnn_trap(); +} + +void ErrorHandler::on_megdnn_error(const char* msg) { + on_megdnn_error(std::string{msg}); +} + +void ErrorHandler::on_tensor_reshape_error(const std::string& msg) { + inst()->do_on_tensor_reshape_error(msg); + megdnn_trap(); +} + +void ErrorHandler::on_tensor_reshape_error(const char* msg) { + on_tensor_reshape_error(std::string{msg}); +} + +void ErrorHandler::set_handler(ErrorHandler* handler) { + sm_inst = handler; +} + +/* ===================== logging ===================== */ + +namespace { +LogHandler g_log_handler = nullptr; +} // anonymous namespace + +#if MEGDNN_ENABLE_LOGGING +void megdnn::__log__(LogLevel level, const char* file, const char* func, + int line, const char* fmt, ...) 
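+// forwards the message to the handler installed via set_log_handler();
+// when no handler is registered the message is silently dropped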
{ + if (!g_log_handler) + return; + va_list ap; + va_start(ap, fmt); + g_log_handler(level, file, func, line, fmt, ap); + va_end(ap); +} +#endif // MEGDNN_ENABLE_LOGGING + +LogHandler megdnn::set_log_handler(LogHandler handler) { + auto ret = g_log_handler; + g_log_handler = handler; + return ret; +} + +/* ===================== TensorShape ===================== */ + +TensorShape::TensorShape(const SmallVector& init_shape) { + megdnn_assert(init_shape.size() <= MAX_NDIM, + "Illegal to construct a TensorShape with " + "more than MAX_NDIM(%zu) axes; init_shape is %s", + MAX_NDIM, vec2str(init_shape).c_str()); + ndim = init_shape.size(); + memcpy(this->shape, init_shape.data(), sizeof(size_t) * ndim); +} + +TensorShape::TensorShape(std::initializer_list init_shape) + : TensorShape(SmallVector{init_shape}) {} + +size_t TensorShape::total_nr_elems() const { + if (!ndim) + return 0; + return std::accumulate(shape, shape + ndim, 1_z, SafeMultiplies()); +} + +bool TensorShape::eq_shape(const TensorShape& rhs) const { + MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code"); + if (ndim == rhs.ndim) { + size_t eq = 0; + switch (ndim) { + case 7: + eq += shape[6] == rhs.shape[6]; MEGDNN_FALLTHRU + case 6: + eq += shape[5] == rhs.shape[5]; MEGDNN_FALLTHRU + case 5: + eq += shape[4] == rhs.shape[4]; MEGDNN_FALLTHRU + case 4: + eq += shape[3] == rhs.shape[3]; MEGDNN_FALLTHRU + case 3: + eq += shape[2] == rhs.shape[2]; MEGDNN_FALLTHRU + case 2: + eq += shape[1] == rhs.shape[1]; MEGDNN_FALLTHRU + case 1: + eq += shape[0] == rhs.shape[0]; + } + return eq == ndim; + } + return false; +} + +std::string TensorShape::to_string() const { + std::string rst("{"); + for (size_t i = 0; i < ndim; i++) { + if (i) + rst.append(","); + rst.append(std::to_string(shape[i])); + } + rst.append("}"); + return rst; +} + +bool TensorShape::is_empty() const { + for (size_t i = 0; i < ndim; ++i) { + if (!shape[i]) { + return true; + } + } + return false; +} + +/* ===================== TensorLayout ===================== */ +TensorLayout::TensorLayout() = default; + +TensorLayout::TensorLayout(DType dtype_) : dtype{dtype_} {} + +TensorLayout::TensorLayout(DType dtype_, Format format_) + : dtype{dtype_}, format{format_} {} + +TensorLayout::TensorLayout(const TensorShape& shape, DType dtype) + : TensorLayout(shape, dtype, DefaultTensorFormat::make()) {} + +TensorLayout::TensorLayout(const TensorShape& shape, DType dtype, + TensorFormat format_) + : TensorShape(shape), dtype{dtype}, format{format_} { + init_contiguous_stride(); +} + +TensorLayout::TensorLayout(const TensorShape& shape, + const std::vector& stride, DType dtype) + : TensorLayout(shape, stride, dtype, DefaultTensorFormat::make()) {} + +TensorLayout::TensorLayout(const TensorShape& shape, + const std::vector& stride, DType dtype, + TensorFormat format_) + : TensorShape(shape), dtype{dtype}, format{format_} { + megdnn_assert_eq_size_t(stride.size(), ndim); + for (size_t i = 0; i < shape.ndim; ++i) + this->stride[i] = stride[i]; +} + +size_t TensorLayout::init_contiguous_stride() { + return format.impl()->init_contiguous_stride(*this); +} + +size_t TensorLayout::init_contiguous_stride(const TensorShape& shape) { + this->TensorShape::operator=(shape); + return init_contiguous_stride(); +} + +size_t TensorLayout::init_contiguous_stride(const TensorShape& shape, + TensorFormat format_) { + this->TensorShape::operator=(shape); + this->format = format_; + return init_contiguous_stride(); +} + +TensorLayout TensorLayout::dimshuffle(const std::vector& dims) const { + 
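+    // dims[i] selects the source axis that becomes axis i of the result, so
+    // res.shape[i] = shape[dims[i]] and res.stride[i] = stride[dims[i]];
+    // e.g. dims = {0, 2, 3, 1} views an NCHW layout as NHWC without moving
+    // any data, since only the shape/stride metadata is permuted.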
TensorLayout res{dtype, format}; + res.ndim = this->ndim; + megdnn_assert_eq_size_t(dims.size(), this->ndim); + auto ndim = this->ndim; + rep(i, ndim) { + auto dest = dims[i]; + megdnn_assert(dest < ndim); + res.shape[i] = this->shape[dest]; + res.stride[i] = this->stride[dest]; + } + return res; +} + +TensorLayout TensorLayout::remove_axis(size_t idx) const { + TensorLayout res{*this}; + res.remove_axis_inplace(idx); + return res; +} + +void TensorLayout::remove_axis_inplace(size_t axis) { + megdnn_assert(ndim >= 2 && axis < ndim); + --ndim; + for (size_t i = axis; i < ndim; ++i) { + shape[i] = shape[i + 1]; + stride[i] = stride[i + 1]; + } +} + +void TensorLayout::add_axis_inplace(size_t axis, size_t shape, + ptrdiff_t stride) { + megdnn_assert(ndim + 1 <= MAX_NDIM && axis <= ndim && shape, + "can not add axis at %zu (current ndim %zu, MAX_NDIM %zu)", + axis, ndim, MAX_NDIM); + ndim++; + for (size_t i = ndim - 1; i > axis; i--) { + this->shape[i] = this->shape[i - 1]; + this->stride[i] = this->stride[i - 1]; + } + this->shape[axis] = shape; + this->stride[axis] = stride; +} + +bool TensorLayout::is_contiguous() const { + return format.impl()->is_contiguous_spec(*this); +} + +bool TensorLayout::is_physical_contiguous() const { + ptrdiff_t expected = 1; + for (int i = ndim - 1; i >= 0; --i) { + if (shape[i] != 1 && stride[i] != expected) + return false; + expected *= shape[i]; + } + // empty tensors are not contiguous + return expected != 0; +} + +bool TensorLayout::is_abs_monotonous_allow_brdcst() const { + if (!ndim) + return false; + if (ndim == 1) + return true; + ptrdiff_t last = std::abs(stride[ndim - 1]) * + static_cast(shape[ndim - 1]); + for (int i = ndim - 2; i >= 0; --i) { + if (!stride[i] || shape[i] == 1) + continue; + if (std::abs(stride[i]) < last) + return false; + last = std::abs(stride[i]) * static_cast(shape[i]); + } + return true; +} + +bool TensorLayout::is_contiguous_allow_brdcst() const { + if (!ndim) + return false; + ptrdiff_t expected = 1; + for (int i = ndim - 1; i >= 0; --i) { + if (!stride[i]) + continue; + if (shape[i] != 1 && stride[i] != expected) + return false; + expected *= shape[i]; + } + // empty tensors are not contiguous + return expected != 0; +} + +/** + * \brief The collapse_contiguous function will convert a contiguous image like + * tensor layout into a 2-dimensional layout, shape[0] = height of the image, + * shape[1] = width of the image, axis = 1, stride[0] = row_pitch_size_in_elem, + * and stride[1] = 1. + * So if the nhwcd4 format layout is transformed into a 2d tensor + * layout after calling this function, the nhwcd4 format layout is contiguous. 
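+ *
+ * For an ordinary contiguous layout in the default tensor format the whole
+ * shape collapses into a single axis, e.g. shape {2, 3, 4} with strides
+ * {12, 4, 1} becomes shape {24} with stride {1}.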
+ */ +TensorLayout TensorLayout::collapse_contiguous() const { + return format.impl()->collapse_contiguous_spec(*this); +} + +bool TensorLayout::is_non_overlapping_strong() const { + // abs(stride), stride, shape + std::array, MAX_NDIM> vec; + for (size_t i = 0; i < this->ndim; ++i) { + vec[i] = std::make_tuple(std::abs(stride[i]), stride[i], shape[i]); + } + std::sort(vec.begin(), vec.begin() + this->ndim); + ptrdiff_t lo = 0, hi = 0; + for (size_t i = 0; i < this->ndim; ++i) { + auto cur_stride = std::get<1>(vec[i]); + auto cur_shape = std::get<2>(vec[i]); + megdnn_assert(cur_shape > 0); + if (cur_shape == 1) + continue; + if (cur_stride > 0) { + if (cur_stride <= hi) + return false; + hi += cur_stride * (cur_shape - 1); + } else { + // cur_stride == 0 is handled here, which causes returning false + if (lo <= cur_stride) + return false; + lo += cur_stride * (cur_shape - 1); + } + } + return true; +} + +bool TensorLayout::eq_layout(const TensorLayout& rhs) const { + megdnn_assert(dtype == rhs.dtype, + "could not compare layout on different dtypes: %s vs %s", + dtype.name(), rhs.dtype.name()); + MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code"); + + auto ax = [](size_t shape0, size_t shape1, ptrdiff_t stride0, + ptrdiff_t stride1) { + return (shape0 == shape1) & ((shape0 == 1) | (stride0 == stride1)); + }; + if (ndim == rhs.ndim) { + size_t eq = 0; + switch (ndim) { + case 7: + eq += ax(shape[6], rhs.shape[6], stride[6], rhs.stride[6]); + MEGDNN_FALLTHRU + case 6: + eq += ax(shape[5], rhs.shape[5], stride[5], rhs.stride[5]); + MEGDNN_FALLTHRU + case 5: + eq += ax(shape[4], rhs.shape[4], stride[4], rhs.stride[4]); + MEGDNN_FALLTHRU + case 4: + eq += ax(shape[3], rhs.shape[3], stride[3], rhs.stride[3]); + MEGDNN_FALLTHRU + case 3: + eq += ax(shape[2], rhs.shape[2], stride[2], rhs.stride[2]); + MEGDNN_FALLTHRU + case 2: + eq += ax(shape[1], rhs.shape[1], stride[1], rhs.stride[1]); + MEGDNN_FALLTHRU + case 1: + eq += ax(shape[0], rhs.shape[0], stride[0], rhs.stride[0]); + } + return eq == ndim; + } + return false; +} + +TensorLayout::Span TensorLayout::span() const { + return format.impl()->span_spec(*this); +} + +TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const { + megdnn_throw_if(!ndim || !tshape.ndim, tensor_reshape_error, + megdnn_mangle("broadcast involves empty tensor")); + + if (is_scalar()) { + TensorLayout result{dtype, format}; + result.ndim = tshape.ndim; + for (size_t i = 0; i < tshape.ndim; i++) { + megdnn_throw_if(!tshape.shape[i], tensor_reshape_error, + megdnn_mangle("target shape is 0")); + result.shape[i] = tshape.shape[i]; + result.stride[i] = (tshape.shape[i] == 1); + } + return result; + } + + megdnn_throw_if(tshape.ndim < ndim, tensor_reshape_error, + megdnn_mangle(ssprintf( + "dimension for broadcast less than " + "dst_shape: src_shape=%s dst_shape=%s", + to_string().c_str(), tshape.to_string().c_str()))); + TensorLayout result{dtype, format}; + for (size_t i = 0; i < tshape.ndim; ++i) { + int target_idx = tshape.ndim - i - 1; + int cur_idx = ndim - i - 1; + megdnn_throw_if(!tshape.shape[target_idx], tensor_reshape_error, + megdnn_mangle("target shape is 0")); + size_t cur_shape = (cur_idx >= 0 ? shape[cur_idx] : 1), + cur_stride = (cur_idx >= 0 ? 
stride[cur_idx] : 0); + if (tshape.shape[target_idx] != cur_shape) { + megdnn_throw_if( + cur_shape != 1 && cur_stride != 0, tensor_reshape_error, + megdnn_mangle(ssprintf( + "brodcast on dim with shape not equal to 1: " + "src_shape=%s dst_shape=%s", + to_string().c_str(), tshape.to_string().c_str()))); + result.shape[target_idx] = tshape.shape[target_idx]; + result.stride[target_idx] = 0; + } else { + result.shape[target_idx] = cur_shape; + result.stride[target_idx] = cur_stride; + } + } + result.ndim = tshape.ndim; + return result; +} + +bool TensorLayout::try_reshape(TensorLayout& result, + const TensorShape& tshp) const { + megdnn_assert(tshp.ndim); + for (size_t i = 0; i < tshp.ndim; ++i) { + megdnn_throw_if(!tshp.shape[i], tensor_reshape_error, + megdnn_mangle(ssprintf("bad target tshp: %s", + tshp.to_string().c_str()))); + } + + megdnn_throw_if( + !tshp.ndim || total_nr_elems() != tshp.total_nr_elems(), + tensor_reshape_error, + megdnn_mangle(ssprintf( + "number of elements do not match " + "in reshape: src=%s dest=%s", + static_cast(*this).to_string().c_str(), + tshp.to_string().c_str()))); + + auto cont = collapse_contiguous(); + result.dtype = this->dtype; + result.format = this->format; + result.TensorShape::operator=(tshp); + + size_t sdim = 0, prod = 1, cont_sdim = 0; + for (size_t i = 0; i < tshp.ndim; ++i) { + megdnn_assert(cont_sdim < cont.ndim); + prod *= result.shape[i]; + if (prod > cont.shape[cont_sdim]) + return false; + + if (prod == cont.shape[cont_sdim] && + (i + 1 >= tshp.ndim || tshp.shape[i + 1] != 1)) { + auto s = cont.stride[cont_sdim]; + for (int j = i; j >= static_cast(sdim); --j) { + result.stride[j] = s; + s *= result.shape[j]; + } + ++cont_sdim; + sdim = i + 1; + prod = 1; + } + } + megdnn_assert(cont_sdim == cont.ndim); + + return true; +} + +TensorLayout TensorLayout::reshape(const TensorShape& shape) const { + TensorLayout ret; + auto succ = try_reshape(ret, shape); + megdnn_throw_if(!succ, tensor_reshape_error, + megdnn_mangle(ssprintf("can not reshape from %s to %s", + to_string().c_str(), + shape.to_string().c_str()))); + return ret; +} + +std::string TensorLayout::to_string() const { + std::string rst("{"); + for (size_t i = 0; i < ndim; i++) { + if (i) + rst.append(","); + rst.append(std::to_string(shape[i])); + + rst.push_back('('); + rst.append(std::to_string(stride[i])); + rst.push_back(')'); + } + if (format.type() != Format::Type::DEFAULT) { + rst.append(" @ "); + rst.append(format.impl()->to_string()); + } + rst.append("}"); + return rst; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/batch_conv_bias.cpp b/dnn/src/common/batch_conv_bias.cpp new file mode 100644 index 00000000..485fd3ca --- /dev/null +++ b/dnn/src/common/batch_conv_bias.cpp @@ -0,0 +1,95 @@ +/** + * \file dnn/src/common/batch_conv_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" +#include "megdnn/oprs/nn_int.h" +#include "src/common/utils.h" + +namespace megdnn { +void BatchConvBiasForward::deduce_dtype(DType src, DType filter, + DType /* bias */, DType /* z */, + DType& dst) { + check_or_deduce_dtype_fwd(src, filter, dst); +} + +void BatchConvBiasForward::deduce_layout(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& /* bias */, + const TensorLayout& /* z */, + TensorLayout& dst) { + TensorLayout non_batch_filter; + non_batch_filter.ndim = filter.ndim - 1; + non_batch_filter.dtype = filter.dtype; + for (size_t i = 0; i < non_batch_filter.ndim; i++) { + non_batch_filter[i] = filter[i + 1]; + non_batch_filter.stride[i] = filter.stride[i + 1]; + } + non_batch_filter.format = filter.format; + deduce_layout_fwd(src, non_batch_filter, dst); +} + +BatchConvBiasForward::CanonizedFilterMeta BatchConvBiasForward::check_exec( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst, size_t workspace_in_bytes) { + megdnn_assert(src.dtype.enumv() == filter.dtype.enumv() && + src.dtype.enumv() == DTypeEnum::QuantizedS8, + "batch conv only support qint8"); + float scale_src = src.dtype.param().scale; + float scale_filter = filter.dtype.param().scale; + float scale_bias = bias.dtype.param().scale; + megdnn_assert( + std::abs(scale_src * scale_filter - scale_bias) < 1e-6, + "scale_bias is not equal to the product of scale_src and " + "scale_filter (scale_src: %f scale_filter: %f scale_bias: %f).", + scale_src, scale_filter, scale_bias); + TensorLayout non_batch_filter; + non_batch_filter.ndim = filter.ndim - 1; + non_batch_filter.dtype = filter.dtype; + for (size_t i = 0; i < non_batch_filter.ndim; i++) { + non_batch_filter[i] = filter[i + 1]; + non_batch_filter.stride[i] = filter.stride[i + 1]; + } + non_batch_filter.format = filter.format; + auto ret = check_layout_fwd(src, non_batch_filter, dst); + megdnn_assert_contiguous(bias); + auto required_workspace_in_bytes = + get_workspace_in_bytes(src, filter, bias, z, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + if (bias.ndim != 0) { + //! bias.layout == dst.layout failed, no assert information + auto check_eq = [](const TensorLayout& bias, const TensorLayout& dst) { + if (dst.dtype.category() == DTypeCategory::QUANTIZED) { + return bias.eq_shape(dst); + } else { + return bias.eq_layout(dst); + } + }; + if (check_eq(bias, dst)) + return ret; + if (param().format == param::BatchConvBias::Format::NCHW4) { + megdnn_assert(bias.shape[0] == 1); + megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", + bias.to_string().c_str(), dst.to_string().c_str()); + megdnn_assert(bias.shape[2] == 1); + megdnn_assert(bias.shape[3] == 1); + megdnn_assert(bias.shape[4] == 4); + } + } + + if (z.ndim != 0) { + megdnn_assert(z.dtype.enumv() == dst.dtype.enumv()); + megdnn_assert(z.eq_shape(dst)); + } + return ret; +} +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/batch_normalization.cpp b/dnn/src/common/batch_normalization.cpp new file mode 100644 index 00000000..a79c0f39 --- /dev/null +++ b/dnn/src/common/batch_normalization.cpp @@ -0,0 +1,64 @@ +/** + * \file dnn/src/common/batch_normalization.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void BNForward::deduce_layout(const TensorLayout& src, TensorLayout&, + TensorLayout&, TensorLayout&, TensorLayout&, + TensorLayout&, TensorLayout&, TensorLayout& dst) { + dst = src; +} + +void BNForward::check_exec(const TensorLayout& src, const TensorLayout& bn_scale, + const TensorLayout& bn_bias, const TensorLayout& mean, + const TensorLayout& variance, + const TensorLayout& batch_mean, + const TensorLayout& batch_inv_variance, + const TensorLayout& dst, size_t workspace_in_bytes) { + megdnn_assert_contiguous(src); + megdnn_assert_eq_layout(src, dst); + megdnn_assert_eq_layout(bn_scale, bn_bias); + + megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); + megdnn_assert(bn_scale.dtype.category() == DTypeCategory::FLOAT); + auto required_workspace_in_bytes = + get_workspace_in_bytes(src, bn_scale, bn_bias, mean, variance, + batch_mean, batch_inv_variance, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void BNBackward::check_exec(const TensorLayout& x, const TensorLayout& dy, + const TensorLayout& saved_batch_mean, + const TensorLayout& saved_batch_variance, + const TensorLayout& bn_scale, + const TensorLayout& d_bn_scale, + const TensorLayout& d_bn_bias, + const TensorLayout& dx, size_t workspace_in_bytes) { + megdnn_assert_contiguous(x); + megdnn_assert_eq_layout(x, dy); + megdnn_assert_eq_layout(x, dx); + megdnn_assert_eq_layout(saved_batch_mean, d_bn_bias); + megdnn_assert_eq_layout(saved_batch_mean, d_bn_scale); + megdnn_assert_eq_layout(saved_batch_mean, saved_batch_variance); + megdnn_assert_eq_layout(saved_batch_mean, bn_scale); + megdnn_assert(x.dtype.category() == DTypeCategory::FLOAT); + megdnn_assert(bn_scale.dtype.category() == DTypeCategory::FLOAT); + auto required_workspace_in_bytes = + get_workspace_in_bytes(x, dy, saved_batch_mean, saved_batch_variance, + bn_scale, d_bn_scale, d_bn_bias, dx); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/batched_matrix_mul.cpp b/dnn/src/common/batched_matrix_mul.cpp new file mode 100644 index 00000000..d1093742 --- /dev/null +++ b/dnn/src/common/batched_matrix_mul.cpp @@ -0,0 +1,97 @@ +/** + * \file dnn/src/common/batched_matrix_mul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +namespace megdnn { + +void BatchedMatrixMulForward::deduce_dtype(DType A, DType B, DType &C) { + DType C_candi, C_candi2; + if (A.category() == DTypeCategory::FLOAT) { + C_candi = A; + } else if (A.enumv() == DTypeEnum::Int8) { + C_candi = dtype::Int32(); + C_candi2 = dtype::Int16(); + } else if (A.enumv() == DTypeEnum::QuantizedS8) { + C_candi = dtype::QuantizedS32(mul_scale(A, B)); + } else if (A.enumv() == DTypeEnum::Quantized8Asymm) { + C_candi = dtype::QuantizedS32(mul_scale(A, B)); + } else if (A.enumv() == DTypeEnum::Quantized4Asymm) { + C_candi = dtype::QuantizedS32(mul_scale(A, B)); + } + if (!C.valid()) { + C = C_candi; + } + megdnn_assert(C.valid() && (C == C_candi || C == C_candi2), + "unsupported BatchedMatMul(%s, %s) -> %s", A.name(), B.name(), + C.name()); +} +void BatchedMatrixMulForward::deduce_layout(const TensorLayout& A, + const TensorLayout& B, + TensorLayout& C) { + auto errmsg = [&]() { + std::string msg; + msg.append(megdnn_mangle("A=")); + msg.append(A.to_string()); + msg.append(megdnn_mangle(", B=")); + msg.append(B.to_string()); + msg.append(megdnn_mangle(", C=")); + msg.append(C.to_string()); + msg.append(megdnn_mangle(", transposeA=")); + msg.append(std::to_string(m_param.transposeA)); + msg.append(megdnn_mangle(", transposeB=")); + msg.append(std::to_string(m_param.transposeB)); + return msg; + }; + MEGDNN_MARK_USED_VAR(errmsg); + auto good_layout = [](const TensorLayout& l) { + // l.stride[0] == 0 because im2col conv need batched matrixmul and + // filter tensor need to be broadcasted. It's only implemented in + // opencl. + return l.ndim == 3 && l.stride[2] == 1 && + l.stride[1] >= static_cast(l.shape[2]) && + (l.shape[0] == 1 || + l.stride[0] >= + static_cast(l.shape[1]) * l.stride[1] || + l.stride[0] == 0); + }; + size_t A0, A1, B0, B1; + A0 = A.shape[1]; + A1 = A.shape[2]; + B0 = B.shape[1]; + B1 = B.shape[2]; + if (m_param.transposeA) + std::swap(A0, A1); + if (m_param.transposeB) + std::swap(B0, B1); + deduce_dtype(A.dtype, B.dtype, C.dtype); + megdnn_assert(good_layout(A) && good_layout(B) && A1 == B0 && + A[0] == B[0] && A.dtype.enumv() == B.dtype.enumv(), + "bad input layouts: %s", errmsg().c_str()); + C = TensorLayout(TensorShape({A[0], A0, B1}), C.dtype); +} + +void BatchedMatrixMulForward::check_exec(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C, + size_t workspace_in_bytes) { + TensorLayout C_expect; + deduce_layout(A, B, C_expect); + megdnn_assert(C_expect.eq_layout(C), "bad layout for C: expect=%s got=%s", + C_expect.to_string().c_str(), C.to_string().c_str()); + auto required_workspace_in_bytes = get_workspace_in_bytes(A, B, C); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes, + "needed workspace: %zu; got: %zu", + required_workspace_in_bytes, workspace_in_bytes); +} +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/checksum.cpp b/dnn/src/common/checksum.cpp new file mode 100644 index 00000000..7a80403f --- /dev/null +++ b/dnn/src/common/checksum.cpp @@ -0,0 +1,28 @@ +/** + * \file dnn/src/common/checksum.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +using namespace megdnn; + +void megdnn::ChecksumForward::check_exec(const TensorLayout &layout, + size_t workspace_in_bytes) { + megdnn_assert(layout.is_contiguous() && + layout.ndim == 1 && + layout.dtype == dtype::Byte() && + layout.shape[0], "%s", layout.to_string().c_str()); + auto required_workspace_in_bytes = get_workspace_in_bytes(layout); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/common/concat_split.cpp b/dnn/src/common/concat_split.cpp new file mode 100644 index 00000000..cea107c1 --- /dev/null +++ b/dnn/src/common/concat_split.cpp @@ -0,0 +1,113 @@ +/** + * \file dnn/src/common/concat_split.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +#include + +namespace megdnn { + +ConcatSplitBase::ConcatSplitBase(Handle *handle): + OperatorBase(handle), + m_get_layout([](const TensorND &tensor) { return tensor.layout; }), + m_get_shape([](const TensorLayout &layout) { return TensorShape(layout); }) +{ +} + +void ConcatSplitBase::check_layout_common(const TensorLayoutArray &srcs, + const TensorLayout &dst) +{ + // ensure same data type + for (auto &&src: srcs) { + megdnn_assert(src.dtype == dst.dtype); + } + // ensure all layouts are contiguous + for (auto &&src: srcs) { + megdnn_assert_contiguous(src); + } + megdnn_assert_contiguous(dst); + // ensure all layouts have the same ndim + auto ndim = dst.ndim; + for (auto &&src: srcs) { + megdnn_assert_eq_size_t(src.ndim, ndim); + } + // ensure param().axis is correct + auto errmsg = megdnn_mangle("param().axis=") + + std::to_string(param().axis) + megdnn_mangle(", ndim=") + + std::to_string(ndim); + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert(param().axis < static_cast(ndim), "%s", + errmsg.c_str()); + // ensure shape size for each axis is correct + for (size_t i = 0; i < ndim; ++i) { + if (i == static_cast(param().axis)) { + size_t sum = 0_z; + for (auto &&src: srcs) sum += src.shape[i]; + megdnn_assert_eq_size_t(sum, dst.shape[i]); + } else { + for (auto &&src: srcs) { + megdnn_assert(src.shape[i] == dst.shape[i]); + megdnn_assert_eq_size_t(src.shape[i], dst.shape[i]); + } + } + } +} + +void ConcatSplitBase::get_ABC(const TensorShapeArray &srcs, + size_t &A, + size_t *B, + size_t &C) +{ + auto axis = param().axis; + auto shape_arr = srcs[0].shape; + auto ndim = srcs[0].ndim; + A = std::accumulate(shape_arr, shape_arr + axis, + 1_z, SafeMultiplies()); + for (size_t i = 0u; i < srcs.size(); ++i) { + B[i] = srcs[i].shape[axis]; + } + C = std::accumulate(shape_arr + (axis+1), shape_arr + ndim, + 1_z, SafeMultiplies()); +} + +void ConcatForward::deduce_layout(const TensorLayoutArray &srcs, + TensorLayout &dst) +{ + dst = srcs[0]; + auto i = param().axis; + dst.shape[i] = 0u; + for (auto &&src: srcs) { + dst.shape[i] += src.shape[i]; + } + dst.init_contiguous_stride(); +} + +void ConcatForward::check_exec(const TensorLayoutArray &srcs, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_common(srcs, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(srcs, dst); + 
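+    // the workspace supplied by the caller must cover the size this operator
+    // reported for the same layouts via get_workspace_in_bytes()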
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void SplitForward::check_exec(const TensorLayout &src, + const TensorLayoutArray &dsts, + size_t workspace_in_bytes) +{ + check_layout_common(dsts, src); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dsts); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cond_take/opr_impl.cpp b/dnn/src/common/cond_take/opr_impl.cpp new file mode 100644 index 00000000..3d5e8c43 --- /dev/null +++ b/dnn/src/common/cond_take/opr_impl.cpp @@ -0,0 +1,36 @@ +/** + * \file dnn/src/common/cond_take/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +using namespace megdnn; + +size_t CondTake::check_exec_get_size(const TensorLayout& data, + const TensorLayout& mask, + size_t workspace_in_bytes) { + megdnn_assert(data.eq_shape(mask), + "CondTake shape differs: data=%s mask=%s", + data.TensorShape::to_string().c_str(), + mask.TensorShape::to_string().c_str()); + megdnn_assert(data.is_physical_contiguous() && + mask.is_physical_contiguous()); + megdnn_assert(m_param.eps > 0, "eps must be non-negative; got: %g", + m_param.eps); + megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data)); + return data.total_nr_elems(); +} + +CondTake::OutputDType CondTake::infer_dtype(DType data, DType /*mask*/) { + return {{data, dtype::Int32()}}; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cond_take/predicate.cuh b/dnn/src/common/cond_take/predicate.cuh new file mode 100644 index 00000000..75359a6f --- /dev/null +++ b/dnn/src/common/cond_take/predicate.cuh @@ -0,0 +1,115 @@ +/** + * \file dnn/src/common/cond_take/predicate.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once + +#include "src/common/opr_param_defs_enumv.cuh" +#include "megdnn/arch.h" + +#if MEGDNN_CC_HOST +#include "megdnn/opr_param_defs.h" +#endif + +#ifndef __device__ +#define __device__ +#define __host__ +#define def_device +#endif + +#include + +namespace megdnn { +namespace cond_take { + typedef param_enumv::CondTake::Mode PEnum; + + struct KParam { + float val, eps; +#if MEGDNN_CC_HOST + KParam(const param::CondTake &p): + val(p.val), eps(p.eps) + {} +#endif + }; + + template + struct Pred; + +#define do_inst_eq_f(_ct) \ + template<> \ + struct Pred { \ + typedef _ct ctype; \ + ctype val, eps; \ + Pred(const KParam &p): val(p.val), eps(p.eps) {} \ + __device__ __host__ bool operator() (ctype x) const { \ + return fabsf(val - x) < eps; \ + } \ + }; + +#define do_inst_eq_i(_ct) \ + template<> \ + struct Pred { \ + typedef _ct ctype; \ + ctype val; \ + Pred(const KParam &p): val(p.val) {} \ + __device__ __host__ bool operator() (ctype x) const { \ + return val == x; \ + } \ + }; + +#define inst_eq_f(_dt) do_inst_eq_f(DTypeTrait<_dt>::ctype) +#define inst_eq_i(_dt) do_inst_eq_i(DTypeTrait<_dt>::ctype) + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(inst_eq_f) + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(inst_eq_i) +#undef inst_eq_f +#undef inst_eq_i + + template + struct Pred { + typedef ctype_ ctype; + Pred eq; + + Pred(const KParam &p): eq(p) {} + + __device__ __host__ bool operator() (ctype x) const { + return !this->eq(x); + } + }; + +#define DEF_OP(_name, _op) \ + template \ + struct Pred { \ + typedef ctype_ ctype; \ + ctype val; \ + Pred(const KParam &p): val(p.val) {} \ + __device__ __host__ bool operator() (ctype x) const { \ + return x _op val; \ + } \ + } + + DEF_OP(LT, < ); + DEF_OP(LEQ, <= ); + DEF_OP(GT, > ); + DEF_OP(GEQ, >= ); + +#undef DEF_OP + +#define MEGDNN_FOREACH_COND_TAKE_MODE(cb) \ + cb(EQ) cb(NEQ) cb(LT) cb(LEQ) cb(GT) cb(GEQ) + +} // namespace cond_take +} // namespace megdnn + +#ifdef def_device +#undef __device__ +#undef __host__ +#endif + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/common/conv_bias.cpp b/dnn/src/common/conv_bias.cpp new file mode 100644 index 00000000..bf1fc50e --- /dev/null +++ b/dnn/src/common/conv_bias.cpp @@ -0,0 +1,378 @@ +/** + * \file dnn/src/common/conv_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/common/conv_bias.h" +#include "megdnn/oprs/nn.h" +#include "src/common/utils.h" + +namespace megdnn { + +void ConvBiasForward::deduce_dtype(DType src, DType filter, DType /* bias */, + DType /* z */, DType& dst) { + check_or_deduce_dtype_fwd(src, filter, dst); +} + +void ConvBiasForward::deduce_layout(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& /* bias */, + const TensorLayout& /* z */, + TensorLayout& dst) { + deduce_layout_fwd(src, filter, dst); +} + +ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst, size_t workspace_in_bytes) { + if ((param().format == param::ConvBias::Format::NCHW_WINOGRAD || + param().format == param::ConvBias::Format::NCHW88_WINOGRAD) && + src.dtype.category() == DTypeCategory::QUANTIZED) { + megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16); + megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || + src.dtype.enumv() == DTypeEnum::Quantized8Asymm); + } else { + megdnn_assert(src.dtype.enumv() == filter.dtype.enumv()); + } + if (src.dtype.enumv() == DTypeEnum::QuantizedS8) { + float scale_src = src.dtype.param().scale; + float scale_filter = 0.f; + if (param().format == param::ConvBias::Format::NCHW_WINOGRAD || + param().format == param::ConvBias::Format::NCHW88_WINOGRAD) { + scale_filter = filter.dtype.param().scale; + } else { + scale_filter = filter.dtype.param().scale; + } + float scale_bias = bias.dtype.param().scale; + megdnn_assert(std::abs(scale_src * scale_filter - scale_bias) < 1e-6, + "scale_src: %f scale_filter: %f scale_bias: %f", + scale_src, scale_filter, scale_bias); + } else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) { + float scale_src = src.dtype.param().scale; + float scale_filter = 0.f; + if (param().format == param::ConvBias::Format::NCHW_WINOGRAD || + param().format == param::ConvBias::Format::NCHW88_WINOGRAD) { + scale_filter = filter.dtype.param().scale; + } else { + scale_filter = filter.dtype.param().scale; + } + float scale_bias = bias.dtype.param().scale; + megdnn_assert(std::abs(scale_src * scale_filter - scale_bias) < 1e-6, + "scale_src: %f scale_filter: %f scale_bias: %f", + scale_src, scale_filter, scale_bias); + } + + auto ret = check_layout_fwd(src, filter, dst); + megdnn_assert_contiguous(bias); + auto required_workspace_in_bytes = + get_workspace_in_bytes(src, filter, bias, z, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + if (bias.ndim != 0) { + //! 
bias.layout == dst.layout failed, no assert information + auto check_eq = [](const TensorLayout& bias, const TensorLayout& dst) { + if (dst.dtype.category() == DTypeCategory::QUANTIZED) { + return bias.eq_shape(dst); + } else { + return bias.eq_layout(dst); + } + }; + if (check_eq(bias, dst)) + return ret; + if (param().format == param::ConvBias::Format::NCHW || + param().format == param::ConvBias::Format::NCHW_WINOGRAD) { + megdnn_assert(bias.shape[0] == 1); + megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", + bias.to_string().c_str(), dst.to_string().c_str()); + megdnn_assert(bias.shape[2] == 1); + megdnn_assert(bias.shape[3] == 1); + } else if (param().format == param::ConvBias::Format::NHWC) { + megdnn_assert(bias.shape[0] == 1); + megdnn_assert(bias.shape[1] == 1); + megdnn_assert(bias.shape[2] == 1); + megdnn_assert(bias.shape[3] == dst.shape[3], "bias:%s, dst:%s", + bias.to_string().c_str(), dst.to_string().c_str()); + } else if (param().format == param::ConvBias::Format::NCHW4) { + megdnn_assert(bias.shape[0] == 1); + megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", + bias.to_string().c_str(), dst.to_string().c_str()); + megdnn_assert(bias.shape[2] == 1); + megdnn_assert(bias.shape[3] == 1); + megdnn_assert(bias.shape[4] == 4); + } else if (param().format == param::ConvBias::Format::NCHW8 || + param().format == param::ConvBias::Format::NCHW88 || + param().format == param::ConvBias::Format::NCHW88_WINOGRAD) { + megdnn_assert(bias.shape[0] == 1); + megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", + bias.to_string().c_str(), dst.to_string().c_str()); + megdnn_assert(bias.shape[2] == 1); + megdnn_assert(bias.shape[3] == 1); + megdnn_assert(bias.shape[4] == 8); + } else if (param().format == param::ConvBias::Format::NCHW32) { + megdnn_assert(bias.shape[0] == 1); + megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", + bias.to_string().c_str(), dst.to_string().c_str()); + megdnn_assert(bias.shape[2] == 1); + megdnn_assert(bias.shape[3] == 1); + megdnn_assert(bias.shape[4] == 32); + } else if (param().format == param::ConvBias::Format::CHWN4) { + megdnn_assert(bias.shape[0] == dst.shape[0], "bias:%s, dst:%s", + bias.to_string().c_str(), dst.to_string().c_str()); + megdnn_assert(bias.shape[1] == 1); + megdnn_assert(bias.shape[2] == 1); + megdnn_assert(bias.shape[3] == 1); + megdnn_assert(bias.shape[4] == 4); + } else { + megdnn_assert(param().format == param::ConvBias::Format::NHWCD4); + megdnn_assert(bias.shape[0] == 1); + megdnn_assert(bias.shape[1] == 1); + megdnn_assert(bias.shape[2] == dst.shape[2], "bias:%s, dst:%s", + bias.to_string().c_str(), dst.to_string().c_str()); + megdnn_assert(bias.shape[3] == 1); + megdnn_assert(bias.shape[4] == 4); + } + } + + if (z.ndim != 0) { + megdnn_assert(param().format != param::ConvBias::Format::NCHW_WINOGRAD); + megdnn_assert(param().format != param::ConvBias::Format::NCHW88_WINOGRAD); + megdnn_assert(z.dtype.enumv() == dst.dtype.enumv()); + megdnn_assert(z.eq_shape(dst)); + } + return ret; +} + +template +struct ParamTrait; + +std::string ConvBias::WinogradParam::to_string() const { + return ssprintf("%u:%u:%u", channel_block_size, output_block_size, + tile_size); +} + +template +std::string ConvBias::algo_name(const std::string& base, const T& p) { + return ssprintf("%s:%s:%s", ParamTrait::category.c_str(), base.c_str(), + p.to_string().c_str()); +} + +#define FOREACH_CONV_BIAS_PARAM(cb) \ + cb(WinogradParam) \ + cb(DirectParam) \ + cb(MatmulParam) \ + cb(DefaultParam) + +#define cb(pt) \ + 
template <> \ + struct ParamTrait { \ + static const std::string category; \ + }; +FOREACH_CONV_BIAS_PARAM(cb) +#undef cb + +#define cb(pt, ct) const std::string ParamTrait::category = ct +cb(WinogradParam, "WINOGRAD"); +cb(DirectParam, "DIRECT"); +cb(MatmulParam, "MATMUL"); +cb(DefaultParam, "DEFAULT"); +#undef cb + +#define cb(t) \ + template std::string ConvBias::algo_name( \ + const std::string& base, const ConvBias::t& p); +FOREACH_CONV_BIAS_PARAM(cb) +#undef cb + +ConvBias::WinogradParam ConvBias::parse_winograd_name( + const std::string& algo_name) { + ConvBias::WinogradParam ret = INVALID_WINOGRAD_PARAM; + char base[128]; + sscanf(algo_name.c_str(), "WINOGRAD:%[^:]:%u:%u:%u", base, + &(ret.channel_block_size), &(ret.output_block_size), + &(ret.tile_size)); + if (ret.tile_size == 0 || ret.output_block_size == 0 || + ret.channel_block_size == 0) { + megdnn_log_warn("the algo name %s is not suitable for winograd", + algo_name.c_str()); + return INVALID_WINOGRAD_PARAM; + } + return ret; +} +constexpr ConvBias::WinogradParam ConvBias::INVALID_WINOGRAD_PARAM; + +void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, + const TensorND* conv_dst_tensor, + const TensorND* dst_tensor, + const TensorND* bias_tensor) { + using NonlineMode = param::ConvBias::NonlineMode; + switch (args.nonlineMode) { +#define cb(_mode) \ + case NonlineMode::_mode: { \ + if (conv_dst_tensor->layout.dtype.category() != \ + DTypeCategory::QUANTIZED) { \ + auto nonlinear = handle->create_operator(); \ + if (bias_tensor->layout.ndim > 0) { \ + nonlinear->param().mode = \ + Elemwise::Param::Mode::FUSE_ADD_##_mode; \ + nonlinear->exec({*conv_dst_tensor, *bias_tensor}, \ + *dst_tensor); \ + } else { \ + nonlinear->param().mode = Elemwise::Param::Mode::_mode; \ + nonlinear->exec({*conv_dst_tensor}, *dst_tensor); \ + } \ + } else { \ + auto nonlinear = handle->create_operator(); \ + if (bias_tensor->layout.ndim > 0) { \ + nonlinear->param().mode = \ + ElemwiseMultiType::Param::Mode::QFUSE_ADD_##_mode; \ + nonlinear->exec({*conv_dst_tensor, *bias_tensor}, \ + *dst_tensor); \ + } else { \ + nonlinear->param().mode = \ + ElemwiseMultiType::Param::Mode::Q##_mode; \ + nonlinear->exec({*conv_dst_tensor}, *dst_tensor); \ + } \ + } \ + break; \ + } + cb(RELU); + cb(H_SWISH); +#undef cb + case NonlineMode::SIGMOID: { + megdnn_assert(conv_dst_tensor->layout.dtype.category() != + DTypeCategory::QUANTIZED); + auto nonlinear = handle->create_operator(); + if (bias_tensor->layout.ndim > 0) { + nonlinear->param().mode = + Elemwise::Param::Mode::FUSE_ADD_SIGMOID; + nonlinear->exec({*conv_dst_tensor, *bias_tensor}, + *conv_dst_tensor); + } else { + nonlinear->param().mode = Elemwise::Param::Mode::SIGMOID; + nonlinear->exec({*conv_dst_tensor}, *conv_dst_tensor); + } + break; + } + case NonlineMode::IDENTITY: { + if (bias_tensor->layout.ndim > 0) { + if (dst_tensor->layout.dtype.category() == + DTypeCategory::QUANTIZED) { + auto nonlinear = + handle->create_operator(); + nonlinear->param().mode = + ElemwiseMultiType::Param::Mode::QADD; + nonlinear->exec({*conv_dst_tensor, *bias_tensor}, + *dst_tensor); + } else { + auto nonlinear = handle->create_operator(); + nonlinear->param().mode = Elemwise::Param::Mode::ADD; + nonlinear->exec({*conv_dst_tensor, *bias_tensor}, + *dst_tensor); + } + } else { + if (conv_dst_tensor->layout.dtype != dst_tensor->layout.dtype) { + handle->create_operator()->exec({*conv_dst_tensor}, + *dst_tensor); + } + } + break; + } + default: + megdnn_assert(false); + } +} + +//! 
Only used for naive implementation. DO NOT use the following function in +//! other backends. +void handle_z_inp_and_activation(Handle* handle, + param::ConvBias::NonlineMode nonline_mode, + const TensorND& conv_bias_tensor, + const TensorND& z_tensor, + const TensorND& dst_tensor, + dt_byte* workspace_ptr) { + auto res = dst_tensor, z_float = z_tensor; + if (z_tensor.layout.ndim > 0 && + z_tensor.layout.dtype.category() != DTypeCategory::FLOAT) { + dt_byte *res_float_workspace_ptr = nullptr, + *z_float_workspace_ptr = nullptr; + megdnn_assert(z_tensor.layout.eq_shape(dst_tensor.layout)); + res_float_workspace_ptr = workspace_ptr; + z_float_workspace_ptr = res_float_workspace_ptr + + TensorLayout{z_tensor.layout, dtype::Float32()} + .span() + .dist_byte(); + res = TensorND{res_float_workspace_ptr, + TensorLayout{dst_tensor.layout, dtype::Float32()}}; + z_float = TensorND{z_float_workspace_ptr, + TensorLayout{z_tensor.layout, dtype::Float32()}}; + } + // ====================sfb + z_tensor===================== + if (z_tensor.layout.ndim > 0) { + if (z_tensor.layout.dtype.category() != DTypeCategory::FLOAT) { + auto&& type_cvt = handle->create_operator(); + type_cvt->exec(conv_bias_tensor, res); + type_cvt->exec(z_tensor, z_float); + } + auto add_opr = handle->create_operator(); + add_opr->param().mode = Elemwise::Param::Mode::ADD; + add_opr->exec({res, z_float}, res); + } else { + res = conv_bias_tensor; + } + + using NonlineMode = param::ConvBias::NonlineMode; + + switch (nonline_mode) { +#define cb(_mode) \ + case NonlineMode::_mode: { \ + if (res.layout.dtype.category() != DTypeCategory::QUANTIZED) { \ + auto nonlinear = handle->create_operator(); \ + nonlinear->param().mode = Elemwise::Param::Mode::_mode; \ + if (res.layout.dtype == dst_tensor.layout.dtype) { \ + nonlinear->exec({res}, dst_tensor); \ + } else { \ + nonlinear->exec({res}, res); \ + handle->create_operator()->exec(res, dst_tensor); \ + } \ + } else { \ + auto nonlinear = handle->create_operator(); \ + nonlinear->param().mode = \ + ElemwiseMultiType::Param::Mode::Q##_mode; \ + nonlinear->exec({res}, dst_tensor); \ + } \ + break; \ + } + cb(RELU); + cb(H_SWISH); +#undef cb + case NonlineMode::SIGMOID: { + megdnn_assert(res.layout.dtype.category() != + DTypeCategory::QUANTIZED); + auto nonlinear = handle->create_operator(); + nonlinear->param().mode = Elemwise::Param::Mode::SIGMOID; + nonlinear->exec({res}, res); + if (res.raw_ptr != dst_tensor.raw_ptr) { + handle->create_operator()->exec(res, dst_tensor); + } + break; + } + case NonlineMode::IDENTITY: { + if (res.raw_ptr != dst_tensor.raw_ptr) { + handle->create_operator()->exec(res, dst_tensor); + } + break; + } + default: + megdnn_assert(false); + } +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/conv_bias.h b/dnn/src/common/conv_bias.h new file mode 100644 index 00000000..01810b27 --- /dev/null +++ b/dnn/src/common/conv_bias.h @@ -0,0 +1,34 @@ +/** + * \file dnn/src/common/conv_bias.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
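handle_bias_and_nonlinear / handle_z_inp_and_activation above implement the naive reference epilogue: add the bias (and the optional z residual input), then apply the nonlinearity, dispatching to Elemwise or ElemwiseMultiType depending on whether the tensors are quantized. A float-only sketch of the same dataflow, using a hypothetical helper name and RELU as the example nonlinearity:

#include <algorithm>
#include <cstddef>
#include <vector>

// Float-only reference of the naive ConvBias epilogue:
//   dst = nonlinearity(conv_result + bias + z)
std::vector<float> conv_bias_epilogue(const std::vector<float>& conv_out,
                                      const std::vector<float>& bias,  // broadcast already applied
                                      const std::vector<float>& z) {   // empty if no residual input
    std::vector<float> dst(conv_out.size());
    for (size_t i = 0; i < conv_out.size(); ++i) {
        float v = conv_out[i] + (bias.empty() ? 0.f : bias[i]) +
                  (z.empty() ? 0.f : z[i]);
        dst[i] = std::max(v, 0.f);  // RELU
    }
    return dst;
}
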
+ */ + +#include "megdnn/handle.h" +#include "megdnn/opr_param_defs.h" +#include "megdnn/oprs/general.h" +#include "megdnn/oprs/nn_int.h" +#include "src/common/utils.h" + +namespace megdnn { + +void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, + const TensorND* conv_dst_tensor, + const TensorND* dst_tensor, + const TensorND* bias_tensor); + +void handle_z_inp_and_activation(Handle* handle, + param::ConvBias::NonlineMode nonline_mode, + const TensorND& conv_bias_tensor, + const TensorND& z_tensor, + const TensorND& dst_tensor, + dt_byte* workspace_ptr); + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/conv_pooling.cpp b/dnn/src/common/conv_pooling.cpp new file mode 100644 index 00000000..ce2a3985 --- /dev/null +++ b/dnn/src/common/conv_pooling.cpp @@ -0,0 +1,17 @@ +/** + * \file dnn/src/common/conv_pooling.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn.h" +#include "src/common/utils.h" + +namespace megdnn { + + +} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp new file mode 100644 index 00000000..d8ebfea4 --- /dev/null +++ b/dnn/src/common/convolution.cpp @@ -0,0 +1,1063 @@ +/** + * \file dnn/src/common/convolution.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/nn.h" +#include "src/common/utils.h" + +using namespace megdnn; + +namespace { +template +std::string get_errmsg(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, const Param& param) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(filter); + MEGDNN_MARK_USED_VAR(dst); + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(filter) + ", " + + megdnn_layout_msg(dst) + ", " + megdnn_mangle("is_nchw=") + + std::to_string(param.format == param::Convolution::Format::NCHW) + + ", " + +megdnn_mangle("is_xcorr=") + + std::to_string( + (param.mode == Convolution::Mode::CROSS_CORRELATION)) + + ", " + megdnn_mangle("pad_h=") + std::to_string(param.pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param.pad_w) + ", " + + megdnn_mangle("stride_h=") + std::to_string(param.stride_h) + ", " + + megdnn_mangle("stride_w=") + std::to_string(param.stride_w) + ", " + + megdnn_mangle("dilate_h=") + std::to_string(param.dilate_h) + ", " + + megdnn_mangle("dilate_w=") + std::to_string(param.dilate_w); +} + +template +uint32_t spatial_getter(uint32_t filter, const Param&) { + return filter; +} + +template <> +uint32_t +spatial_getter( + uint32_t filter, const param::ConvBias& param) { + //! f = m + r - 1 -> r = f + 1 - m + return filter - param.output_block_size + 1; +} + +template <> +uint32_t +spatial_getter( + uint32_t filter, const param::ConvBias& param) { + //! 
f = m + r - 1 -> r = f + 1 - m + return filter - param.output_block_size + 1; +} + + +template +void make_canonized_filter_meta_nchw_nhwc( + size_t src_ndim, const TensorLayout& filter, const Param& param, + typename ConvolutionBase::CanonizedFilterMeta& ret) { + megdnn_assert(param.format == Param::Format::NCHW || + param.format == Param::Format::NHWC || + param.format == Param::Format::NCHW_WINOGRAD); + auto img_ndim = src_ndim - 2; + size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; + if (param.sparse == Param::Sparse::DENSE) { + megdnn_assert( + filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, + "bad filter ndim for dense convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + ret.group = 1; + flt_start = 0; + } else { + megdnn_assert(param.sparse == Param::Sparse::GROUP, + "invalid convolution sparse type"); + megdnn_assert( + filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, + "bad filter ndim for group convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + + // grp, oc, ic, dims[] + ret.group = filter[0]; + flt_start = 1; + } + + uint32_t ic_block_size = 1, oc_block_size = 1; + if (param.format == Param::Format::NCHW) { + // filter should be (oc, ic, fh, fw) + flt_spatial_start = 2; + ocpg_pos = 0; + icpg_pos = 1; + } else if (param.format == Param::Format::NCHW_WINOGRAD) { + // filter should be (alphah, alphaw, ic, oc) or (alphah, alphaw, ocb, + // icb, ic_block_size, oc_block_size) + flt_spatial_start = 0; + if (filter.ndim == flt_start + 4) { + ocpg_pos = 3; + icpg_pos = 2; + } else { + megdnn_assert(filter.ndim == flt_start + 6); + ic_block_size = filter[flt_start + 4]; + oc_block_size = filter[flt_start + 5]; + ocpg_pos = 2; + icpg_pos = 3; + } + } else { + megdnn_assert(param.format == Param::Format::NHWC, + "invalid conv tensor format"); + // filter should be (oc, fh, fw, ic) + flt_spatial_start = 1; + ocpg_pos = 0; + icpg_pos = 3; + } + ret.spatial_ndim = src_ndim - 2; + megdnn_assert( + ret.spatial_ndim == 2, + "only 2D convolution is supported, and input should be 4-dim; " + "got input dim = %zu", + src_ndim); + ret.ocpg = filter[flt_start + ocpg_pos] * oc_block_size; + ret.icpg = filter[flt_start + icpg_pos] * ic_block_size; + auto dilation = ret.dilation; + for (size_t i = 0; i < ret.spatial_ndim; ++i) { + megdnn_assert(dilation[i] > 0, + "invalid dilation on spatial dim %zu: %u", i, + dilation[i]); + if (param.format == Param::Format::NCHW_WINOGRAD) { + ret.spatial[i] = + spatial_getter( + filter[i + flt_start + flt_spatial_start], param); + } else { + ret.spatial[i] = spatial_getter( + filter[i + flt_start + flt_spatial_start], param); + } + ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1; + } +} + +template +void make_canonized_filter_meta_nhwcd4( + size_t src_ndim, const TensorLayout& filter, const Param& param, + typename ConvolutionBase::CanonizedFilterMeta& ret) { + /** + * input: N H IC/4 W 4 + * Filter: + * OC/4, FH, FW, IC, 4 [dense] + * GROUP, OC/4, FH, FW, IC, 4 [group] + * GROUP/4, 1, FH, FW, 4 [chanwise] + */ + megdnn_assert(param.format == Param::Format::NHWCD4); + auto img_ndim = src_ndim - 3; + size_t flt_start = 0, flt_spatial_start = 1; + bool is_chanwise = false; + if (param.sparse == Param::Sparse::DENSE) { + megdnn_assert(filter.ndim == img_ndim + 3, + "bad filter ndim for dense convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + // oc, ic, dims[] + ret.group = 1; + flt_start = 0; + } else { + megdnn_assert(param.sparse == 
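For the Winograd formats the stored filter holds the transformed tile of size alpha = f = m + r - 1, where m is output_block_size and r the original kernel size, so the spatial_getter specializations recover the canonical kernel size as r = f - m + 1. A quick numeric check, with values chosen purely for illustration:

#include <cassert>
#include <cstdint>

// Winograd F(2x2, 3x3): output block m = 2, kernel r = 3, so the stored
// transformed filter has f = m + r - 1 = 4; the code above recovers r.
int main() {
    uint32_t output_block_size = 2;                           // m
    uint32_t stored_filter_dim = 4;                           // f = alpha
    uint32_t r = stored_filter_dim - output_block_size + 1;   // f + 1 - m
    assert(r == 3);
    return 0;
}
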
Param::Sparse::GROUP, + "invalid convolution sparse type"); + megdnn_assert( + filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 4, + "bad filter ndim for group convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + if (filter.ndim == img_ndim + 3 && filter[1] == 1) { + is_chanwise = true; + ret.group = filter[0] * 4; + } else { + ret.group = filter[0]; + } + flt_start = 1; + } + ret.spatial_ndim = src_ndim - 3; + megdnn_assert( + ret.spatial_ndim == 2, + "only 2D convolution is supported, and input should be 4-dim; " + "got input dim = %zu", + src_ndim); + if (is_chanwise) { + ret.ocpg = 1; + ret.icpg = 1; + } else { + ret.ocpg = filter[flt_start] * 4; + ret.icpg = filter[flt_start + 3]; + } + auto dilation = ret.dilation; + for (size_t i = 0; i < ret.spatial_ndim; ++i) { + megdnn_assert(dilation[i] > 0, + "invalid dilation on spatial dim %zu: %u", i, + dilation[i]); + ret.spatial[i] = filter[i + flt_start + flt_spatial_start]; + ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1; + } +} + +template +void make_canonized_filter_meta_nhwcd4_dot( + size_t src_ndim, const TensorLayout& filter, const Param& param, + typename ConvolutionBase::CanonizedFilterMeta& ret) { + /** + * input: N H IC/4 W 4 + * Filter: + * GROUP/4, 1, FH, FW, 4 [chanwise] + * OC/4, FH, FW, IC/4, 4, 4 [dense] + * GROUP, OC/4, FH, FW, IC/4, 4, 4 [group] + */ + megdnn_assert(param.format == Param::Format::NHWCD4); + auto img_ndim = src_ndim - 3; + size_t flt_start = 0, flt_spatial_start = 1; + bool is_chanwise = false; + if (param.sparse == Param::Sparse::DENSE) { + megdnn_assert(filter.ndim == img_ndim + 4, + "bad filter ndim for dense convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + // oc, ic, dims[] + ret.group = 1; + flt_start = 0; + } else { + megdnn_assert(param.sparse == Param::Sparse::GROUP, + "invalid convolution sparse type"); + megdnn_assert( + filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, + "bad filter ndim for group convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + if (filter.ndim == img_ndim + 3) { + megdnn_assert(filter[1] == 1); + is_chanwise = true; + ret.group = filter[0] * 4; + } else { + ret.group = filter[0]; + } + flt_start = 1; + } + ret.spatial_ndim = src_ndim - 3; + megdnn_assert( + ret.spatial_ndim == 2, + "only 2D convolution is supported, and input should be 4-dim; " + "got input dim = %zu", + src_ndim); + if (is_chanwise) { + ret.ocpg = 1; + ret.icpg = 1; + } else { + ret.ocpg = filter[flt_start] * 4; + ret.icpg = filter[flt_start + 3] * 4; + } + auto dilation = ret.dilation; + for (size_t i = 0; i < ret.spatial_ndim; ++i) { + megdnn_assert(dilation[i] > 0, + "invalid dilation on spatial dim %zu: %u", i, + dilation[i]); + ret.spatial[i] = filter[i + flt_start + flt_spatial_start]; + ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1; + } +} + +template +void make_canonized_filter_meta_nchwxx( + size_t src_ndim, const TensorLayout& filter, const Param& param, + typename ConvolutionBase::CanonizedFilterMeta& ret) { + /** + * input: N IC/pack_size, H, W, pack_size + * + * NCHW88 mode + * filter: + * {OC/pack_size, IC/pack_size, FH, FW, pack_size(IC), pack_size(OC)} + * [dense] + * {GROUP, OC_PER_GROUP/pack_size, IC_PER_GROUP/pack_size, \ + * FH, FW, pack_size(IC), pack_size(OC)} [group] + * {GROUP/pack_size, 1, 1, FH, FW, pack_size} [chan] + * + ** NCHW88_WINOGRAD mode + * filter: + * {alpha, alpha, OC/pack_size, IC/pack_size, pack_size(IC), + *pack_size(OC)} 
[dense] + * {GROUP, alpha, alpha, OC_PER_GROUP/pack_size, + * IC_PER_GROUP/pack_size, pack_size(IC), pack_size(OC)} [group] + * + */ + + megdnn_assert(param.format == Param::Format::NCHW88 || + param.format == Param::Format::NCHW88_WINOGRAD); + size_t img_ndim = 2; + size_t flt_start = 0; + size_t flt_spatial_start = 2; + if (param.sparse == Param::Sparse::DENSE) { + if (filter.ndim == img_ndim + 4) { + // oihw8i8o case + megdnn_assert(filter[filter.ndim - 2] == pack_size && + filter[filter.ndim - 1] == pack_size, + "last 2 dim of filter must be %zu, but got %zu, %zu", + pack_size, filter[filter.ndim - 2], + filter[filter.ndim - 1]); + ret.group = 1; + flt_start = 0; + if (param.format == Param::Format::NCHW88_WINOGRAD) { + flt_start = 2; + } + ret.ocpg = filter[flt_start] * pack_size; + ret.icpg = filter[flt_start + 1] * pack_size; + } else if (filter.ndim == img_ndim + 3) { + // ohwi8o + megdnn_assert(param.format != Param::Format::NCHW88_WINOGRAD, + "Hybrid nchw88 mode in not support winograd"); + flt_start = 0; + flt_spatial_start = 1; + ret.group = 1; + ret.ocpg = filter[flt_start] * pack_size; + ret.icpg = filter[flt_start + 3]; + + } else { + megdnn_assert(0, "not support nchw88 filter dim = %zu", + filter.ndim); + } + } else { + megdnn_assert(param.sparse == Param::Sparse::GROUP, + "invalid convolution sparse type"); + flt_start = 1; + if (param.format == Param::Format::NCHW88_WINOGRAD) { + flt_start = 3; + } + auto filter_oc = filter[flt_start]; + auto filter_ic = filter[flt_start + 1]; + if (filter_oc == 1 && filter_ic == 1 && filter.ndim == (img_ndim + 4) && + param.format != Param::Format::NCHW88_WINOGRAD) { + // Depthwise case goihw8g + megdnn_assert(filter.ndim == img_ndim + 4, + "bad filter ndim for group convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + megdnn_assert(filter[filter.ndim - 1] == pack_size, + "last dim of filter must be %zu, but %zu", pack_size, + filter[filter.ndim - 1]); + ret.group = filter[0] * 8; + ret.ocpg = filter_oc; + ret.icpg = filter_ic; + + } else { + // norm group case goihw8i8o + megdnn_assert(filter.ndim == img_ndim + 5, + "bad filter ndim for group convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + megdnn_assert(filter[filter.ndim - 1] == pack_size && + filter[filter.ndim - 2] == pack_size, + "last 2 dim of filter must be %zu, but got %zu, %zu", + pack_size, filter[filter.ndim - 2], + filter[filter.ndim - 1]); + + ret.group = filter[0]; + ret.ocpg = filter_oc * pack_size; + ret.icpg = filter_ic * pack_size; + } + } + ret.spatial_ndim = 2; + megdnn_assert(ret.spatial_ndim == 2, + "only 2D convolution is supported, and input should be 5-dim " + "for nchwxx; " + "got input dim = %zu", + src_ndim); + + auto dilation = ret.dilation; + for (size_t i = 0; i < ret.spatial_ndim; ++i) { + megdnn_assert(dilation[i] == 1, + "NCHWXX has invalid dilation on spatial dim %zu: %u, " + "require to be 1", + i, dilation[i]); + if (param.format == Param::Format::NCHW88_WINOGRAD) { + ret.spatial[i] = + spatial_getter( + filter[i + flt_start - 2], param); + } else { + ret.spatial[i] = filter[i + flt_start + flt_spatial_start]; + } + ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1; + } +} + +template +void make_canonized_filter_meta_nchwx( + size_t src_ndim, const TensorLayout& filter, const Param& param, + typename ConvolutionBase::CanonizedFilterMeta& ret) { + /** + * input: N IC/pack_size, H, W, pack_size + * filter: + * OC, IC/pack_size, FH, FW, pack_size [dense] + * GROUP, OC, 
IC/pack_size, FH, FW, pack_size [group] + */ + megdnn_assert(param.format == Param::Format::NCHW4 || + param.format == Param::Format::NCHW8 || + param.format == Param::Format::NCHW32); + auto img_ndim = src_ndim - 3; + size_t flt_start = 0, flt_spatial_start = 2; + if (param.sparse == Param::Sparse::DENSE) { + megdnn_assert(filter.ndim == img_ndim + 3, + "bad filter ndim for dense convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + // oc, ic, dims[] + ret.group = 1; + flt_start = 0; + } else { + megdnn_assert(param.sparse == Param::Sparse::GROUP, + "invalid convolution sparse type"); + megdnn_assert(filter.ndim == img_ndim + 4, + "bad filter ndim for group convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + ret.group = filter[0]; + flt_start = 1; + } + ret.spatial_ndim = src_ndim - 3; + megdnn_assert(ret.spatial_ndim == 2, + "only 2D convolution is supported, and input should be 5-dim " + "for nchw4; " + "got input dim = %zu", + src_ndim); + ret.ocpg = filter[flt_start]; + ret.icpg = filter[flt_start + 1] * pack_size; + auto dilation = ret.dilation; + for (size_t i = 0; i < ret.spatial_ndim; ++i) { + megdnn_assert(dilation[i] == 1, + "NCHW4 has invalid dilation on spatial dim %zu: %u, " + "require to be 1", + i, dilation[i]); + ret.spatial[i] = filter[i + flt_start + flt_spatial_start]; + ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1; + } +} + +template +void make_canonized_filter_meta_chwnx( + size_t src_ndim, const TensorLayout& filter, const Param& param, + typename ConvolutionBase::CanonizedFilterMeta& ret) { + /** + * input: IC / pack_size, H, W, N, pack_size + * Filter: + * IC / pack_size, FH, FW, OC, pack_size [dense] + * GROUP, icpg / pack_size, FH, FW, ocpg, pack_size [group] + * not implemented [chanwise] + */ + megdnn_assert(param.format == Param::Format::CHWN4); + auto img_ndim = src_ndim - 3; + size_t flt_start = 0, flt_spatial_start = 1; + if (param.sparse == Param::Sparse::DENSE) { + megdnn_assert(filter.ndim == img_ndim + 3, + "bad filter ndim for dense convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + // oc, ic, dims[] + ret.group = 1; + flt_start = 0; + } else { + megdnn_assert(param.sparse == Param::Sparse::GROUP, + "invalid convolution sparse type"); + megdnn_assert(filter.ndim == img_ndim + 4, + "bad filter ndim for group convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + ret.group = filter[0]; + flt_start = 1; + } + ret.spatial_ndim = src_ndim - 3; + megdnn_assert( + ret.spatial_ndim == 2, + "only 2D convolution is supported, and input should be 4-dim; " + "got input dim = %zu", + src_ndim); + ret.icpg = filter[flt_start] * pack_size; + ret.ocpg = filter[flt_start + 3]; + auto dilation = ret.dilation; + for (size_t i = 0; i < ret.spatial_ndim; ++i) { + megdnn_assert(dilation[i] == 1, + "CHWNx has invalid dilation on spatial dim %zu: %u, " + "require to be 1", + i, dilation[i]); + ret.spatial[i] = filter[i + flt_start + flt_spatial_start]; + ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1; + } +} + +} // namespace + +namespace megdnn { +template +typename ConvolutionBase::CanonizedFilterMeta +ConvolutionBase::make_canonized_filter_meta( + size_t src_ndim, const TensorLayout& filter) const { + megdnn_assert_contiguous(filter); + CanonizedFilterMeta ret; + ret.dtype = filter.dtype; + ret.format = param().format; + if (param().mode == Mode::CONVOLUTION) { + ret.should_flip = true; + } else { + megdnn_assert(param().mode 
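The NCHWx formats above pack channels into blocks of pack_size, with the block index in dim 1 and the intra-block lane in the last dim. Under that usual channel-blocking interpretation, the logical-to-physical index mapping for NCHW4 looks like the following sketch (illustrative only):

#include <array>
#include <cassert>
#include <cstddef>

// Index mapping for an NCHW4 tensor: logical (n, c, h, w) lives at
// physical (n, c / 4, h, w, c % 4).
static std::array<size_t, 5> nchw_to_nchw4(size_t n, size_t c, size_t h, size_t w) {
    return {n, c / 4, h, w, c % 4};
}

int main() {
    auto idx = nchw_to_nchw4(0, 6, 10, 20);
    assert(idx[1] == 1 && idx[4] == 2);  // channel 6 -> block 1, lane 2
    return 0;
}
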
== Mode::CROSS_CORRELATION, + "invalid conv mode"); + ret.should_flip = false; + } + ret.stride[0] = param().stride_h; + ret.stride[1] = param().stride_w; + ret.padding[0] = param().pad_h; + ret.padding[1] = param().pad_w; + ret.dilation[0] = param().dilate_h; + ret.dilation[1] = param().dilate_w; + + if (param().format == Param::Format::NHWCD4) { + if (filter.dtype.enumv() == DTypeEnum::QuantizedS8 || + filter.dtype.enumv() == DTypeEnum::Quantized8Asymm) { + make_canonized_filter_meta_nhwcd4_dot(src_ndim, filter, + param(), ret); + } else { + make_canonized_filter_meta_nhwcd4(src_ndim, filter, + param(), ret); + } + } else if (param().format == Param::Format::NCHW4) { + make_canonized_filter_meta_nchwx<4, Parameter>(src_ndim, filter, + param(), ret); + } else if (param().format == Param::Format::NCHW8) { + make_canonized_filter_meta_nchwx<8, Parameter>(src_ndim, filter, + param(), ret); + } else if (param().format == Param::Format::NCHW88 || + param().format == Param::Format::NCHW88_WINOGRAD) { + make_canonized_filter_meta_nchwxx<8, Parameter>(src_ndim, filter, + param(), ret); + } else if (param().format == Param::Format::NCHW32) { + make_canonized_filter_meta_nchwx<32, Parameter>(src_ndim, filter, + param(), ret); + } else if (param().format == Param::Format::CHWN4) { + make_canonized_filter_meta_chwnx<4, Parameter>(src_ndim, filter, + param(), ret); + } else { + megdnn_assert(param().format == Param::Format::NHWC || + param().format == Param::Format::NCHW || + param().format == Param::Format::NCHW_WINOGRAD); + make_canonized_filter_meta_nchw_nhwc(src_ndim, filter, + param(), ret); + } + return ret; +} + +template +void ConvolutionBase::check_or_deduce_dtype_fwd(DType src, + DType filter, + DType& dst) const { + // The first one will be the default choice. + SmallVector supported_dst_dtype; + // We rely on megdnn_assert(src.enumv() == filter.enumv()) here. + if (src.category() == DTypeCategory::FLOAT) { + supported_dst_dtype.push_back(src); + } else if (src.enumv() == DTypeEnum::Int8) { + supported_dst_dtype = {dtype::Int32(), dtype::Int16()}; + } else if (src.enumv() == DTypeEnum::QuantizedS8 || + src.enumv() == DTypeEnum::Quantized8Asymm || + src.enumv() == DTypeEnum::Quantized4Asymm) { + supported_dst_dtype.push_back( + dtype::QuantizedS32(mul_scale(src, filter))); + if (dst.valid() && dst.enumv() == src.enumv()) { + supported_dst_dtype.push_back(dst); + } + } else if (src.enumv() == DTypeEnum::QuantizedS32) { + //! 
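For quantized int8 inputs the deduced accumulator type is QuantizedS32 whose scale is the product of the input and filter scales (mul_scale above); the ConvBias checks earlier in this commit assert the same relation for the bias scale. A quick numeric illustration with arbitrarily chosen scales:

#include <cassert>
#include <cmath>

int main() {
    // QuantizedS8 src and filter with example scales.
    float scale_src = 0.05f, scale_filter = 0.02f;
    // The deduced QuantizedS32 accumulator / bias scale is their product.
    float scale_bias = scale_src * scale_filter;
    assert(std::fabs(scale_bias - 0.001f) < 1e-6f);
    return 0;
}
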
ConvolutionBackwardData: s8(filter) + s8(dst) -> s32(src) + megdnn_assert(filter.enumv() == DTypeEnum::QuantizedS8); + supported_dst_dtype.push_back( + dtype::QuantizedS8(src.param().scale / + filter.param().scale)); + } else { + megdnn_throw(ssprintf("unsupported input / filter DType: %s x %s", + src.name(), filter.name())); + } + if (!dst.valid()) { + dst = supported_dst_dtype.at(0); + } else { + megdnn_assert(vec_contains(supported_dst_dtype, dst), + "unsupported Conv(%s, %s) -> %s", src.name(), + filter.name(), dst.name()); + } + megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 +#if !MEGDNN_DISABLE_FLOAT16 + || src.enumv() == DTypeEnum::Float16 +#endif + , + "ComputeMode::FLOAT32 is only available for Float16 " + "input / output."); +} + +template +typename ConvolutionBase::CanonizedFilterMeta +ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, + const TensorLayout& filter, + TensorLayout& dst) const { + auto errmsg = [&]() { return get_errmsg(src, filter, dst, param()); }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter); + megdnn_assert(src.ndim >= 3_z, "%s", errmsg().c_str()); + if (param().format == Param::Format::NCHW_WINOGRAD && + src.dtype.category() == DTypeCategory::QUANTIZED) { + megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16, "%s", + errmsg().c_str()); + megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || + src.dtype.enumv() == DTypeEnum::Quantized8Asymm, + "%s", errmsg().c_str()); + } else { + megdnn_assert(src.dtype.enumv() == filter.dtype.enumv(), "%s", + errmsg().c_str()); + } + check_or_deduce_dtype_fwd(src.dtype, filter.dtype, dst.dtype); + size_t img_dim; + if (param().format == Param::Format::NCHW || + param().format == Param::Format::NHWC || + param().format == Param::Format::NCHW_WINOGRAD) { + img_dim = src.ndim - 2; + megdnn_assert(filter.ndim >= img_dim + 2 && filter.ndim <= img_dim + 6, + "%s", errmsg().c_str()); + + } else { + megdnn_assert(param().format == Param::Format::NHWCD4 || + param().format == Param::Format::NCHW4 || + param().format == Param::Format::NCHW8 || + param().format == Param::Format::NCHW32 || + param().format == Param::Format::NCHW88 || + param().format == Param::Format::NCHW88_WINOGRAD || + param().format == Param::Format::CHWN4); + img_dim = src.ndim - 3; + if (param().format == Param::Format::NCHW88 && filter.ndim == 5) { + img_dim = src.ndim - 2; + } + megdnn_assert(filter.ndim == img_dim + 3 || + (filter.ndim == img_dim + 2 && + param().format == Param::Format::NCHW88) || + filter.ndim == img_dim + 4 || + filter.ndim == img_dim + 5, + "%s", errmsg().c_str()); + if (param().format == Param::Format::NCHW4) { + megdnn_assert(src.ndim == 5 && + (filter.ndim == 5 || filter.ndim == 6 || + filter.ndim == 7) && + src[src.ndim - 1] == 4 && + filter[filter.ndim - 1] == 4, + "NCHW4 require src and filter's ndim is 5 or 6, and " + "last shape " + "is 4 " + "but got src %s, filter %s", + src.to_string().c_str(), filter.to_string().c_str()); + } + if (param().format == Param::Format::NCHW8) { + megdnn_assert( + src.ndim == 5 && (filter.ndim == 5 || filter.ndim == 6) && + src[src.ndim - 1] == 8 && + filter[filter.ndim - 1] == 8, + "NCHW8 require src and filter's ndim is 5 or 6, and last " + "shape is 8 " + "but got src %s, filter %s", + src.to_string().c_str(), filter.to_string().c_str()); + } + if (param().format == Param::Format::NCHW32) { + megdnn_assert( + src.ndim == 5 && (filter.ndim == 5 || filter.ndim == 6) && + src[src.ndim - 1] == 32 && + 
filter[filter.ndim - 1] == 32, + "NCHW32 require src and filter's ndim is 5 or 6, and last " + "shape is 32 " + "but got src %s, filter %s", + src.to_string().c_str(), filter.to_string().c_str()); + } + if (param().format == Param::Format::NCHW88 || + param().format == Param::Format::NCHW88_WINOGRAD) { + megdnn_assert((src.ndim == 4 && filter.ndim == 5 && + filter[filter.ndim - 1] == 8) || + (src.ndim == 5 && + ((filter.ndim == 6 && + filter[filter.ndim - 1] == 8) || + (filter.ndim == 7 && + filter[filter.ndim - 1] == 8 && + filter[filter.ndim - 2] == 8)) && + src[src.ndim - 1] == 8), + "NCHW88 require src ndim is 5 and filter's ndim is 6 " + ", and last shape two is 8 but got src %s, filter %s", + src.to_string().c_str(), filter.to_string().c_str()); + } + if (param().format == Param::Format::CHWN4) { + megdnn_assert( + src.ndim == 5 && (filter.ndim == 5 || filter.ndim == 6) && + src[src.ndim - 1] == 4 && + filter[filter.ndim - 1] == 4, + "CHWN4 require src and filter's ndim is 5 or 6, and last " + "shape is 4 " + "but got src %s, filter %s", + src.to_string().c_str(), filter.to_string().c_str()); + } + } + megdnn_assert(img_dim == 2, + "currently only convolution on 2D image is supported"); + auto cflt = make_canonized_filter_meta(src.ndim, filter); + if (param().format == Param::Format::NCHW || + param().format == Param::Format::NHWC || + param().format == Param::Format::NCHW_WINOGRAD) { + size_t src_or_dst_c_pos = 0; + size_t src_or_dst_spatial_start = 0; + if (param().format == Param::Format::NCHW || + param().format == Param::Format::NCHW_WINOGRAD) { + src_or_dst_c_pos = 1; + src_or_dst_spatial_start = 2; + } else { + megdnn_assert(param().format == Param::Format::NHWC, + "invalid conv format"); + src_or_dst_c_pos = 3; + src_or_dst_spatial_start = 1; + } + megdnn_assert(cflt.icpg * cflt.group == src[src_or_dst_c_pos], "%s", + errmsg().c_str()); + if (param().format == Param::Format::NCHW_WINOGRAD) { + megdnn_assert(cflt.spatial[0] == cflt.spatial[1], + "NCHW_WINOGRAD only support conv with fh == fw"); + } + dst.ndim = src.ndim; + dst[0] = src[0]; + dst[src_or_dst_c_pos] = cflt.ocpg * cflt.group; + for (size_t i = 0; i < cflt.spatial_ndim; ++i) { + dst[i + src_or_dst_spatial_start] = infer_conv_shape( + src[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], + cflt.stride[i], cflt.padding[i]); + } + dst.init_contiguous_stride(); + } else if (param().format == Param::Format::NCHW4) { + megdnn_assert(src.ndim == 5, + "invalid src ndim for NCHW4, expected=5, got=%zu", + src.ndim); + megdnn_assert(cflt.icpg * cflt.group == src[1] * 4, + "%s icpg=%u group=%u", errmsg().c_str(), cflt.icpg, + cflt.group); + dst.ndim = src.ndim; + dst[0] = src[0]; + auto oc = cflt.ocpg * cflt.group; + megdnn_assert(oc % 4 == 0); + dst[1] = oc / 4; + dst[2] = infer_conv_shape(src[2], cflt.dilated_spatial[0], + cflt.stride[0], cflt.padding[0]); + dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], + cflt.stride[1], cflt.padding[1]); + dst[4] = 4; + } else if (param().format == Param::Format::NCHW8) { + megdnn_assert(src.ndim == 5, + "invalid src ndim for NCHW8, expected=5, got=%zu", + src.ndim); + megdnn_assert(cflt.icpg * cflt.group == src[1] * 8, + "%s icpg=%u group=%u", errmsg().c_str(), cflt.icpg, + cflt.group); + dst.ndim = src.ndim; + dst[0] = src[0]; + auto oc = cflt.ocpg * cflt.group; + megdnn_assert(oc % 8 == 0); + dst[1] = oc / 8; + dst[2] = infer_conv_shape(src[2], cflt.dilated_spatial[0], + cflt.stride[0], cflt.padding[0]); + dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], + 
cflt.stride[1], cflt.padding[1]); + dst[4] = 8; + } else if (param().format == Param::Format::NCHW32) { + megdnn_assert(src.ndim == 5, + "invalid src ndim for NCHW32, expected=5, got=%zu", + src.ndim); + megdnn_assert(cflt.icpg * cflt.group == src[1] * 32, + "%s icpg=%u group=%u", errmsg().c_str(), cflt.icpg, + cflt.group); + dst.ndim = src.ndim; + dst[0] = src[0]; + auto oc = cflt.ocpg * cflt.group; + megdnn_assert(oc % 32 == 0); + dst[1] = oc / 32; + dst[2] = infer_conv_shape(src[2], cflt.dilated_spatial[0], + cflt.stride[0], cflt.padding[0]); + dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], + cflt.stride[1], cflt.padding[1]); + dst[4] = 32; + } else if (param().format == Param::Format::NCHW88 || + param().format == Param::Format::NCHW88_WINOGRAD) { + megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), + "invalid src ndim for NCHW88, expected=5 or 4, got=%zu", + src.ndim); + dst.ndim = 5; + dst[0] = src[0]; + auto oc = cflt.ocpg * cflt.group; + megdnn_assert(oc % 8 == 0); + dst[1] = oc / 8; + dst[2] = infer_conv_shape(src[2], cflt.dilated_spatial[0], + cflt.stride[0], cflt.padding[0]); + dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], + cflt.stride[1], cflt.padding[1]); + dst[4] = 8; + if (cflt.group == 1) { + megdnn_assert(cflt.icpg * cflt.group == src[1] * 8 || + (cflt.icpg * cflt.group == src[1]), + "%s icpg=%u group=%u", errmsg().c_str(), cflt.icpg, + cflt.group); + } + + } else if (param().format == Param::Format::CHWN4) { + megdnn_assert(src.ndim == 5, + "invalid src ndim for CHWN4, expected=5, got=%zu", + src.ndim); + megdnn_assert(cflt.icpg * cflt.group == src[0] * 4, + "%s icpg=%u group=%u", errmsg().c_str(), cflt.icpg, + cflt.group); + dst.ndim = src.ndim; + dst[3] = src[3]; + auto oc = cflt.ocpg * cflt.group; + megdnn_assert(oc % 4 == 0); + dst[0] = oc / 4; + dst[1] = infer_conv_shape(src[1], cflt.dilated_spatial[0], + cflt.stride[0], cflt.padding[0]); + dst[2] = infer_conv_shape(src[2], cflt.dilated_spatial[1], + cflt.stride[1], cflt.padding[1]); + dst[4] = 4; + } else { + megdnn_assert(param().format == Param::Format::NHWCD4); + megdnn_assert(src.ndim == 5, + "invalid src ndim for NHWCD4, expected=5, got=%zu", + src.ndim); + megdnn_assert(cflt.icpg * cflt.group == src[2] * 4, + "%s icpg=%u group=%u", errmsg().c_str(), cflt.icpg, + cflt.group); + dst.ndim = src.ndim; + dst[0] = src[0]; + auto oc = cflt.ocpg * cflt.group; + megdnn_assert(oc % 4 == 0); + dst[2] = oc / 4; + dst[1] = infer_conv_shape(src[1], cflt.dilated_spatial[0], + cflt.stride[0], cflt.padding[0]); + dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], + cflt.stride[1], cflt.padding[1]); + megdnn_assert(src[4] == 4); + dst[4] = 4; + } + dst.format = src.format; + dst.init_contiguous_stride(); + return cflt; +} + +/** + * \warning: An explicit specialization shall be declared in a namespace + * enclosing the specialized template. An explicit specialization whose + * declarator-id is not qualified shall be declared in the nearest enclosing + * namespace of the template, or, if the namespace is inline (7.3.1), any + * namespace from its enclosing namespace set. 
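deduce_layout_fwd computes each output spatial dimension from the canonized filter meta via infer_conv_shape. Assuming the standard floor convention out = (in + 2*pad - dilated_filter) / stride + 1 with dilated_filter = (filter - 1) * dilation + 1, a quick numeric check:

#include <cassert>
#include <cstddef>

// Assumed formula, shown for illustration (standard convolution output size):
//   dilated = (filter - 1) * dilation + 1
//   out     = (in + 2 * pad - dilated) / stride + 1   (integer division)
static size_t conv_out_dim(size_t in, size_t filter, size_t stride,
                           size_t pad, size_t dilation) {
    size_t dilated = (filter - 1) * dilation + 1;
    return (in + 2 * pad - dilated) / stride + 1;
}

int main() {
    // 224x224 input, 3x3 filter, stride 2, pad 1, no dilation -> 112x112.
    assert(conv_out_dim(224, 3, 2, 1, 1) == 112);
    // 7x7 filter, stride 2, pad 3 on 224 -> 112 as well.
    assert(conv_out_dim(224, 7, 2, 3, 1) == 112);
    return 0;
}
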
+ * refer to: + * https://stackoverflow.com/questions/25594644/warning-specialization-of-template-in-different-namespace + */ +template <> +ConvolutionBase::CanonizedFilterMeta +ConvolutionBase::check_layout_fwd( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) const { + TensorLayout dst_expected; + dst_expected.dtype = dst.dtype; + + auto ret = deduce_layout_fwd(src, filter, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + return ret; +} + +template <> +ConvolutionBase::CanonizedFilterMeta +ConvolutionBase::check_layout_fwd( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) const { + TensorLayout dst_expected; + dst_expected.dtype = dst.dtype; + + auto ret = deduce_layout_fwd(src, filter, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + return ret; +} + +template <> +ConvolutionBase::CanonizedFilterMeta +ConvolutionBase::check_layout_fwd( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) const { + TensorLayout dst_expected; + dst_expected.dtype = dst.dtype; + + auto ret = deduce_layout_fwd(src, filter, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + return ret; +} + +void ConvolutionForward::deduce_dtype(DType src, DType filter, DType& dst) { + check_or_deduce_dtype_fwd(src, filter, dst); +} + +void ConvolutionForward::deduce_layout(const TensorLayout& src, + const TensorLayout& filter, + TensorLayout& dst) { + deduce_layout_fwd(src, filter, dst); +} + +ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, size_t workspace_in_bytes) { + auto ret = check_layout_fwd(src, filter, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} + +ConvolutionBackwardData::CanonizedFilterMeta +ConvolutionBackwardData::check_exec(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes) { + auto grad_fwd = grad; + auto filter_fwd = filter; + auto diff_fwd = diff; + + std::swap(grad_fwd.dtype, diff_fwd.dtype); + + grad_fwd.init_contiguous_stride(); + diff_fwd.init_contiguous_stride(); + auto ret = check_layout_fwd(grad_fwd, filter_fwd, diff_fwd); + auto required_workspace_in_bytes = + get_workspace_in_bytes(filter, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} + +void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff, + DType& grad) { + SmallVector supported_dst_dtype; + if (filter.category() == diff.category() && + filter.category() == DTypeCategory::FLOAT) { + supported_dst_dtype.push_back(filter); + } else if (filter.enumv() == DTypeEnum::Int8 && diff == filter) { + supported_dst_dtype.push_back(dtype::Int32()); + } else if ((filter.enumv() == DTypeEnum::QuantizedS8 && + diff.enumv() == DTypeEnum::QuantizedS8) || + (filter.enumv() == DTypeEnum::Quantized8Asymm && + diff.enumv() == DTypeEnum::Quantized8Asymm)) { + supported_dst_dtype.push_back( + dtype::QuantizedS32(mul_scale(filter, diff))); + if (grad.valid() && grad.enumv() == diff.enumv()) { + supported_dst_dtype.push_back(grad); + } + } else { + megdnn_throw(ssprintf("unsupported input / diff DType: %s x %s", + filter.name(), diff.name())); + } + if (!grad.valid()) { + grad = supported_dst_dtype.at(0); + } else { + megdnn_assert(vec_contains(supported_dst_dtype, grad), + 
"unsupported ConvBwd(%s, %s) -> %s", filter.name(), + diff.name(), grad.name()); + } + megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 +#if !MEGDNN_DISABLE_FLOAT16 + || filter.enumv() == DTypeEnum::Float16 +#endif + , + "ComputeMode::FLOAT32 is only available for Float16 " + "input / output."); +} + +void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter, + const TensorLayout& diff, + TensorLayout& grad) { + auto errmsg = [&]() { return get_errmsg(filter, diff, grad, param()); }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(filter); + megdnn_assert_contiguous(diff); + megdnn_assert(filter.ndim == 4_z || filter.ndim == 5_z, "%s", + errmsg().c_str()); + megdnn_assert(diff.ndim == 4_z || diff.ndim == 5_z, "%s", errmsg().c_str()); + + deduce_dtype(filter.dtype, diff.dtype, grad.dtype); + + auto cflt = make_canonized_filter_meta(diff.ndim, filter); + + auto deduce = [&errmsg](size_t out, size_t filter, size_t stride, + size_t pad) { + MEGDNN_MARK_USED_VAR(errmsg); + auto i = (out - 1) * stride + filter; + megdnn_assert(i > pad * 2, "%s", errmsg().c_str()); + return i - pad * 2; + }; + + if (param().format == Param::Format::NCHW || + param().format == Param::Format::NHWC) { + size_t src_or_dst_c_pos = 0; + size_t src_or_dst_spatial_start = 0; + if (param().format == Param::Format::NCHW) { + src_or_dst_c_pos = 1; + src_or_dst_spatial_start = 2; + } else { + megdnn_assert(param().format == Param::Format::NHWC, + "invalid conv format"); + src_or_dst_c_pos = 3; + src_or_dst_spatial_start = 1; + } + megdnn_assert(cflt.ocpg * cflt.group == diff[src_or_dst_c_pos], "%s", + errmsg().c_str()); + grad.ndim = diff.ndim; + grad[0] = diff[0]; + grad[src_or_dst_c_pos] = cflt.icpg * cflt.group; + for (size_t i = 0; i < cflt.spatial_ndim; ++i) { + grad[i + src_or_dst_spatial_start] = deduce( + diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], + cflt.stride[i], cflt.padding[i]); + } + } else { + megdnn_assert(param().format == Param::Format::NHWCD4); + megdnn_assert(diff.ndim == 5, + "valid diff ndim for NHWCD4, expected=5, got=%zu", + diff.ndim); + megdnn_assert(cflt.ocpg * cflt.group == diff[2] * 4, "%s", + errmsg().c_str()); + grad.ndim = diff.ndim; + grad[0] = diff[0]; + auto ic = cflt.icpg * cflt.group; + megdnn_assert(ic % 4 == 0); + grad[2] = ic / 4; + grad[1] = deduce(diff[1], cflt.dilated_spatial[0], cflt.stride[0], + cflt.padding[0]); + grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1], + cflt.padding[1]); + megdnn_assert(diff[4] == 4); + grad[4] = 4; + } + grad.format = diff.format; + grad.init_contiguous_stride(); +} + +ConvolutionBackwardFilter::CanonizedFilterMeta +ConvolutionBackwardFilter::check_exec(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes) { + megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT && + diff.dtype.category() == DTypeCategory::FLOAT && + grad.dtype.category() == DTypeCategory::FLOAT, + "only float type is supported for conv backward filter"); + auto ret = check_layout_fwd(src, grad, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/convolution3d.cpp b/dnn/src/common/convolution3d.cpp new file mode 100644 index 00000000..09850557 --- /dev/null +++ b/dnn/src/common/convolution3d.cpp @@ -0,0 +1,252 @@ +/** + * \file 
dnn/src/common/convolution3d.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/nn.h" +#include "src/common/utils.h" + +using namespace megdnn; + +namespace { +std::string get_errmsg(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, + const Convolution3D::Param& param) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(filter); + MEGDNN_MARK_USED_VAR(dst); + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(filter) + ", " + + megdnn_layout_msg(dst) + ", " + megdnn_mangle("is_ncdhw=") + + std::to_string(param.format == param::Convolution3D::Format::NCDHW) + + ", " + +megdnn_mangle("is_xcorr=") + + std::to_string( + (param.mode == Convolution3D::Mode::CROSS_CORRELATION)) + + ", " + megdnn_mangle("pad_d=") + std::to_string(param.pad_d) + ", " + + megdnn_mangle("pad_h=") + std::to_string(param.pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param.pad_w) + ", " + + megdnn_mangle("stride_d=") + std::to_string(param.stride_d) + ", " + + megdnn_mangle("stride_h=") + std::to_string(param.stride_h) + ", " + + megdnn_mangle("stride_w=") + std::to_string(param.stride_w) + ", " + + megdnn_mangle("dilate_d=") + std::to_string(param.dilate_d) + ", " + + megdnn_mangle("dilate_h=") + std::to_string(param.dilate_h) + ", " + + megdnn_mangle("dilate_w=") + std::to_string(param.dilate_w); +} +} // namespace + +Convolution3DBase::CanonizedFilterMeta +Convolution3DBase::make_canonized_filter_meta( + size_t src_ndim, const TensorLayout& filter) const { + megdnn_assert_contiguous(filter); + auto img_ndim = src_ndim - 2; + CanonizedFilterMeta ret; + ret.dtype_enum = filter.dtype.enumv(); + ret.format = param().format; + if (param().mode == Mode::CONVOLUTION) { + ret.should_flip = true; + } else { + megdnn_assert(param().mode == Mode::CROSS_CORRELATION, + "invalid conv mode"); + ret.should_flip = false; + } + size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; + MEGDNN_MARK_USED_VAR(flt_spatial_start); + MEGDNN_MARK_USED_VAR(ocpg_pos); + MEGDNN_MARK_USED_VAR(icpg_pos); + + if (param().sparse == Param::Sparse::DENSE) { + megdnn_assert(filter.ndim == img_ndim + 2, + "bad filter ndim for dense convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + ret.group = 1; + flt_start = 0; + } else { + megdnn_assert(param().sparse == Param::Sparse::GROUP, + "invalid convolution sparse type"); + megdnn_assert(filter.ndim == img_ndim + 3, + "bad filter ndim for group convolution: " + "spatial_ndim=%zu filter_ndim=%zu", + img_ndim, filter.ndim); + ret.group = filter[0]; + flt_start = 1; + } + + if (param().format == Param::Format::NCDHW) { + // filter should be (oc, ic, fd, fh, fw) + flt_spatial_start = 2; + ocpg_pos = 0; + icpg_pos = 1; + } else { + megdnn_assert(param().format == Param::Format::NDHWC, + "invalid conv tensor format"); + // filter should be (oc, fd, fh, fw, ic) + flt_spatial_start = 1; + ocpg_pos = 0; + icpg_pos = 4; + } + ret.spatial_ndim = src_ndim - 2; + megdnn_assert( + ret.spatial_ndim == 3, + "only 3D convolution is supported, and input should be 5-dim; " + "got input dim = %zu", + src_ndim); + ret.stride[0] = this->param().stride_d; + ret.stride[1] = this->param().stride_h; + 
ret.stride[2] = this->param().stride_w; + ret.padding[0] = this->param().pad_d; + ret.padding[1] = this->param().pad_h; + ret.padding[2] = this->param().pad_w; + ret.dilation[0] = param().dilate_d; + ret.dilation[1] = param().dilate_h; + ret.dilation[2] = param().dilate_w; + ret.ocpg = filter[flt_start + ocpg_pos]; + ret.icpg = filter[flt_start + icpg_pos]; + for (size_t i = 0; i < ret.spatial_ndim; ++i) { + megdnn_assert(ret.dilation[i] > 0, + "invalid dilation on spatial dim %zu: %u", i, + ret.dilation[i]); + ret.spatial[i] = filter[i + flt_start + flt_spatial_start]; + ret.dilated_spatial[i] = (ret.spatial[i] - 1) * ret.dilation[i] + 1; + } + return ret; +} + +Convolution3DBase::CanonizedFilterMeta Convolution3DBase::deduce_layout_fwd( + const TensorLayout& src, const TensorLayout& filter, + TensorLayout& dst) const { + auto errmsg = [&]() { return get_errmsg(src, filter, dst, param()); }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter); + megdnn_assert(src.ndim >= 5_z, "%s", errmsg().c_str()); + megdnn_assert(src.dtype == filter.dtype, "%s", errmsg().c_str()); + if (param().data_type == Param::DataType::FLOAT) { + megdnn_assert(src.dtype == dtype::Float32() MEGDNN_INC_FLOAT16( + || src.dtype == dtype::Float16()), + "invalid src dtype for conv: %s", src.dtype.name()); + dst.dtype = src.dtype; + } else { + megdnn_assert(param().data_type == Param::DataType::FLOAT_IO16xC32); + MEGDNN_INC_FLOAT16(megdnn_assert(src.dtype == dtype::Float16(), + "invalid src dtype for conv: %s", src.dtype.name())); + MEGDNN_INC_FLOAT16(dst.dtype = dtype::Float16()); + } + auto img_dim = src.ndim - 2; + megdnn_assert(img_dim == 3, "this is the convolution for 3D image"); + megdnn_assert(filter.ndim == img_dim + 2 || filter.ndim == img_dim + 3, + "%s", errmsg().c_str()); + auto cflt = make_canonized_filter_meta(src.ndim, filter); + size_t src_or_dst_c_pos = 0; + size_t src_or_dst_spatial_start = 0; + if (param().format == Param::Format::NCDHW) { + src_or_dst_c_pos = 1; + src_or_dst_spatial_start = 2; + } else { + megdnn_assert(param().format == Param::Format::NDHWC, + "invalid conv format"); + src_or_dst_c_pos = 4; + src_or_dst_spatial_start = 1; + } + megdnn_assert(cflt.icpg * cflt.group == src[src_or_dst_c_pos], "%s", + errmsg().c_str()); + dst.ndim = src.ndim; + dst[0] = src[0]; + dst[src_or_dst_c_pos] = cflt.ocpg * cflt.group; + for (size_t i = 0; i < cflt.spatial_ndim; ++i) { + dst[i + src_or_dst_spatial_start] = infer_conv_shape( + src[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], + cflt.stride[i], cflt.padding[i]); + } + dst.init_contiguous_stride(); + return cflt; +} + +Convolution3DBase::CanonizedFilterMeta Convolution3DBase::check_layout_fwd( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) const { + TensorLayout dst_expected; + auto ret = deduce_layout_fwd(src, filter, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + return ret; +} + +void Convolution3DForward::deduce_layout(const TensorLayout& src, + const TensorLayout& filter, + TensorLayout& dst) { + deduce_layout_fwd(src, filter, dst); +} + +Convolution3DBase::CanonizedFilterMeta Convolution3DForward::check_exec( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, size_t workspace_in_bytes) { + auto ret = check_layout_fwd(src, filter, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} + 
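For Convolution3D the canonized filter meta is read directly from the filter shape: dense NCDHW filters are (oc, ic, fd, fh, fw) and group filters are (group, ocpg, icpg, fd, fh, fw). A small sketch of that bookkeeping with a plain struct, for illustration only:

#include <cassert>
#include <cstddef>
#include <vector>

struct Meta3D {
    size_t group, ocpg, icpg, spatial[3];
};

// filter shape: (oc, ic, fd, fh, fw) for dense, (g, ocpg, icpg, fd, fh, fw) for group.
static Meta3D canonize_ncdhw(const std::vector<size_t>& filter, bool is_group) {
    Meta3D m{};
    const size_t flt_start = is_group ? 1 : 0;
    m.group = is_group ? filter[0] : 1;
    m.ocpg = filter[flt_start + 0];
    m.icpg = filter[flt_start + 1];
    for (size_t i = 0; i < 3; ++i)
        m.spatial[i] = filter[flt_start + 2 + i];
    return m;
}

int main() {
    // Group convolution: 2 groups, 8 output / 4 input channels per group, 3x3x3 kernel.
    Meta3D m = canonize_ncdhw({2, 8, 4, 3, 3, 3}, /*is_group=*/true);
    assert(m.group == 2 && m.ocpg == 8 && m.icpg == 4 && m.spatial[0] == 3);
    return 0;
}
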
+Convolution3DBase::CanonizedFilterMeta Convolution3DBackwardData::check_exec( + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_in_bytes) { + megdnn_assert(param().data_type == Param::DataType::FLOAT, + "only float type is supported for conv backward"); + auto ret = check_layout_fwd(grad, filter, diff); + auto required_workspace_in_bytes = + get_workspace_in_bytes(filter, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} + +void Convolution3DBackwardData::deduce_layout(const TensorLayout& filter, + const TensorLayout& diff, + TensorLayout& grad) { + megdnn_assert(param().data_type == Param::DataType::FLOAT, + "only float type is supported for conv backward"); + auto errmsg = [&]() { return get_errmsg(filter, diff, grad, param()); }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(filter); + megdnn_assert_contiguous(diff); + megdnn_assert(filter.ndim == 5_z || filter.ndim == 6_z, "%s", + errmsg().c_str()); + megdnn_assert(diff.ndim == 5_z, "%s", errmsg().c_str()); + megdnn_assert(filter.dtype == diff.dtype, "%s", errmsg().c_str()); + + auto cflt = make_canonized_filter_meta(diff.ndim, filter); + megdnn_assert(cflt.ocpg * cflt.group == diff[1], "%s", errmsg().c_str()); + + auto deduce = [&errmsg](size_t out, size_t filter, size_t stride, + size_t pad) { + MEGDNN_MARK_USED_VAR(errmsg); + auto i = (out - 1) * stride + filter; + megdnn_assert(i > pad * 2, "%s", errmsg().c_str()); + return i - pad * 2; + }; + + grad.ndim = diff.ndim; + grad[0] = diff[0]; + grad[1] = cflt.group * cflt.icpg; + grad.dtype = diff.dtype; + for (size_t i = 0; i < cflt.spatial_ndim; ++i) { + grad[i + 2] = deduce(diff[i + 2], cflt.dilated_spatial[i], + cflt.stride[i], cflt.padding[i]); + } + grad.init_contiguous_stride(); +} + +Convolution3DBase::CanonizedFilterMeta Convolution3DBackwardFilter::check_exec( + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_in_bytes) { + megdnn_assert(param().data_type == Param::DataType::FLOAT, + "only float type is supported for conv backward"); + auto ret = check_layout_fwd(src, grad, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cumsum.cpp b/dnn/src/common/cumsum.cpp new file mode 100644 index 00000000..5c87e811 --- /dev/null +++ b/dnn/src/common/cumsum.cpp @@ -0,0 +1,37 @@ +/** + * \file dnn/src/common/cumsum.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void CumsumForward::deduce_layout(const TensorLayout &src, TensorLayout &dst) +{ + megdnn_assert_contiguous(src); + dst = src; +} + +void CumsumForward::check_exec(const TensorLayout &src, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + megdnn_assert_contiguous(src); + megdnn_assert_eq_layout(src, dst); + megdnn_assert(param().axis >= 0); + megdnn_assert(static_cast(param().axis) < src.ndim); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/aligned_allocator.h b/dnn/src/common/cv/aligned_allocator.h new file mode 100644 index 00000000..cb09411a --- /dev/null +++ b/dnn/src/common/cv/aligned_allocator.h @@ -0,0 +1,131 @@ +/** + * \file dnn/src/common/cv/aligned_allocator.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include +#include +#include + +#include "megdnn/arch.h" + +#ifdef _MSC_VER +#include "malloc.h" +#endif + +#if defined(__ANDROID__) || defined(ANDROID) +#include "malloc.h" +#define HAS_MEMALIGN +#elif !defined(_MSC_VER) +#define HAS_POSIX_MEMALIGN +#endif + +namespace ah { +/** + * @tparam _Tp Type of allocated object. + * @tparam _align Alignment, in bytes. + */ +template +class aligned_allocator : public std::allocator<_Tp> { +public: + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; + typedef _Tp value_type; + + template + struct rebind { + typedef aligned_allocator<_Tp1, _align> other; + }; + + typedef std::true_type propagate_on_container_move_assignment; + + aligned_allocator() MEGDNN_NOEXCEPT {} + + template + aligned_allocator(const aligned_allocator<_Tp1, _align>&) MEGDNN_NOEXCEPT {} + + ~aligned_allocator() MEGDNN_NOEXCEPT {} + + // NB: __n is permitted to be 0. The C++ standard says nothing + // about what the return value is when __n == 0. + pointer allocate(size_type __n, const void* = 0) { + if (__n > this->max_size()) + megdnn_trap(); + +#ifdef HAS_POSIX_MEMALIGN + _Tp* result; + if (posix_memalign(&(void*&)result, _align, __n * sizeof(_Tp)) != 0) { + if (_Nothrow) { + return nullptr; + } else { + megdnn_trap(); + } + } + return result; +#elif defined(HAS_MEMALIGN) + return (_Tp*)memalign(_align, __n * sizeof(_Tp)); +#elif defined(_MSC_VER) + return (_Tp*)_aligned_malloc(__n * sizeof(_Tp), _align); +#else +#warning \ + "aligned allocator fallbacks to normal malloc; allocated address may be unaligned" + return (_Tp*)malloc(__n * sizeof(_Tp)); +#endif + } + + // __p is not permitted to be a null pointer. 
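+    // Releases memory obtained from allocate(); the deallocation routine must
+    // match the allocation one: _aligned_free() on MSVC (paired with
+    // _aligned_malloc()), plain free() elsewhere (posix_memalign / memalign /
+    // malloc are all freed with free()).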
+ void deallocate(pointer __p, size_type) { +#ifdef _MSC_VER + _aligned_free((void*)__p); +#else + free((void*)__p); +#endif + } +}; + +template +inline bool operator==(const aligned_allocator<_T1, _A1>&, + const aligned_allocator<_T2, _A2>&) { + return true; +} + +template +inline bool operator!=(const aligned_allocator<_T1, _A1>&, + const aligned_allocator<_T2, _A2>&) { + return false; +} + +/// allocator specialization. +template +class aligned_allocator { +public: + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef void* pointer; + typedef const void* const_pointer; + typedef void value_type; + + template + struct rebind { + typedef aligned_allocator<_Tp1, _align> other; + }; + + typedef std::true_type propagate_on_container_move_assignment; +}; + +} // namespace ah + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/bordermode-inl.h b/dnn/src/common/cv/bordermode-inl.h new file mode 100644 index 00000000..c63e7771 --- /dev/null +++ b/dnn/src/common/cv/bordermode-inl.h @@ -0,0 +1,93 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ * + * --------------------------------------------------------------------------- + * \file dnn/src/common/cv/bordermode-inl.h + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * --------------------------------------------------------------------------- + */ + +static inline int border_interpolate(int p, int len, BorderMode bmode) { + if ((unsigned)p < (unsigned)len) + ; + else if (bmode == BorderMode::BORDER_REPLICATE) + p = p < 0 ? 0 : len - 1; + else if (bmode == BorderMode::BORDER_REFLECT || + bmode == BorderMode::BORDER_REFLECT_101) { + int delta = (bmode == BorderMode::BORDER_REFLECT_101); + if (len == 1) + return 0; + do { + if (p < 0) + p = -p - 1 + delta; + else + p = len - 1 - (p - len) - delta; + } while ((unsigned)p >= (unsigned)len); + } else if (bmode == BorderMode::BORDER_WRAP) { + megdnn_assert(len > 0); + if (p < 0) + p -= ((p - len + 1) / len) * len; + while (p >= len) { + p -= len; + } + } else if (bmode == BorderMode::BORDER_CONSTANT || + bmode == BorderMode::BORDER_TRANSPARENT) + p = -1; + else + MegCVException("Unknown/unsupported border type"); + return p; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/common.h b/dnn/src/common/cv/common.h new file mode 100644 index 00000000..8deb702c --- /dev/null +++ b/dnn/src/common/cv/common.h @@ -0,0 +1,218 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + * + * --------------------------------------------------------------------------- + * \file dnn/src/common/cv/common.h + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * --------------------------------------------------------------------------- + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "megdnn/basic_types.h" + +// for x86, armv7, armv8, naive +#define MEGCV_ENABLE_UNROLLED 1 + +namespace megdnn { +namespace megcv { + +class Size { +public: + Size(size_t rows, size_t cols) : m_rows(rows), m_cols(cols) {} + Size() : m_rows(0), m_cols(0) {} + + size_t rows() const { return m_rows; } + size_t& rows() { return m_rows; } + size_t cols() const { return m_cols; } + size_t& cols() { return m_cols; } + size_t height() const { return rows(); } + size_t& height() { return rows(); } + size_t width() const { return cols(); } + size_t& width() { return cols(); } + + bool operator==(const Size& rhs) const { + return rows() == rhs.rows() && cols() == rhs.cols(); + } + +private: + size_t m_rows, m_cols; +}; + +class MatShape : public Size { +public: + MatShape(size_t rows, size_t cols, size_t channels) + : Size(rows, cols), m_channels(channels) {} + + size_t channels() const { return m_channels; } + + bool operator==(const MatShape& rhs) const { + return Size::operator==(rhs) && channels() == rhs.channels(); + } + +private: + size_t m_channels; +}; + +/*! 
+ * A row-major device matrix wrapper + */ +template +class Mat { +private: + size_t m_rows, m_cols; + size_t m_channels; + size_t m_step; + + std::shared_ptr m_data; + + size_t m_offset; + +public: + void* raw_ptr() { return static_cast(m_data.get() + m_offset); } + const void* raw_ptr() const { + return static_cast(m_data.get() + m_offset); + } + + Mat(); + Mat(size_t rows, size_t cols, size_t channels, size_t step); + Mat(size_t rows, size_t cols, size_t channels); + // do not try to manage data by shared_ptr + Mat(size_t rows, size_t cols, size_t channels, T* data); + Mat(size_t rows, size_t cols, size_t channels, size_t step, T* data); + // shallow-copy constructor + Mat(const Mat& rhs); + Mat(const Mat& rhs, size_t row_offset, size_t row_count, + size_t col_offset, size_t col_count); + Mat& operator=(const Mat& rhs); + + T& at(size_t r, size_t c, size_t ch); + const T& at(size_t r, size_t c, size_t ch) const; + + Mat clone() const; + + // read data from src + void read(const T* src); + // write data to dst + void write(T* dst) const; + + const T* ptr(size_t r = 0) const { + return static_cast(raw_ptr()) + r * m_step; + } + T* ptr(size_t r = 0) { return static_cast(raw_ptr()) + r * m_step; } + size_t height() const { return rows(); } + size_t width() const { return cols(); } + size_t rows() const { return m_rows; } + size_t cols() const { return m_cols; } + size_t channels() const { return m_channels; } + size_t step() const { return m_step; } + size_t total_nr_elem() const { return rows() * cols() * channels(); } + size_t total_span_elem() const { return rows() * step(); } + bool equals(const Mat& rhs) const; + bool is_continuous() const; + + Size size() const { return {rows(), cols()}; } + MatShape shape() const { return {rows(), cols(), channels()}; } +}; + +class Rect { +public: + size_t y, x, height, width; + Rect(size_t _y, size_t _x, size_t _height, size_t _width) + : y(_y), x(_x), height(_height), width(_width) {} + Rect() : y(0), x(0), height(0), width(0) {} +}; + +template +struct Point { + scalar_t x, y; + + Point() {} + Point(scalar_t x, scalar_t y) : x(x), y(y) {} + + Point operator+(const Point& rhs) const { return {x + rhs.x, y + rhs.y}; } + Point operator-(const Point& rhs) const { return {x - rhs.x, y - rhs.y}; } + Point operator*(scalar_t f) const { return {x * f, y * f}; } + Point operator/(scalar_t f) const { return {x / f, y / f}; } +}; + +template +Mat TensorND2Mat(const TensorND& tensor, size_t batch); + +// type aliases +using uchar = unsigned char; +using ushort = unsigned short; +using Mat8u = Mat; +using Mat32f = Mat; +using Mat64f = Mat; + +extern template class Mat; +extern template class Mat; +extern template class Mat; +extern template class Mat; +extern template class Mat; +extern template class Mat; + +} // namespace megcv +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/cvt_color.h b/dnn/src/common/cv/cvt_color.h new file mode 100644 index 00000000..06bd04cd --- /dev/null +++ b/dnn/src/common/cv/cvt_color.h @@ -0,0 +1,70 @@ +/** + * \file dnn/src/common/cv/cvt_color.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#define GENERATE_CVT_OPR_DECL(_opr) \ + template \ + void _opr(const megcv::Mat& src, megcv::Mat& dst) + +#define GENERATE_CVT_OPR_DECL_FOREACH(_cb) \ + _cb(cvt_rgb2gray); \ + _cb(cvt_rgb2yuv); \ + _cb(cvt_yuv2rgb); \ + _cb(cvt_gray2rgb); \ + _cb(cvt_rgba2rgb); \ + _cb(cvt_rgba2bgr); \ + _cb(cvt_rgba2gray); \ + _cb(cvt_rgb2bgr); \ + _cb(cvt_bgr2gray); \ + _cb(cvt_bgr2rgb); \ + _cb(cvt_yuv2gray_nv21); \ + _cb(cvt_yuv2rgb_nv21); \ + _cb(cvt_yuv2bgr_nv21); \ + _cb(cvt_yuv2gray_nv12); \ + _cb(cvt_yuv2rgb_nv12); \ + _cb(cvt_yuv2bgr_nv12); \ + _cb(cvt_yuv2gray_yv12); \ + _cb(cvt_yuv2rgb_yv12); \ + _cb(cvt_yuv2bgr_yv12); \ + _cb(cvt_yuv2gray_yu12); \ + _cb(cvt_yuv2rgb_yu12); \ + _cb(cvt_yuv2bgr_yu12); + +#define descale(x, n) (((x) + (1 << ((n)-1))) >> (n)) + +#define GENERATE_UNSUPPORT_CVT_OPR_FOR_FLOAT(_cb) \ + _cb(cvt_rgba2rgb, float) \ + _cb(cvt_rgba2bgr, float) \ + _cb(cvt_rgba2gray, float) \ + _cb(cvt_rgb2bgr, float) \ + _cb(cvt_bgr2gray, float) \ + _cb(cvt_bgr2rgb, float) \ + _cb(cvt_yuv2gray_nv21, float) \ + _cb(cvt_yuv2rgb_nv21, float) \ + _cb(cvt_yuv2bgr_nv21, float) \ + _cb(cvt_yuv2gray_nv12, float) \ + _cb(cvt_yuv2rgb_nv12, float) \ + _cb(cvt_yuv2bgr_nv12, float) \ + _cb(cvt_yuv2gray_yv12, float) \ + _cb(cvt_yuv2rgb_yv12, float) \ + _cb(cvt_yuv2bgr_yv12, float) \ + _cb(cvt_yuv2gray_yu12, float) \ + _cb(cvt_yuv2rgb_yu12, float) \ + _cb(cvt_yuv2bgr_yu12, float) + +#define GENERATE_UNSUPPORT_CVT_OPR(_opr, _type) \ + template <> \ + void _opr<_type>(const megcv::Mat<_type>&, megcv::Mat<_type>&) { \ + MegCVException("There is not a cvt_opr " #_opr \ + " to deal with " #_type); \ + } + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/enums.h b/dnn/src/common/cv/enums.h new file mode 100644 index 00000000..498385b3 --- /dev/null +++ b/dnn/src/common/cv/enums.h @@ -0,0 +1,30 @@ +/** + * \file dnn/src/common/cv/enums.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +enum BorderMode { + BORDER_REPLICATE = 0, + BORDER_REFLECT = 1, + BORDER_REFLECT_101 = 2, + BORDER_WRAP = 3, + BORDER_CONSTANT = 4, + BORDER_TRANSPARENT = 5, + BORDER_ISOLATED = 6 +}; +enum InterpolationMode { + INTER_NEAREST = 0, + INTER_LINEAR = 1, + INTER_AREA = 2, + INTER_CUBIC = 3, + INTER_LANCZOS4 = 4 +}; + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/filter.cpp b/dnn/src/common/cv/filter.cpp new file mode 100644 index 00000000..7e2e6297 --- /dev/null +++ b/dnn/src/common/cv/filter.cpp @@ -0,0 +1,305 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. 
+ * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + * + * --------------------------------------------------------------------------- + * \file dnn/src/common/cv/filter.cpp + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * --------------------------------------------------------------------------- + */ + +#include "./filter.h" + +namespace megdnn { +namespace megcv { +namespace filter_common { + +#define VEC_ALIGN 16 + +template +FilterEngine::FilterEngine(BaseRowFilter* row_filter, + BaseColumnFilter* column_filter, size_t ch, + const ST* border_value, BorderMode bmode) + : m_row_filter(row_filter), + m_column_filter(column_filter), + m_ch(ch), + m_bmode(bmode) { + megdnn_assert(m_row_filter && m_column_filter); + megdnn_assert(m_bmode != BorderMode::BORDER_WRAP); + + m_ksize.cols() = m_row_filter->ksize; + m_ksize.rows() = m_column_filter->ksize; + m_anchor.x = m_row_filter->anchor; + m_anchor.y = m_column_filter->anchor; + m_buf_step = 0; + + //! the anchor must be in the kernerl + megdnn_assert(0 <= m_anchor.x && m_anchor.x < m_ksize.cols() && + 0 <= m_anchor.y && m_anchor.y < m_ksize.rows()); + + int src_elem_size = (int)sizeof(ST) * m_ch; + m_border_elem_size = src_elem_size / ((sizeof(ST) >= 4) ? sizeof(int) : 1); + int border_length = std::max((int)(m_ksize.cols() - 1), (int)1); + m_border_table.resize(border_length * m_border_elem_size); + + if (m_bmode == BorderMode::BORDER_CONSTANT) { + //! 
store the border_value array to m_const_border_value, the type + //! of buffer and image may be different, So use byte to store + m_const_border_value.resize(m_ch * sizeof(ST) * border_length); + for (int i = 0; i < src_elem_size * border_length; i += src_elem_size) + for (int j = 0; j < src_elem_size; j++) + m_const_border_value[i + j] = ((uchar*)(border_value))[j]; + } + m_whole_size = Size(-1, -1); +} + +template +FilterEngine::~FilterEngine() { + if (m_row_filter != NULL) + delete m_row_filter; + if (m_column_filter != NULL) + delete m_column_filter; +} + +template +void FilterEngine::start(const Mat& src) { + m_whole_size.cols() = src.cols(); + m_whole_size.rows() = src.rows(); + + int element_size = (int)sizeof(ST) * m_ch; + int buf_elem_size = (int)sizeof(FT) * m_ch; + + int cn = m_ch; + m_src_row.resize(element_size * (m_whole_size.width() + m_ksize.width() - 1)); + if (m_bmode == BorderMode::BORDER_CONSTANT) { + m_const_border_row.resize( + buf_elem_size * + (m_whole_size.width() + m_ksize.width() - 1 + VEC_ALIGN)); + uchar *dst = align_ptr(&m_const_border_row[0], VEC_ALIGN), *tdst; + int n = (int)m_const_border_value.size(), N; + N = (m_whole_size.width() + m_ksize.width() - 1) * element_size; + tdst = &m_src_row[0]; + + for (int i = 0; i < N; i += n) { + n = std::min((int)n, (int)(N - i)); + for (int j = 0; j < n; j++) + tdst[i + j] = m_const_border_row[j]; + } + + (*m_row_filter)(&m_src_row[0], dst, m_whole_size.width(), cn); + } + + + m_buf_step = buf_elem_size * + (int)align_size(m_whole_size.width() + m_ksize.width() - 1, + VEC_ALIGN); + m_ring_buf.resize(m_buf_step * m_ksize.height() + VEC_ALIGN); + m_left_width = m_anchor.x; + m_right_width = m_ksize.width() - m_anchor.x - 1; + + //! init the row with border values + if (m_left_width > 0 || m_right_width > 0) { + //! calc the index of the border value, we will not calc it when process + //! border each time + if (m_bmode == BorderMode::BORDER_CONSTANT) { + memcpy(m_src_row.data(), m_const_border_row.data(), + m_left_width * element_size); + memcpy(m_src_row.data() + + (m_whole_size.width() + m_left_width) * element_size, + m_const_border_row.data(), m_right_width * element_size); + } else { + //! calc the index of the border value, we will not calc it when + //! process border each time + for (int i = 0; i < m_left_width; i++) { + int p0 = gaussian_blur::border_interpolate(i - m_left_width, + m_whole_size.width(), m_bmode) * + m_border_elem_size; + for (int j = 0; j < m_border_elem_size; j++) + m_border_table[i * m_border_elem_size + j] = p0 + j; + } + + for (int i = 0; i < m_right_width; i++) { + int p0 = gaussian_blur::border_interpolate(m_whole_size.width() + i, + m_whole_size.width(), m_bmode) * + m_border_elem_size; + for (int j = 0; j < m_border_elem_size; j++) + m_border_table[(i + m_left_width) * m_border_elem_size + + j] = p0 + j; + } + } + } + + if (m_column_filter) + m_column_filter->reset(); +} + +template +int FilterEngine::proceed(const uchar* src, int srcstep, int count, + uchar* dst, int dststep) { + const int* btab = &m_border_table[0]; + int src_elem_size = static_cast(sizeof(ST) * m_ch); + bool makeBorder = (m_left_width > 0 || m_right_width > 0) && + m_bmode != BorderMode::BORDER_CONSTANT; + int dy = 0, i = 0; + + int row_count = 0; + int start_y = 0; + std::vector buf_rows(m_ksize.rows(), nullptr); + for (;; dst += dststep * i, dy += i) { + int dcount = m_ksize.height() - m_anchor.y - start_y - row_count; + dcount = dcount > 0 ? 
dcount : 1; + dcount = std::min(dcount, count); + count -= dcount; + for (; dcount-- > 0; src += srcstep) { + int bi = (start_y + row_count) % m_ksize.height(); + uchar* brow = + align_ptr(&m_ring_buf[0], VEC_ALIGN) + bi * m_buf_step; + uchar* row = &m_src_row[0]; + + if (++row_count > static_cast(m_ksize.height())) { + --row_count; + ++start_y; + } + + memcpy(row + m_left_width * src_elem_size, src, + m_whole_size.width() * src_elem_size); + + if (makeBorder) { + if (m_border_elem_size * static_cast(sizeof(int)) == + src_elem_size) { + const int* isrc = reinterpret_cast(src); + int* irow = reinterpret_cast(row); + + for (int i = 0; i < m_left_width * m_border_elem_size; i++) + irow[i] = isrc[btab[i]]; + for (int i = 0; i < m_right_width * m_border_elem_size; + i++) { + irow[i + (m_whole_size.width() + m_left_width) * + m_border_elem_size] = + isrc[btab[i + + m_left_width * m_border_elem_size]]; + } + } else { + for (int i = 0; i < m_left_width * src_elem_size; i++) + row[i] = src[btab[i]]; + for (int i = 0; i < m_right_width * src_elem_size; i++) + row[i + (m_whole_size.width() + m_left_width) * + src_elem_size] = + src[btab[i + m_left_width * src_elem_size]]; + } + } + + (*m_row_filter)(row, brow, m_whole_size.width(), m_ch); + } + + int max_i = std::min( + m_ksize.height(), + m_whole_size.height() - dy + (m_ksize.height() - 1)); + for (i = 0; i < max_i; i++) { + int src_y = gaussian_blur::border_interpolate(dy + i - m_anchor.y, + m_whole_size.rows(), m_bmode); + if (src_y < 0) + buf_rows[i] = align_ptr(&m_const_border_row[0], VEC_ALIGN); + else { + megdnn_assert(src_y >= start_y); + if (src_y >= start_y + row_count) { + break; + } + int bi = src_y % m_ksize.height(); + buf_rows[i] = + align_ptr(&m_ring_buf[0], VEC_ALIGN) + bi * m_buf_step; + } + } + if (i < static_cast(m_ksize.height())) { + break; + } + i -= m_ksize.height() - 1; + (*m_column_filter)(const_cast(&buf_rows[0]), dst, + dststep, i, m_whole_size.width() * m_ch); + } + + return dy; +} + +template +void FilterEngine::apply(const Mat& src, Mat& dst) { + int src_step = src.step() * sizeof(ST); + int dst_step = dst.step() * sizeof(ST); + start(src); + proceed(reinterpret_cast(src.ptr()), + static_cast(src_step), m_whole_size.height(), + reinterpret_cast(dst.ptr()), static_cast(dst_step)); +} + +//! explicit instantiation template +template FilterEngine::FilterEngine( + BaseRowFilter* _rowFilter, BaseColumnFilter* _columnFilter, size_t _CH, + const uchar* _borderValue, BorderMode _BorderType); +template FilterEngine::FilterEngine( + BaseRowFilter* _rowFilter, BaseColumnFilter* _columnFilter, size_t _CH, + const float* _borderValue, BorderMode _BorderType); + +template void FilterEngine::apply(const Mat& src, + Mat& dst); +template void FilterEngine::apply(const Mat& src, + Mat& dst); + +template FilterEngine::~FilterEngine(); +template FilterEngine::~FilterEngine(); + +} // namespace filter_common +} // namespace megcv +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/filter.h b/dnn/src/common/cv/filter.h new file mode 100644 index 00000000..e71f7e54 --- /dev/null +++ b/dnn/src/common/cv/filter.h @@ -0,0 +1,552 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. 
+ * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + * + * --------------------------------------------------------------------------- + * \file dnn/src/common/cv/filter.h + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * --------------------------------------------------------------------------- + */ +#pragma once + +#include "src/common/cv/common.h" +#include "src/common/cv/helper.h" +#include "src/common/utils.h" + +#include + +namespace megdnn { +namespace megcv { +namespace filter_common { + +using BorderMode = param::WarpPerspective::BorderMode; + +/* ============================ vecOp ============================== */ + +/*! + * \struct RowNoVec + * \brief Filter a row using the kernel. + */ +struct RowNoVec { + RowNoVec() {} + /*! + * \param kernel The filter kernel + * \param ksize The size of the kernel + */ + RowNoVec(const uchar* /*kernel*/, int /*ksize*/) {} + + /*! 
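+     * \brief Scalar fallback: always returns 0, meaning no pixels were
+     * handled by vectorized code, so the caller's plain loop must process
+     * the whole row.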
+ * \param src The src data + * \param dst The dst data + * \param width The width of the src + * \param cn The channel size + */ + int operator()(const uchar* /*src*/, uchar* /*dst*/, int /*width*/, + int /*cn*/) const { + return 0; + } +}; + +/*! + * \struct ColumnNoVec + * \brief Filter a column using the kernel. + */ +struct ColumnNoVec { + ColumnNoVec() {} + /*! + * \param kernel The filter kernel + * \param ksize The size of the kernel + * \param bits The bits shift, Used only if the type is \c uint8_t + */ + ColumnNoVec(const uchar* /*kernel*/, int /*ksize*/, int /*bits*/) {} + + /*! + * \param src The src data + * \param dst The dst data + * \param count The count of rows that this column kernel processed. + * \param width The width of the src + */ + int operator()(const uchar** /*src*/, uchar* /*dst*/, int& /*count*/, + int /*width*/) const { + return 0; + } +}; + +/*! + * \struct SymmRowSmallFilter + * \brief Filter a row using the kernel, used if the kernel is symmetry. + */ +struct SymmRowSmallNoVec { + SymmRowSmallNoVec() {} + SymmRowSmallNoVec(const uchar*, int) {} + int operator()(const uchar*, uchar*, int, int) const { return 0; } +}; + +struct SymmColumnSmallNoVec { + SymmColumnSmallNoVec() {} + SymmColumnSmallNoVec(const uchar*, int, int) {} + int operator()(const uchar**, uchar*, int&, int) const { return 0; } +}; + +/* ============================ Filters ============================== */ + +class BaseRowFilter { +public: + BaseRowFilter() { ksize = anchor = -1; } + virtual ~BaseRowFilter() {} + + //! the filtering operator. Must be overridden in the derived classes. The + //! horizontal border interpolation is done outside of the class. + virtual void operator()(const uchar* src, uchar* dst, int width, + int cn) = 0; + + //! The size of the kernel + int ksize; + //! The center of the filter, e.g. gaussian blur, anchor is ksize / 2 + int anchor; +}; + +class BaseColumnFilter { +public: + BaseColumnFilter() { ksize = anchor = -1; } + virtual ~BaseColumnFilter() {} + + //! the filtering operator. Must be overridden in the derived classes. The + //! vertical border interpolation is done outside of the class. + virtual void operator()(const uchar** src, uchar* dst, int dststep, + int dstcount, int width) = 0; + //! resets the internal buffers, if any + virtual void reset() {} + + //! The size of the kernel + int ksize; + //! The center of the filter, e.g. gaussian blur, anchor is ksize / 2 + int anchor; +}; + +/*! + * \struct RowFilter + * \brief The filter of the row + * \tparam ST the type of src + * \tparam DT the type of dst + * \tparam VecOp process the element using vectorized operator. + */ +template +struct RowFilter : public BaseRowFilter { + RowFilter(const Mat
<DT>& kernel_, int anchor_, + const VecOp& vec_op_ = VecOp()) { + anchor = anchor_; + kernel = kernel_.clone(); + ksize = kernel.cols(); + vec_op = vec_op_; + } + + void operator()(const uchar* src, uchar* dst, int width, int cn) { + const DT* kx = kernel.ptr(); + const ST* S; + DT* D = reinterpret_cast<DT*>(dst); + int i, k; + + i = vec_op(src, dst, width, cn); + width *= cn; +#if MEGCV_ENABLE_UNROLLED + for (; i <= width - 4; i += 4) { + S = reinterpret_cast<const ST*>(src) + i; + DT f = kx[0]; + DT s0 = f * S[0], s1 = f * S[1], s2 = f * S[2], s3 = f * S[3]; + + for (k = 1; k < ksize; k++) { + S += cn; + f = kx[k]; + s0 += f * S[0]; + s1 += f * S[1]; + s2 += f * S[2]; + s3 += f * S[3]; + } + + D[i] = s0; + D[i + 1] = s1; + D[i + 2] = s2; + D[i + 3] = s3; + } +#endif + for (; i < width; i++) { + S = reinterpret_cast<const ST*>(src) + i; + DT s0 = kx[0] * S[0]; + for (k = 1; k < ksize; k++) { + S += cn; + s0 += kx[k] * S[0]; + } + D[i] = s0; + } + } + + //! The kernel used in RowFilter + Mat<DT> kernel; + //! The vectorized operator used in RowFilter + VecOp vec_op; +}; + +template <typename ST, typename DT, class VecOp> +struct SymmRowSmallFilter : public RowFilter<ST, DT, VecOp> { + SymmRowSmallFilter(const Mat<DT>
& kernel_, int anchor_, + const VecOp& vec_op_ = VecOp()) + : RowFilter(kernel_, anchor_, vec_op_) {} + + void operator()(const uchar* src, uchar* dst, int width, int cn) { + int ksize2 = this->ksize / 2, ksize2n = ksize2 * cn; + const DT* kx = this->kernel.ptr() + ksize2; + DT* D = reinterpret_cast(dst); + int i = this->vec_op(src, dst, width, cn), j, k; + + //! The center + const ST* S = reinterpret_cast(src) + i + ksize2n; + width *= cn; + + if (this->ksize == 1 && kx[0] == 1) { + for (; i <= width - 2; i += 2) { + DT s0 = S[i], s1 = S[i + 1]; + D[i] = s0; + D[i + 1] = s1; + } + S += i; + } else if (this->ksize == 3) { + DT k0 = kx[0], k1 = kx[1]; + for (; i <= width - 2; i += 2, S += 2) { + DT s0 = S[0] * k0 + (S[-cn] + S[cn]) * k1, + s1 = S[1] * k0 + (S[1 - cn] + S[1 + cn]) * k1; + D[i] = s0; + D[i + 1] = s1; + } + } else if (this->ksize == 5) { + DT k0 = kx[0], k1 = kx[1], k2 = kx[2]; + for (; i <= width - 2; i += 2, S += 2) { + DT s0 = S[0] * k0 + (S[-cn] + S[cn]) * k1 + + (S[-cn * 2] + S[cn * 2]) * k2; + DT s1 = S[1] * k0 + (S[1 - cn] + S[1 + cn]) * k1 + + (S[1 - cn * 2] + S[1 + cn * 2]) * k2; + D[i] = s0; + D[i + 1] = s1; + } + } + + for (; i < width; i++, S++) { + DT s0 = kx[0] * S[0]; + for (k = 1, j = cn; k <= ksize2; k++, j += cn) + s0 += kx[k] * (S[j] + S[-j]); + D[i] = s0; + } + + } +}; + +template +struct ColumnFilter : public BaseColumnFilter { + typedef typename CastOp::type1 ST; + typedef typename CastOp::rtype DT; + + ColumnFilter(const Mat& kernel_, int anchor_, + const CastOp& cast_op_ = CastOp(), + const VecOp& vec_op_ = VecOp()) { + kernel = kernel_.clone(); + anchor = anchor_; + ksize = kernel.cols(); + cast_op = cast_op_; + vec_op = vec_op_; + } + + void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) + { + const ST* ky = this->kernel.ptr(); + int i = 0, k; + CastOp castOp = this->cast_op; + { + for( ; count > 0; count--, dst += dststep, src++ ) + { + DT* D = (DT*)dst; + i = (this->vec_op)(src, dst, count, width); +#if MEGCV_ENABLE_UNROLLED + for( ; i <= width - 4; i += 4 ) + { + ST f = ky[0]; + const ST* S = (const ST*)src[0] + i; + ST s0 = f*S[0], s1 = f*S[1], + s2 = f*S[2], s3 = f*S[3]; + + for( k = 1; k < ksize; k++ ) + { + S = (const ST*)src[k] + i; + f = ky[k]; + s0 += f*S[0]; + s1 += f*S[1]; + s2 += f*S[2]; + s3 += f*S[3]; + } + + D[i] = castOp(s0); D[i+1] = castOp(s1); + D[i+2] = castOp(s2); D[i+3] = castOp(s3); + } +#endif + for( ; i < width; i++ ) + { + ST s0 = 0; + for( k = 0; k < ksize; k++ ) { + s0 += ky[k]* ((const ST*)src[k])[i]; + } + D[i] = castOp(s0); + } + } + } + } + + Mat kernel; + CastOp cast_op; + VecOp vec_op; +}; + +template +struct SymmColumnFilter : public ColumnFilter { + typedef typename CastOp::type1 ST; + typedef typename CastOp::rtype DT; + + SymmColumnFilter(const Mat& kernel_, int anchor_, + const CastOp& cast_op_ = CastOp(), + const VecOp& vec_op_ = VecOp()) + : ColumnFilter(kernel_, anchor_, cast_op_, + vec_op_) { + } + + void operator()(const uchar** src, uchar* dst, int dststep, int count, + int width) { + int ksize2 = this->ksize / 2; + const ST* ky = this->kernel.ptr() + ksize2; + int i, k; + src += ksize2; + + for (; count > 0; count--, dst += dststep, src++) { + DT* D = (DT*)dst; + i = (this->vec_op)(src, dst, count, width); +#if MEGCV_ENABLE_UNROLLED + for (; i <= width - 4; i += 4) { + ST f = ky[0]; + const ST *S = (const ST*)src[0] + i, *S2; + ST s0 = f * S[0], s1 = f * S[1], s2 = f * S[2], s3 = f * S[3]; + + for (k = 1; k <= ksize2; k++) { + S = (const ST*)src[k] + i; + S2 = (const 
ST*)src[-k] + i; + f = ky[k]; + s0 += f * (S[0] + S2[0]); + s1 += f * (S[1] + S2[1]); + s2 += f * (S[2] + S2[2]); + s3 += f * (S[3] + S2[3]); + } + + D[i] = this->cast_op(s0); + D[i + 1] = this->cast_op(s1); + D[i + 2] = this->cast_op(s2); + D[i + 3] = this->cast_op(s3); + } +#endif + for (; i < width; i++) { + ST s0 = ky[0] * ((const ST*)src[0])[i]; + for (k = 1; k <= ksize2; k++) { + s0 += ky[k] * + (((const ST*)src[k])[i] + ((const ST*)src[-k])[i]); + } + D[i] = this->cast_op(s0); + } + } + } +}; + +template +struct SymmColumnSmallFilter : public SymmColumnFilter { + typedef typename CastOp::type1 ST; + typedef typename CastOp::rtype DT; + + SymmColumnSmallFilter(const Mat& kernel_, int anchor_, + const CastOp& cast_op_ = CastOp(), + const VecOp& vec_op_ = VecOp()) + : SymmColumnFilter(kernel_, anchor_, cast_op_, + vec_op_) { + //! \warning Only process if the kernel size is 3 + megdnn_assert(this->ksize == 3); + } + + void operator()(const uchar** src, uchar* dst, int dststep, int count, + int width) { + int ksize2 = this->ksize / 2; + const ST* ky = this->kernel.ptr() + ksize2; + int i; + ST f0 = ky[0], f1 = ky[1]; + src += ksize2; + + if (std::is_same::value && std::is_same::value) { + (this->vec_op)(src, dst, count, width); + } + + for (; count > 0; count--, dst += dststep, src++) { + DT* D = (DT*)dst; + i = (this->vec_op)(src, dst, count, width); + if (count == 0) + break; + const ST* S0 = (const ST*)src[-1]; + const ST* S1 = (const ST*)src[0]; + const ST* S2 = (const ST*)src[1]; + + { +#if MEGCV_ENABLE_UNROLLED + for (; i <= width - 4; i += 4) { + ST s0 = (S0[i] + S2[i]) * f1 + S1[i] * f0; + ST s1 = (S0[i + 1] + S2[i + 1]) * f1 + S1[i + 1] * f0; + D[i] = this->cast_op(s0); + D[i + 1] = this->cast_op(s1); + + s0 = (S0[i + 2] + S2[i + 2]) * f1 + S1[i + 2] * f0; + s1 = (S0[i + 3] + S2[i + 3]) * f1 + S1[i + 3] * f0; + D[i + 2] = this->cast_op(s0); + D[i + 3] = this->cast_op(s1); + } +#endif + for (; i < width; i++) { + ST s0 = (S0[i] + S2[i]) * f1 + S1[i] * f0; + D[i] = this->cast_op(s0); + } + } + } + } +}; + +/* ============================ Filter Engine ========================= */ + +/*! + * \brief The common class for filtering the image. First filter the image using + * row filter and store in buffer data, and then using column filter. + * \tparam ST The image data type + * \tparam FT The inner buffer data type. + * + * \note As for uint8_t type, we may use int to store the buffer, which calc the + * product of the image and the filter kernel. + */ +template +class FilterEngine { +public: + FilterEngine() = default; + /*! + * \brief Init the filter and border. + * \warning row_filter and column_filter must be non-null + */ + FilterEngine(BaseRowFilter* row_filter, BaseColumnFilter* column_filter, + size_t ch, const ST* border_value, BorderMode bmode); + + //! the destructor + ~FilterEngine(); + //! applies filter to the the whole image. + void apply(const Mat& src, Mat& dst); + +private: + //! starts filtering of the src image. + void start(const Mat& src); + //! processes the next srcCount rows of the image. + int proceed(const uchar* src, int srcStep, int srcCount, uchar* dst, + int dstStep); + + //! row filter filter + BaseRowFilter* m_row_filter; + //! column filter filter + BaseColumnFilter* m_column_filter; + //! the channel of the image + size_t m_ch; + BorderMode m_bmode; + + //! the size of the kernel + Size m_ksize; + + //! the center of kernel, e.g GuassianBlur m_anchor is (kernel_row/2, + //! kernel_column/2) + Point m_anchor; + + //! the whole size. 
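+    //! (rows/cols of the source image, recorded by start() and reused by
+    //! proceed() for row copies and border interpolation)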
+ Size m_whole_size; + //! store the border value, if sizeof(src_type) >= 4, + std::vector m_border_table; + //! nr of border value + int m_border_elem_size; + + //! the step of the buffer data. + int m_buf_step; + + //! store the border value, The size is ksize.cols - 1 + std::vector m_const_border_value; + //! store the total row if the border is BORDER_CONSTANT, the size is + //! image_width + kernel_width - 1, which include the row and the border. + std::vector m_const_border_row; + //! store the total row if the border is not BORDER_CONSTANT + std::vector m_src_row; + + //! store the kernel_height rows data. + std::vector m_ring_buf; + + //! the border left width, equal to m_anchor.x + int m_left_width; + //! equal to m_ksize.width() - m_left_width - 1 + int m_right_width; +}; + +} // namespace filter_common +} // namespace megcv +} // namespace megdnn + +// vim: filetype=cpp.doxygen diff --git a/dnn/src/common/cv/helper.h b/dnn/src/common/cv/helper.h new file mode 100644 index 00000000..f7069e1e --- /dev/null +++ b/dnn/src/common/cv/helper.h @@ -0,0 +1,281 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ * + * --------------------------------------------------------------------------- + * \file dnn/src/common/cv/helper.h + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * --------------------------------------------------------------------------- + */ + +#pragma once + +#include +#include +#include +#include + +#include "./aligned_allocator.h" +#include "./common.h" +#include "src/common/utils.h" + +#include "megdnn/basic_types.h" +#include "megdnn/opr_param_defs.h" + +#if defined(__SSE2__) +#include +#endif + +#define MegCVException(expr) \ + do { \ + megdnn_throw(megdnn_mangle(#expr)); \ + } while (0) + +namespace megdnn { + +namespace megcv { + +template +using AlignedVector = std::vector>; + +static inline size_t align_size(size_t sz, int n) { + megdnn_assert((n & (n - 1)) == 0); + return (sz + n - 1) & -n; +} + +static inline int clip(int x, int a, int b) { + return x >= a ? (x < b ? x : b - 1) : a; +} + +template +static inline _Tp* align_ptr(_Tp* ptr, int n = (int)sizeof(_Tp)) { + return (_Tp*)(((size_t)ptr + n - 1) & -n); +} + +template +inline T saturate(T x, T lower, T upper) { + return (x < lower ? lower : (x >= upper ? upper - 1 : x)); +} + +// common functions +template +T modf(T x, T* iptr) { + T ival; + T rval(std::modf(x, &ival)); + *iptr = ival; + return rval; +} + +template +int round(T value) { + T intpart, fractpart; + fractpart = modf(value, &intpart); + if ((fabs(fractpart) != 0.5) || ((((int)intpart) % 2) != 0)) + return (int)(value + (value >= 0 ? 0.5 : -0.5)); + else + return (int)intpart; +} +template +static inline DT saturate_cast(ST x) { + return x; +} + +template <> +inline unsigned char saturate_cast(int x) { + return (unsigned char)((unsigned)x <= UCHAR_MAX ? x + : x > 0 ? UCHAR_MAX : 0); +} + +template <> +inline short saturate_cast(int x) { + return (short)((unsigned)(x - SHRT_MIN) <= (unsigned)USHRT_MAX + ? x + : x > 0 ? SHRT_MAX : SHRT_MIN); +} + +template +static inline int cv_round(ST value); + +template <> +inline int cv_round(float value) { +#if defined(__SSE2__) + __m128 t = _mm_set_ss(value); + return _mm_cvtss_si32(t); +#elif defined(__GNUC__) + return (int)lrintf(value); +#else + /* it's ok if round does not comply with IEEE754 standard; + the tests should allow +/-1 difference when the tested functions use round + */ + return (int)(value + (value >= 0 ? 0.5f : -0.5f)); +#endif +} + +template <> +inline int cv_round(double value) { +#if defined(__SSE2__) + __m128d t = _mm_set_sd(value); + return _mm_cvtsd_si32(t); +#elif defined(__GNUC__) + return (int)lrint(value); +#else + /* it's ok if round does not comply with IEEE754 standard; + the tests should allow +/-1 difference when the tested functions use round + */ + return (int)(value + (value >= 0 ? 
0.5f : -0.5f)); +#endif +} + +template <> +inline int saturate_cast<int>(float x) { + return cv_round(x); +} + +template <> +inline short saturate_cast<short>(float x) { + return saturate_cast<short>(saturate_cast<int>(x)); +} + +template <> +inline int saturate_cast<int>(double x) { + return cv_round(x); +} + +template <typename ST, typename DT, int bits> +struct FixedPtCast { + typedef ST type1; + typedef DT rtype; + enum { SHIFT = bits, DELTA = 1 << (bits - 1) }; + + DT operator()(ST val) const { + return saturate_cast<DT>((val + DELTA) >> SHIFT); + } +}; + +template <typename ST, typename DT> +struct FixedPtCastEx { + typedef ST type1; + typedef DT rtype; + + FixedPtCastEx() : SHIFT(0), DELTA(0) {} + FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits - 1) : 0) {} + DT operator()(ST val) const { return saturate_cast<DT>(val + DELTA); } + int SHIFT, DELTA; +}; + +template <> +struct FixedPtCastEx<int, uchar> { + typedef int type1; + typedef uchar rtype; + + FixedPtCastEx() : SHIFT(0), DELTA(0) {} + FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits - 1) : 0) {} + uchar operator()(int val) const { + return saturate_cast<uchar>((val + DELTA) >> SHIFT); + } + int SHIFT, DELTA; +}; + +template <typename ST, typename DT> +struct Cast { + typedef ST type1; + typedef DT rtype; + + DT operator()(ST val) const { return saturate_cast<DT>
(val); } +}; + +template +static inline int border_interpolate(int p, int len) { + using BorderMode = param::WarpPerspective::BorderMode; + if ((unsigned)p < (unsigned)len) + ; + else if (bmode == BorderMode::BORDER_REPLICATE) + p = p < 0 ? 0 : len - 1; + else if (bmode == BorderMode::BORDER_REFLECT || + bmode == BorderMode::BORDER_REFLECT_101) { + int delta = (bmode == BorderMode::BORDER_REFLECT_101); + if (len == 1) + return 0; + do { + if (p < 0) + p = -p - 1 + delta; + else + p = len - 1 - (p - len) - delta; + } while ((unsigned)p >= (unsigned)len); + } else if (bmode == BorderMode::BORDER_WRAP) { + if (p < 0) + p -= ((p - len + 1) / len) * len; + while (p >= len) { + p -= len; + } + } else if (bmode == BorderMode::BORDER_CONSTANT || + bmode == BorderMode::BORDER_TRANSPARENT) + p = -1; + else + megdnn_throw("Unknown/unsupported border type"); + return p; +} + +namespace gaussian_blur { + +using BorderMode = param::GaussianBlur::BorderMode; + +#include "./bordermode-inl.h" + +} // namespace gaussian_blur + +} // namespace megcv +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/interp_helper.cpp b/dnn/src/common/cv/interp_helper.cpp new file mode 100644 index 00000000..91f8d2ec --- /dev/null +++ b/dnn/src/common/cv/interp_helper.cpp @@ -0,0 +1,257 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + * + * --------------------------------------------------------------------------- + * \file dnn/src/common/cv/interp_helper.cpp + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * --------------------------------------------------------------------------- + */ + +#pragma GCC diagnostic ignored "-Wnon-virtual-dtor" +// TableHolderBase has no problem; ignore the warning for old clang versions + +#include "./helper.h" +#include "./interp_helper.h" + +#include "src/common/utils.h" + +using namespace megdnn; +using namespace megdnn::megcv; + +static constexpr double MEGCV_PI_4 = 0.78539816339744830962; /* pi/4 */ + +#define DEF_FUN(_ret) \ + template \ + _ret InterpolationTable:: + +#define DEF_TABLE_HOLDER(_name, _ksize) \ + template \ + typename InterpolationTable< \ + INTER_BITS_, INTER_MAX_, \ + INTER_REMAP_COEF_BITS_>::template TableHolder<_ksize> \ + InterpolationTable::_name + +DEF_TABLE_HOLDER(sm_tab_linear, 2); +DEF_TABLE_HOLDER(sm_tab_cubic, 4); +DEF_TABLE_HOLDER(sm_tab_lanczos4, 8); + +DEF_FUN(void) interpolate_linear(float x, float* coeffs) { + coeffs[0] = 1.f - x; + coeffs[1] = x; +} + +DEF_FUN(void) interpolate_cubic(float x, float* coeffs) { + const float A = -0.75f; + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +DEF_FUN(void) interpolate_lanczos4(float x, float* coeffs) { + static const double s45 = 0.70710678118654752440084436210485; + static const double cs[][2] = {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, + {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; + if (x < FLT_EPSILON) { + for (int i = 0; i < 8; i++) + coeffs[i] = 0; + coeffs[3] = 1; + return; + } + float sum = 0; + double y0 = -(x + 3) * MEGCV_PI_4, s0 = sin(y0), c0 = cos(y0); + for (int i = 0; i < 8; i++) { + double y = -(x + 3 - i) * MEGCV_PI_4; + coeffs[i] = (float)((cs[i][0] * s0 + cs[i][1] * c0) / (y * y)); + sum += coeffs[i]; + } + sum = 1.f / sum; + for (int i = 0; i < 8; i++) + coeffs[i] *= sum; +} + +DEF_FUN(void) +init_inter_tab_1d(InterpolationMode imode, float* tab, int tabsz) { + float scale = 1.f / tabsz; + switch (imode) { + case IMode::INTER_LINEAR: + for (int i = 0; i < tabsz; ++i, tab += 2) + interpolate_linear(i * scale, tab); + break; + case IMode::INTER_CUBIC: + for (int i = 0; i < tabsz; ++i, tab += 4) + interpolate_cubic(i * scale, tab); + break; + case IMode::INTER_LANCZOS4: + for (int i = 0; i < tabsz; ++i, tab += 8) + interpolate_lanczos4(i * scale, tab); + 
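+            // (Eight coefficients per table entry here: INTER_LANCZOS4 uses an
+            // 8-tap kernel, which matches the `tab += 8` stride above.)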
break; + default: + megdnn_throw("unsupported interpolation mode"); + } +} + +#if MEGDNN_X86 +DEF_FUN(const int16_t*) get_linear_ic4_table() { + auto table_holder = &sm_tab_linear; + std::lock_guard lg{table_holder->mtx}; + float* tab = nullptr; + short* itab = nullptr; + MEGDNN_MARK_USED_VAR(tab); + MEGDNN_MARK_USED_VAR(itab); + megdnn_assert(table_holder->get(&tab, &itab), + "invoke get_table before get_linear_ic4_table"); + return table_holder->table->bilineartab_ic4_buf; +} +#endif + +DEF_FUN(const void*) get_table(InterpolationMode imode, bool fixpt) { + TableHolderBase* table_holder = nullptr; + int ksize = 0; + switch (imode) { + case IMode::INTER_LINEAR: + table_holder = &sm_tab_linear; + ksize = 2; + break; + case IMode::INTER_CUBIC: + table_holder = &sm_tab_cubic; + ksize = 4; + break; + case IMode::INTER_LANCZOS4: + table_holder = &sm_tab_lanczos4; + ksize = 8; + break; + default: + megdnn_throw(("unsupported interpolation mode")); + } + std::lock_guard lg{table_holder->mtx}; + + float* tab = nullptr; + short* itab = nullptr; + if (!table_holder->get(&tab, &itab)) { + float _tab[8 * INTER_TAB_SIZE]; + int i, j, k1, k2; + init_inter_tab_1d(imode, _tab, INTER_TAB_SIZE); + for (i = 0; i < INTER_TAB_SIZE; ++i) { + for (j = 0; j < INTER_TAB_SIZE; + ++j, tab += ksize * ksize, itab += ksize * ksize) { + int isum = 0; + for (k1 = 0; k1 < ksize; ++k1) { + float vy = _tab[i * ksize + k1]; + for (k2 = 0; k2 < ksize; ++k2) { + float v = vy * _tab[j * ksize + k2]; + tab[k1 * ksize + k2] = v; + isum += itab[k1 * ksize + k2] = saturate_cast( + v * INTER_REMAP_COEF_SCALE); + } + } + if (isum != INTER_REMAP_COEF_SCALE) { + int diff = isum - INTER_REMAP_COEF_SCALE; + int ksize2 = ksize / 2, Mk1 = ksize2, Mk2 = ksize2; + int mk1 = ksize2, mk2 = ksize2; + for (k1 = ksize2; k1 < ksize2 + 2; ++k1) + for (k2 = ksize2; k2 < ksize2 + 2; ++k2) { + if (itab[k1 * ksize + k2] < + itab[mk1 * ksize + mk2]) { + mk1 = k1; + mk2 = k2; + } else if (itab[k1 * ksize + k2] > + itab[Mk1 * ksize + Mk2]) { + Mk1 = k1; + Mk2 = k2; + } + } + if (diff < 0) + itab[Mk1 * ksize + Mk2] = + (short)(itab[Mk1 * ksize + Mk2] - diff); + else + itab[mk1 * ksize + mk2] = + (short)(itab[mk1 * ksize + mk2] - diff); + } + } + } + tab -= INTER_TAB_SIZE2 * ksize * ksize; + itab -= INTER_TAB_SIZE2 * ksize * ksize; + +#if MEGDNN_X86 + if (imode == IMode::INTER_LINEAR) { + int16_t* bilineartab_ic4_buf = + sm_tab_linear.table->bilineartab_ic4_buf; + for (i = 0; i < INTER_TAB_SIZE2; i++) + for (j = 0; j < 4; j++) { + bilineartab_ic4_buf[i * 2 * 8 + 0 * 8 + j * 2] = + itab[i * ksize * ksize + 0 * ksize + 0]; + bilineartab_ic4_buf[i * 2 * 8 + 0 * 8 + j * 2 + 1] = + itab[i * ksize * ksize + 0 * ksize + 1]; + bilineartab_ic4_buf[i * 2 * 8 + 1 * 8 + j * 2] = + itab[i * ksize * ksize + 1 * ksize + 0]; + bilineartab_ic4_buf[i * 2 * 8 + 1 * 8 + j * 2 + 1] = + itab[i * ksize * ksize + 1 * ksize + 1]; + } + } +#endif + } + return fixpt ? static_cast(itab) : static_cast(tab); +} + +namespace megdnn { +namespace megcv { + +// explicit inst +template class InterpolationTable<5, 7, 15>; + +} // namespace megcv +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/interp_helper.h b/dnn/src/common/cv/interp_helper.h new file mode 100644 index 00000000..9f9dcd85 --- /dev/null +++ b/dnn/src/common/cv/interp_helper.h @@ -0,0 +1,177 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. 
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + * + * --------------------------------------------------------------------------- + * \file dnn/src/common/cv/interp_helper.h + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * --------------------------------------------------------------------------- + */ + +#pragma once + +#include "src/common/cv/aligned_allocator.h" + +#include "megdnn/opr_param_defs.h" + +#include +#include +#include + +namespace megdnn { +namespace megcv { + +using InterpolationMode = megdnn::param::WarpPerspective::InterpolationMode; +using BorderMode = megdnn::param::WarpPerspective::BorderMode; + +/*! 
+ * \brief helper for generating interpolation tables for different interpolation + * modes + */ +template +class InterpolationTable { +public: + using IMode = InterpolationMode; + + static constexpr int INTER_BITS = INTER_BITS_; + static constexpr int INTER_MAX = INTER_MAX_; + static constexpr int INTER_REMAP_COEF_BITS = INTER_REMAP_COEF_BITS_; + static constexpr int INTER_TAB_SIZE = (1 << INTER_BITS); + static constexpr int INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE; + static constexpr int INTER_REMAP_COEF_SCALE = 1 << INTER_REMAP_COEF_BITS; + + /*! + * \brief get interpolation table + * + * The table dimension is [INTER_TAB_SIZE][INTER_TAB_SIZE][ksize][ksize] + * + * \param imode interpolation mode + * \param fixpt if this is true, return a table for int16_t; else return a + * table for float + * \return table for int16 or float according to fixpt + */ + static const void* get_table(InterpolationMode imode, bool fixpt); +#if MEGDNN_X86 + /** + * \brief get interpolation table for linear mode. + * + * This current only avaiable in \warning X86. + * + * \return bilineartab_ic4_buf + */ + static const int16_t* get_linear_ic4_table(); +#endif + +private: + template + struct Table { + float ftab[INTER_TAB_SIZE2 * ksize * ksize]; + int16_t itab[INTER_TAB_SIZE2 * ksize * ksize]; +#if MEGDNN_X86 + alignas(128) int16_t bilineartab_ic4_buf[INTER_TAB_SIZE2 * 2 * 8]; + + static void* operator new(std::size_t sz) { + return ah::aligned_allocator().allocate(sz / + sizeof(Table)); + } + void operator delete(void* ptr) noexcept { + ah::aligned_allocator().deallocate( + reinterpret_cast(ptr), 0); + } +#endif + }; + + struct TableHolderBase { + std::mutex mtx; + + //! get table pointer; return whether already init + virtual bool get(float**, int16_t**) = 0; + + protected: + ~TableHolderBase() = default; + }; + + template + struct TableHolder final : public TableHolderBase { + std::unique_ptr> table; + + bool get(float** ftab, int16_t** itab) override { + bool ret = true; + if (!table) { + ret = false; + table.reset(new Table); + } + *ftab = table->ftab; + *itab = table->itab; + return ret; + } + }; + + static void init_inter_tab_1d(InterpolationMode imode, float* tab, + int tabsz); + + static inline void interpolate_linear(float x, float* coeffs); + static inline void interpolate_cubic(float x, float* coeffs); + static inline void interpolate_lanczos4(float x, float* coeffs); + + static TableHolder<2> sm_tab_linear; + static TableHolder<4> sm_tab_cubic; + static TableHolder<8> sm_tab_lanczos4; +}; + +} // namespace megcv +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/linalg.h b/dnn/src/common/cv/linalg.h new file mode 100644 index 00000000..148b8611 --- /dev/null +++ b/dnn/src/common/cv/linalg.h @@ -0,0 +1,260 @@ +/** + * \file dnn/src/common/cv/linalg.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include +#include +#include + +namespace megdnn { +namespace linalg { +/*! + * solve linear system Ax=b. note that @A and @b will be modified. 
result x is + * store in @b + */ +template +void solve(value_type* A, uint32_t n, value_type* b) { +#define AT(i, j) A[(i)*n + (j)] + + auto swap_row = [&](uint32_t i, uint32_t j, uint32_t start) { + if (i == j) + return; + for (size_t k = start; k < n; k++) + std::swap(AT(i, k), AT(j, k)); + std::swap(b[i], b[j]); + }; + + auto mult_row_scalar = [&](uint32_t row, value_type f, uint32_t start) { + for (size_t j = start; j < n; j++) + AT(row, j) *= f; + b[row] *= f; + }; + + for (uint32_t i = 0; i < n; i++) { + // swap the row which has the max absolute value to row i + uint32_t idx = i; + value_type max_abs_val = std::abs(AT(i, i)); + for (uint32_t j = i + 1; j < n; j++) { + value_type abs_val = std::abs(AT(j, i)); + if (abs_val > max_abs_val) { + max_abs_val = abs_val; + idx = j; + } + } + swap_row(i, idx, i); + + mult_row_scalar(i, value_type(1) / AT(i, i), i); + auto row_i = A + i * n; + for (uint32_t j = i + 1; j < n; j++) { + value_type factor = AT(j, i); + auto row_j = A + j * n; + + uint32_t k = i; + uint32_t repeat = (n - i) / 8; + uint32_t left = n - i - repeat * 8; + while (repeat--) { + row_j[k] -= row_i[k] * factor; + row_j[k + 1] -= row_i[k + 1] * factor; + row_j[k + 2] -= row_i[k + 2] * factor; + row_j[k + 3] -= row_i[k + 3] * factor; + row_j[k + 4] -= row_i[k + 4] * factor; + row_j[k + 5] -= row_i[k + 5] * factor; + row_j[k + 6] -= row_i[k + 6] * factor; + row_j[k + 7] -= row_i[k + 7] * factor; + k += 8; + } + + switch (left) { + case 7: + row_j[k + 6] -= row_i[k + 6] * factor; + case 6: + row_j[k + 5] -= row_i[k + 5] * factor; + case 5: + row_j[k + 4] -= row_i[k + 4] * factor; + case 4: + row_j[k + 3] -= row_i[k + 3] * factor; + case 3: + row_j[k + 2] -= row_i[k + 2] * factor; + case 2: + row_j[k + 1] -= row_i[k + 1] * factor; + case 1: + row_j[k] -= row_i[k] * factor; + case 0:; + } + + b[j] -= b[i] * factor; + } + } + + for (int i = int(n) - 1; i >= 0; i--) { + for (int j = i - 1; j >= 0; j--) { + b[j] -= b[i] * AT(j, i); + } + } +#undef AT +} + +template +void fill_eye(value_type* A, uint32_t n) { + memset(A, 0, n * n * sizeof(value_type)); + for (uint32_t i = 0; i < n; i++) + A[i * n + i] = 1; +} + +/*! + * compute the inverse of a matrix A and store it in B. A will be altered. 
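+ *
+ * Illustrative usage sketch (the concrete values below are only an example):
+ *
+ *     float A[4] = {4.f, 7.f, 2.f, 6.f};  // row-major 2x2 input; clobbered by the call
+ *     float B[4];                         // receives the inverse of A
+ *     megdnn::linalg::inverse_mat(A, B, 2);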
+ */ +template +void inverse_mat(value_type* A, value_type* B, uint32_t n) { +#define AT(A, i, j) A[(i)*n + (j)] + + auto swap_row = [&](value_type* A, uint32_t i, uint32_t j, uint32_t start) { + if (i == j) + return; + for (size_t k = start; k < n; k++) + std::swap(AT(A, i, k), AT(A, j, k)); + }; + + auto mult_row_scalar = [&](value_type* A, uint32_t row, value_type f, + uint32_t start) { + for (size_t j = start; j < n; j++) + AT(A, row, j) *= f; + }; + + auto vec_axpy = [](value_type a, value_type* x, value_type* y, uint32_t m) { + for (uint32_t i = 0; i < m; i++) + *(y++) += a * *(x++); + }; + + fill_eye(B, n); + + for (uint32_t i = 0; i < n; i++) { + // swap the row which has the max absolute value to row i + uint32_t idx = i; + value_type max_abs_val = std::abs(AT(A, i, i)); + for (uint32_t j = i + 1; j < n; j++) { + value_type abs_val = std::abs(AT(A, j, i)); + if (abs_val > max_abs_val) { + max_abs_val = abs_val; + idx = j; + } + } + swap_row(A, i, idx, 0); + swap_row(B, i, idx, 0); + + value_type scale = value_type(1) / AT(A, i, i); + + mult_row_scalar(A, i, scale, i); + mult_row_scalar(B, i, scale, 0); + + auto A_row_i = A + i * n, B_row_i = B + i * n; + for (uint32_t j = i + 1; j < n; j++) { + value_type factor = AT(A, j, i); + auto A_row_j = A + j * n, B_row_j = B + j * n; + vec_axpy(-factor, A_row_i + i, A_row_j + i, n - i); + vec_axpy(-factor, B_row_i, B_row_j, n); + } + } + + for (int i = int(n) - 1; i >= 0; i--) { + for (int j = i - 1; j >= 0; j--) { + value_type factor = -AT(A, j, i); + // vec_axpy(factor, A + i * n, A + j * n, n); + vec_axpy(factor, B + i * n, B + j * n, n); + } + } +#undef AT +} + +/// C = A * B +/// A, B must point to memory space different from C +template +void mat_mult(const value_type* A, const value_type* B, value_type* C, + uint32_t n) { +#define AT(A, i, j) A[(i)*n + (j)] + memset(C, 0, n * n * sizeof(value_type)); + for (uint32_t k = 0; k < n; k++) { + for (uint32_t i = 0; i < n; i++) + for (uint32_t j = 0; j < n; j++) + AT(C, i, j) += AT(A, i, k) * AT(B, k, j); + } +#undef AT +} + +template +void transpose_mat(const value_type* A, value_type* B, uint32_t rows, + uint32_t cols) { + for (uint32_t i = 0; i < rows; i++) + for (uint32_t j = 0; j < cols; j++) + B[j * rows + i] = A[i * cols + j]; +} + +/*! + * C_{dim0xdim2} = A_{dim0xdim1} * B_{dim1xdim2} + */ +template +void mat_mult_non_square(const value_type* A, const value_type* B, + value_type* C, uint8_t dim0, uint32_t dim1, + uint32_t dim2) { + memset(C, 0, dim0 * dim2 * sizeof(value_type)); + for (uint32_t k = 0; k < dim1; k++) + for (uint32_t i = 0; i < dim0; i++) + for (uint32_t j = 0; j < dim2; j++) + C[i * dim2 + j] += A[i * dim1 + k] * B[k * dim2 + j]; +} + +/*! + * A^{+}_{nxm} = (A^TA)^{-1}A^T + * where n = rows, m = cols. + * + * result will be stored back to A + * + * @param A sizeof rows*cols + * @param buf sizeof (rows + cols + cols) * cols + */ +template +void pseudo_inverse_mat(value_type* A, uint32_t rows, uint32_t cols, + value_type* buf) { + uint32_t &n = rows, &m = cols; + + value_type *B = buf, // m x n, A^T + *C = buf + n * m, // m x m, (A^TA) + *D = buf + n * m + m * m; // m x m, (A^TA)^{-1} + + transpose_mat(A, B, n, m); + mat_mult_non_square(B, A, C, m, n, m); + inverse_mat(C, D, m); + mat_mult_non_square(D, B, A, m, m, n); +} + +/*! + * solve linear system Ax=b with squre-loss using pseudo inverse matrix. + * + * @param A rows x cols, will be altered + * @param b rows x 1 + * @param x cols x 1 + * @param buf buffer used by pseudo_inverse_mat. 
see doc for pseudo_inverse_mat + * for detail. + */ +template +void solve_pseudo(value_type* A, uint32_t rows, uint32_t cols, + const value_type* b, value_type* x, value_type* buf) { + pseudo_inverse_mat(A, rows, cols, buf); + // A is actual A^{+} now + mat_mult_non_square(A, b, x, cols, rows, 1); +} + +} // namespace linalg +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cv/mat.cpp b/dnn/src/common/cv/mat.cpp new file mode 100644 index 00000000..f44eb4f6 --- /dev/null +++ b/dnn/src/common/cv/mat.cpp @@ -0,0 +1,363 @@ +/** + * \file dnn/src/common/cv/mat.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/basic_types.h" +#include "src/common/cv/common.h" +#include "src/common/cv/helper.h" + +#ifdef MEGDNN_CC_CUDA +#include "src/cuda/utils.cuh" +#endif + +namespace megdnn { +namespace megcv { + +#ifdef MEGDNN_CC_CUDA + +template +Mat::Mat(size_t rows, size_t cols, size_t channels, size_t step) + : m_rows(rows), + m_cols(cols), + m_channels(channels), + m_step(step), + m_offset(0) { + megdnn_assert(step >= cols * channels); + megdnn_assert(1 <= channels && channels <= 4); + T* raw_data; + cuda_check(cudaMalloc((void**)&raw_data, sizeof(T) * rows * step)); + m_data = + std::shared_ptr(raw_data, [](T* d) { cuda_check(cudaFree(d)); }); + cudaMemset(m_data.get(), 0, sizeof(T) * rows * step); +} + +template +Mat::Mat(size_t rows, size_t cols, size_t channels) + : Mat(rows, cols, channels, cols * channels) {} + +template +Mat::Mat(size_t rows, size_t cols, size_t channels, T* data) + : m_rows(rows), + m_cols(cols), + m_channels(channels), + m_step(cols * channels), + m_data(data, [](T*) {}), + m_offset(0) {} + +template +Mat::Mat(const Mat& rhs) + : m_rows(rhs.m_rows), + m_cols(rhs.m_cols), + m_channels(rhs.m_channels), + m_step(rhs.m_step), + m_data(rhs.m_data), + m_offset(0) {} + +template +Mat::Mat(const Mat& rhs, size_t row_offset, size_t row_count, + size_t col_offset, size_t col_count) + : m_rows(row_count), + m_cols(col_count), + m_channels(rhs.m_channels), + m_step(rhs.m_step), + m_data(rhs.m_data), + m_offset(rhs.m_offset + row_offset * m_step + + col_offset * m_channels) {} + +template +Mat& Mat::operator=(const Mat& rhs) { + this->m_rows = rhs.m_rows; + this->m_cols = rhs.m_cols; + this->m_channels = rhs.m_channels; + this->m_step = rhs.m_step; + this->m_data = rhs.m_data; + this->m_offset = rhs.m_offset; + return *this; +} + +template +T& Mat::at(size_t r, size_t c, size_t ch) { + megdnn_assert(r < m_rows); + megdnn_assert(c < m_cols); + megdnn_assert(ch < m_channels); + return ptr(r)[c * m_channels + ch]; +} + +template +const T& Mat::at(size_t r, size_t c, size_t ch) const { + megdnn_assert(r < m_rows); + megdnn_assert(c < m_cols); + megdnn_assert(ch < m_channels); + return ptr(r)[c * m_channels + ch]; +} + +template +Mat Mat::clone() const { + Mat res(m_rows, m_cols, m_channels); + for (size_t r = 0; r < m_rows; ++r) { + cuda_check(cudaMemcpy(res.ptr(r), this->ptr(r), + sizeof(T) * m_cols * m_channels, + cudaMemcpyDeviceToDevice)); + } + return res; +} + +template +bool Mat::equals(const Mat& rhs) const { + if (this->m_rows != rhs.m_rows) + return false; + if (this->m_cols != rhs.m_cols) + return false; + if 
(this->m_channels != rhs.m_channels) + return false; + T* row1 = new T[m_cols * m_channels]; + T* row2 = new T[m_cols * m_channels]; + megdnn_assert(row1); + megdnn_assert(row2); + for (size_t r = 0; r < m_rows; ++r) { + cuda_check(cudaMemcpy(row1, this->ptr(r), + sizeof(T) * m_cols * m_channels, + cudaMemcpyDeviceToHost)); + cuda_check(cudaMemcpy(row2, rhs.ptr(r), sizeof(T) * m_cols * m_channels, + cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < m_cols * m_channels; ++i) { + if (row1[i] != row2[i]) + return false; + } + } + delete[] row1; + delete[] row2; + return true; +} + +template +bool Mat::is_continuous() const { + return m_step == m_cols * m_channels; +} + +template +void Mat::read(const T* src) { + megdnn_assert(is_continuous()); + cuda_check(cudaMemcpy(m_data.get(), src, sizeof(T) * this->total_nr_elem(), + cudaMemcpyHostToDevice)); +} + +template +void Mat::write(T* dst) const { + megdnn_assert(is_continuous()); + cuda_check(cudaMemcpy(dst, m_data.get(), sizeof(T) * this->total_nr_elem(), + cudaMemcpyDeviceToHost)); +} + +template class Mat; +template class Mat; +template class Mat; +template class Mat; +template class Mat; + +#else + +template +Mat::Mat() + : m_rows(0), + m_cols(0), + m_channels(0), + m_step(0), + m_data(nullptr), + m_offset(0) {} + +template +Mat::Mat(size_t rows, size_t cols, size_t channels, size_t step) + : m_rows(rows), + m_cols(cols), + m_channels(channels), + m_step(step), + m_data(new T[rows * step], [](T* d) { delete[] d; }), + m_offset(0) { + megdnn_assert(step >= cols * channels); + megdnn_assert(1 <= channels && channels <= 4); + memset(m_data.get(), 0, sizeof(T) * rows * step); +} + +template +Mat TensorND2Mat(const TensorND& tensor, size_t batch) { + size_t m_rows = tensor.layout.shape[1]; + size_t m_cols = tensor.layout.shape[2]; + size_t m_channels = tensor.layout.shape[3]; + size_t m_step = tensor.layout.stride[1]; + T* data = ((T*)tensor.ptr()) + m_step * m_rows * batch; + + Mat mat(m_rows, m_cols, m_channels, m_step, data); + return mat; +} + +template <> +Mat TensorND2Mat(const TensorND& tensor, size_t batch) { + size_t m_rows = tensor.layout.shape[1]; + size_t m_cols = tensor.layout.shape[2]; + size_t m_channels = tensor.layout.shape[3]; + size_t m_step = tensor.layout.stride[1]; + + int* data = tensor.ptr() + m_step * m_rows * batch; + + Mat mat(m_rows, m_cols, m_channels, m_step, data); + return mat; +} + +template <> +Mat TensorND2Mat(const TensorND& tensor, size_t batch) { + size_t m_rows = tensor.layout.shape[1]; + size_t m_cols = tensor.layout.shape[2]; + size_t m_channels = tensor.layout.shape[3]; + size_t m_step = tensor.layout.stride[1]; + float* data = tensor.ptr() + m_step * m_rows * batch; + // m_data = std::shared_ptr(data, [](T *) {}); + + Mat mat(m_rows, m_cols, m_channels, m_step, data); + return mat; +} + +template <> +Mat TensorND2Mat(const TensorND& tensor, size_t batch) { + size_t m_rows = tensor.layout.shape[1]; + size_t m_cols = tensor.layout.shape[2]; + size_t m_channels = tensor.layout.shape[3]; + size_t m_step = tensor.layout.stride[1]; + uchar* data = tensor.ptr() + m_step * m_rows * batch; + // m_data = std::shared_ptr(data, [](T *) {}); + + Mat mat(m_rows, m_cols, m_channels, m_step, data); + return mat; +} + +template +Mat::Mat(size_t rows, size_t cols, size_t channels) + : Mat(rows, cols, channels, cols * channels) {} + +template +Mat::Mat(size_t rows, size_t cols, size_t channels, T* data) + : m_rows(rows), + m_cols(cols), + m_channels(channels), + m_step(cols * channels), + m_data(data, [](T*) {}), + 
m_offset(0) {} + +template +Mat::Mat(size_t rows, size_t cols, size_t channels, size_t step, T* data) + : m_rows(rows), + m_cols(cols), + m_channels(channels), + m_step(step), + m_data(data, [](T*) {}), + m_offset(0) {} + +template +Mat::Mat(const Mat& rhs) + : m_rows(rhs.m_rows), + m_cols(rhs.m_cols), + m_channels(rhs.m_channels), + m_step(rhs.m_step), + m_data(rhs.m_data), + m_offset(0) {} + +template +Mat::Mat(const Mat& rhs, size_t row_offset, size_t row_count, + size_t col_offset, size_t col_count) + : m_rows(row_count), + m_cols(col_count), + m_channels(rhs.m_channels), + m_step(rhs.m_step), + m_data(rhs.m_data), + m_offset(rhs.m_offset + row_offset * m_step + + col_offset * m_channels) {} + +template +Mat& Mat::operator=(const Mat& rhs) { + this->m_rows = rhs.m_rows; + this->m_cols = rhs.m_cols; + this->m_channels = rhs.m_channels; + this->m_step = rhs.m_step; + this->m_data = rhs.m_data; + this->m_offset = rhs.m_offset; + return *this; +} + +template +T& Mat::at(size_t r, size_t c, size_t ch) { + megdnn_assert(r < m_rows); + megdnn_assert(c < m_cols); + megdnn_assert(ch < m_channels); + return ptr(r)[c * m_channels + ch]; +} + +template +const T& Mat::at(size_t r, size_t c, size_t ch) const { + megdnn_assert(r < m_rows); + megdnn_assert(c < m_cols); + megdnn_assert(ch < m_channels); + return ptr(r)[c * m_channels + ch]; +} + +template +Mat Mat::clone() const { + Mat res(m_rows, m_cols, m_channels); + for (size_t r = 0; r < m_rows; ++r) { + memcpy(res.ptr(r), this->ptr(r), sizeof(T) * m_cols * m_channels); + } + return res; +} + +template +bool Mat::equals(const Mat& rhs) const { + if (this->m_rows != rhs.m_rows) + return false; + if (this->m_cols != rhs.m_cols) + return false; + if (this->m_channels != rhs.m_channels) + return false; + for (size_t r = 0; r < m_rows; ++r) { + if (0 != + memcmp(this->ptr(r), rhs.ptr(r), sizeof(T) * m_cols * m_channels)) + return false; + } + return true; +} + +template +bool Mat::is_continuous() const { + return m_step == m_cols * m_channels; +} + +template +void Mat::read(const T* src) { + megdnn_assert(is_continuous()); + memcpy(m_data.get(), src, sizeof(T) * this->total_nr_elem()); +} + +template +void Mat::write(T* dst) const { + megdnn_assert(is_continuous()); + memcpy(dst, m_data.get(), sizeof(T) * this->total_nr_elem()); +} + +template class Mat; +template class Mat; +template class Mat; +template class Mat; +template class Mat; +template class Mat; + +#endif + +} // namespace megcv +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/cvt_color.cpp b/dnn/src/common/cvt_color.cpp new file mode 100644 index 00000000..8f35677e --- /dev/null +++ b/dnn/src/common/cvt_color.cpp @@ -0,0 +1,166 @@ +/** + * \file dnn/src/common/cvt_color.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void CvtColorBase::deduce_layout_fwd(const TensorLayout& src, + TensorLayout& dst) { + auto errmsg = [&]() { return megdnn_layout_msg(src); }; + MEGDNN_MARK_USED_VAR(errmsg); + + auto mode = param().mode; + if (mode == Param::Mode::YUV2RGB_NV21 || + mode == Param::Mode::YUV2BGR_NV21 || + mode == Param::Mode::YUV2RGB_NV12 || + mode == Param::Mode::YUV2BGR_NV12 || + mode == Param::Mode::YUV2RGB_YV12 || + mode == Param::Mode::YUV2BGR_YV12 || + mode == Param::Mode::YUV2RGB_YU12 || + mode == Param::Mode::YUV2BGR_YU12) { + megdnn_log_warn( + "Deprecated mode for cvtcolor, you should refer to the wiki " + "for detail usage"); + } + //! The origin YUV is YCrCb in opencv as histrical reasons, it will remove + //! later + if (mode == Param::Mode::YUV2RGB_NV21) { + mode = Param::Mode::YCrCb2RGB; + } + if (mode == Param::Mode::YUV2BGR_NV21) { + mode = Param::Mode::YCrCb2BGR; + } + + megdnn_assert( + src.ndim == 4_z && (src.shape[3] == 1_z || src.shape[3] == 3_z || + src.shape[3] == 4_z), + "%s", errmsg().c_str()); + + size_t in = src.shape[0]; + size_t ih = src.shape[1]; + size_t iw = src.shape[2]; + size_t ic = src.shape[3]; + + size_t oc = 1; + size_t oh = ih; + size_t ow = iw; + + switch (mode) { + case Param::Mode::RGB2GRAY: + megdnn_assert(ic == 3); + oc = 1; + break; + case Param::Mode::RGB2YUV: + megdnn_assert(ic == 3); + oc = 3; + break; + case Param::Mode::YUV2RGB: + megdnn_assert(ic == 3); + oc = 3; + break; + case Param::Mode::GRAY2RGB: + megdnn_assert(ic == 1); + oc = 3; + break; + case Param::Mode::RGBA2RGB: + megdnn_assert(ic == 4); + oc = 3; + break; + case Param::Mode::RGBA2BGR: + megdnn_assert(ic == 4); + oc = 3; + break; + case Param::Mode::RGBA2GRAY: + megdnn_assert(ic == 4); + oc = 1; + break; + case Param::Mode::RGB2BGR: + megdnn_assert(ic == 3); + oc = 3; + break; + case Param::Mode::BGR2GRAY: + megdnn_assert(ic == 3); + oc = 1; + break; + case Param::Mode::BGR2RGB: + megdnn_assert(ic == 3); + oc = 3; + break; + case Param::Mode::YUV2GRAY_NV21: + case Param::Mode::YUV2GRAY_NV12: + megdnn_assert(ic == 1 && ih % 3 == 0 && iw % 2 == 0); + oh = ih / 3 * 2; + oc = 1; + break; + case Param::Mode::YUV2GRAY_YV12: + case Param::Mode::YUV2GRAY_YU12: + megdnn_assert(ic == 1 && ih % 6 == 0 && iw % 2 == 0); + oh = ih / 3 * 2; + oc = 1; + break; + case Param::Mode::YCrCb2BGR: + case Param::Mode::YCrCb2RGB: + case Param::Mode::YUV2RGB_NV21: + case Param::Mode::YUV2RGB_NV12: + case Param::Mode::YUV2BGR_NV21: + case Param::Mode::YUV2BGR_NV12: + case Param::Mode::BT601_YUV2RGB_NV21: + case Param::Mode::BT601_YUV2RGB_NV12: + case Param::Mode::BT601_YUV2BGR_NV21: + case Param::Mode::BT601_YUV2BGR_NV12: + megdnn_assert(ic == 1 && ih % 3 == 0 && iw % 2 == 0); + oh = ih / 3 * 2; + oc = 3; + break; + case Param::Mode::YUV2RGB_YV12: + case Param::Mode::YUV2RGB_YU12: + case Param::Mode::YUV2BGR_YV12: + case Param::Mode::YUV2BGR_YU12: + case Param::Mode::BT601_YUV2RGB_YV12: + case Param::Mode::BT601_YUV2RGB_YU12: + case Param::Mode::BT601_YUV2BGR_YV12: + case Param::Mode::BT601_YUV2BGR_YU12: + megdnn_assert(ic == 1 && ih % 6 == 0 && iw % 2 == 0); + oh = ih / 3 * 2; + oc = 3; + break; + default: + megdnn_throw("Can not find property cvt_color operator."); + } + + dst = TensorLayout(TensorShape({in, oh, ow, oc}), src.dtype); +} + +void CvtColorBase::check_layout_fwd(const TensorLayout& src, + const TensorLayout& dst) { + megdnn_assert_eq_dtype(src, dst); + TensorLayout dst_expected; + deduce_layout_fwd(src, 
dst_expected); + megdnn_assert_eq_shape(dst_expected, dst); +} + +void CvtColor::deduce_layout(const TensorLayout& src, TensorLayout& dst) { + deduce_layout_fwd(src, dst); +} + +void CvtColor::check_exec(const TensorLayout& src, const TensorLayout& dst, + size_t workspace_in_bytes) { + check_layout_fwd(src, dst); + megdnn_assert_contiguous(src); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/deformable_conv.cpp b/dnn/src/common/deformable_conv.cpp new file mode 100644 index 00000000..d68dff88 --- /dev/null +++ b/dnn/src/common/deformable_conv.cpp @@ -0,0 +1,272 @@ +/** + * \file dnn/src/common/deformable_conv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/nn.h" +#include "src/common/utils.h" + +using namespace megdnn; + +using CanonizedFilterMeta = DeformableConvBase::CanonizedFilterMeta; + +namespace { + +template +std::string get_errmsg(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& dst, const Param& param) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(filter); + MEGDNN_MARK_USED_VAR(dst); + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(filter) + ", " + + megdnn_layout_msg(offset) + ", " + megdnn_layout_msg(mask) + ", " + + megdnn_layout_msg(dst) + ", " + megdnn_mangle("only support nchw") + + ", " + megdnn_mangle("group=") + std::to_string(param.group) + ", " + + megdnn_mangle("deformable_group=") + + std::to_string(param.deformable_group) + ", " + + megdnn_mangle("pad_h=") + std::to_string(param.pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param.pad_w) + ", " + + megdnn_mangle("stride_h=") + std::to_string(param.stride_h) + ", " + + megdnn_mangle("stride_w=") + std::to_string(param.stride_w) + ", " + + megdnn_mangle("dilate_h=") + std::to_string(param.dilate_h) + ", " + + megdnn_mangle("dilate_w=") + std::to_string(param.dilate_w); +} + +template +void make_canonized_filter_meta_nchw(size_t src_ndim, + const TensorLayout& filter, + const Param& param, + CanonizedFilterMeta& ret) { + megdnn_assert(param.mode == Param::Mode::CROSS_CORRELATION, + "only support CROSS_CORRELATION mode"); + + megdnn_assert(param.format == Param::Format::NCHW, + "only support nchw input layout"); + + size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; + + flt_start = 0, flt_spatial_start = 2; + ocpg_pos = 0, icpg_pos = 1; + + if (param.sparse == Param::Sparse::GROUP) + flt_start = 1; + + ret.spatial_ndim = src_ndim - 2; + + megdnn_assert( + ret.spatial_ndim == 2, + "only 2D convolution is supported, and imput should be 4-dim; " + "got input dim = %zu", + src_ndim); + + ret.ocpg = filter[flt_start + ocpg_pos]; + ret.icpg = filter[flt_start + icpg_pos]; + + auto dilation = ret.dilation; + + for (size_t i = 0; i < ret.spatial_ndim; ++i) { + megdnn_assert(dilation[i] > 0, + "invalid dilation on spatial dim %zu, %u", i, + dilation[i]); + ret.spatial[i] = filter[i + flt_start + flt_spatial_start]; + ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1; + } +} + 
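+
+//! Worked example of the filter layouts handled above (illustrative values only):
+//!   dense:  (OC, IC, FH, FW),        e.g. (64, 32, 3, 3)   -> ocpg = 64, icpg = 32
+//!   group:  (G, OC/G, IC/G, FH, FW), e.g. (4, 16, 8, 3, 3) -> ocpg = 16, icpg = 8
+//! With dilation d the effective kernel extent is (FH - 1) * d + 1, which feeds the
+//! output-size computation in deduce_layout_fwd below:
+//!   oh = (ih + 2 * pad_h - kh) / stride_h + 1,
+//!   e.g. ih = 32, kh = 3, pad_h = 1, stride_h = 1  ->  oh = 32.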
+} // namespace + +namespace megdnn { + +CanonizedFilterMeta DeformableConvBase::make_canonized_filter_meta( + size_t src_ndim, const TensorLayout& filter, + const TensorLayout& offset) const { + megdnn_assert_contiguous(filter); + + CanonizedFilterMeta ret; + ret.group = 1; + ret.dtype = filter.dtype; + ret.stride[0] = param().stride_h; + ret.stride[1] = param().stride_w; + ret.padding[0] = param().pad_h; + ret.padding[1] = param().pad_w; + ret.dilation[0] = param().dilate_h; + ret.dilation[1] = param().dilate_w; + + if (param().sparse == Param::Sparse::GROUP) { + megdnn_assert(filter.ndim == 5, + "filter dim should be 5 for group conv"); + ret.group = filter[0]; + } + + make_canonized_filter_meta_nchw(src_ndim, filter, param(), ret); + + auto fh = ret.spatial[0]; + auto fw = ret.spatial[1]; + + ret.deformable_group = offset[1] / (2 * fh * fw); + + return ret; +} + +void DeformableConvBase::deduce_layout_fwd(const TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + TensorLayout& dst) { + // im shape: (n, IC, IH, IW) + megdnn_assert(im.ndim == 4, "invalid src layout: %s", + megdnn_layout_msg(im).c_str()); + // filter shape: (OC, IC, FH, FW) or (g, OC/g, IC/g, FH, FW) + megdnn_assert(filter.ndim == 4 || filter.ndim == 5, + "invalid filter layout: %s", + megdnn_layout_msg(filter).c_str()); + // offset shape: (N, 2*dg*FH*FW, OH, OW) + megdnn_assert(offset.ndim == 4, "invalid offset layout: %s", + megdnn_layout_msg(offset).c_str()); + // mask shape: (N, dg*FH*FW, OH, OW) + megdnn_assert(mask.ndim == 4, "invalid mask layout: %s", + megdnn_layout_msg(mask).c_str()); + + size_t n = im.shape[0], ic = im.shape[1]; + size_t ih = im.shape[2], iw = im.shape[3]; + size_t dh = param().dilate_h, dw = param().dilate_w; + size_t ph = param().pad_h, pw = param().pad_w; + size_t sh = param().stride_h, sw = param().stride_w; + + auto&& fm = make_canonized_filter_meta(im.ndim, filter, offset); + size_t fh = fm.spatial[0], fw = fm.spatial[1]; + + size_t kh = 1 + (fh - 1) * dh; + size_t kw = 1 + (fw - 1) * dw; + + size_t group = fm.group; + size_t deformable_group = fm.deformable_group; + + size_t icpg = fm.icpg, ocpg = fm.ocpg; + size_t oc = group * ocpg; + size_t oh = (ih + ph * 2 - kh) / sh + 1; + size_t ow = (iw + pw * 2 - kw) / sw + 1; + + megdnn_assert(group > 0 && deformable_group > 0, + "group and deformable group should > 0"); + megdnn_assert(ic == icpg * group, "im ic != group * icpg of filter"); + megdnn_assert(ic % deformable_group == 0, "ic %% deformable_group != 0"); + megdnn_assert(oc % deformable_group == 0, "oc %% deformable_group != 0"); + + megdnn_assert( + (offset[1] % (2 * fh * fw) == 0) && (mask[1] % (fh * fw) == 0), + "invalid deformable group deduced from offset(%s) or mask(%s)", + megdnn_layout_msg(offset).c_str(), megdnn_layout_msg(mask).c_str()); + + megdnn_assert((offset[1] / (2 * fh * fw)) == (mask[1] / (fh * fw)), + "offset(%s) and mask(%s) should have same deformable group", + megdnn_layout_msg(offset).c_str(), + megdnn_layout_msg(mask).c_str()); + + megdnn_assert((offset[2] == mask[2]) && (offset[3] == mask[3]), + "offset(%s) and mask(%s) should have same spatial dim", + megdnn_layout_msg(offset).c_str(), + megdnn_layout_msg(mask).c_str()); + megdnn_assert(oh == offset[2], "deduced oh(%zu) != offset oh(%zu)", oh, + offset[2]); + megdnn_assert(ow == offset[3], "deduced ow(%zu) != offset ow(%zu)", ow, + offset[3]); + dst.ndim = 4; + + dst = {{n, oc, oh, ow}, im.dtype}; +} +void DeformableConvBase::check_layout_fwd(const 
TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst) { + auto& im_dtype = im.dtype; + TensorLayout dst_expected; + megdnn_assert(im_dtype.enumv() == DTypeEnum::Float32, + "DeformableConv only support float32 input"); + megdnn_assert_eq_dtype(im, dst); + megdnn_assert_eq_dtype(im, filter); + megdnn_assert_eq_dtype(im, dst); + megdnn_assert_eq_dtype(im, offset); + megdnn_assert_eq_dtype(im, mask); + deduce_layout_fwd(im, filter, offset, mask, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); +} + +void DeformableConvForward::deduce_layout(const TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + TensorLayout& dst) { + deduce_layout_fwd(im, filter, offset, mask, dst); + return; +} + +CanonizedFilterMeta DeformableConvForward::check_exec( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& dst, size_t workspace_in_bytes) { + auto ret = make_canonized_filter_meta(im.ndim, filter, offset); + auto required_workspace_in_bytes = + get_workspace_in_bytes(im, filter, offset, mask, dst); + check_layout_fwd(im, filter, offset, mask, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} + +CanonizedFilterMeta DeformableConvBackwardFilter::check_exec( + const TensorLayout& im, const TensorLayout& offset, + const TensorLayout& mask, const TensorLayout& out_grad, + const TensorLayout& filter_grad, size_t workspace_in_bytes) { + check_layout_fwd(im, filter_grad, offset, mask, out_grad); + // check dtype + megdnn_assert_eq_dtype(im, filter_grad); + + auto ret = make_canonized_filter_meta(im.ndim, filter_grad, offset); + auto required_workspace_in_bytes = + get_workspace_in_bytes(im, offset, mask, out_grad, filter_grad); + + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} + +CanonizedFilterMeta DeformableConvBackwardData::check_exec( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad, + size_t workspace_in_bytes) { + check_layout_fwd(im, filter, offset, mask, out_grad); + + // check dtype + megdnn_assert_eq_dtype(im, im_grad); + megdnn_assert_eq_dtype(im, offset_grad); + megdnn_assert_eq_dtype(im, mask_grad); + + // check layout + megdnn_assert(im.shape == im_grad.shape, "invalid im_grad shape: %s", + megdnn_layout_msg(im_grad).c_str()); + megdnn_assert(offset.shape == offset_grad.shape, + "invalid offset_grad shape: %s", + megdnn_layout_msg(offset_grad).c_str()); + megdnn_assert(mask.shape == mask_grad.shape, "invalid mask_grad shape: %s", + megdnn_layout_msg(mask_grad).c_str()); + + auto ret = make_canonized_filter_meta(im.ndim, filter, offset); + auto required_workspace_in_bytes = + get_workspace_in_bytes(im, filter, offset, mask, out_grad, im_grad, + offset_grad, mask_grad); + + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/deformable_ps_roi_pooling.cpp b/dnn/src/common/deformable_ps_roi_pooling.cpp new file mode 100644 index 00000000..4107ee64 --- /dev/null +++ b/dnn/src/common/deformable_ps_roi_pooling.cpp @@ -0,0 +1,113 @@ +/** + * \file dnn/src/common/deformable_ps_roi_pooling.cpp + * MegEngine is Licensed 
under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/nn.h" +#include "src/common/utils.h" + +namespace megdnn { + +void DeformablePSROIPoolingBase::deduce_layout_fwd(const TensorLayout& data, + const TensorLayout& rois, + const TensorLayout& trans, + TensorLayout& out_data, + TensorLayout& out_count) { + megdnn_assert_contiguous(data); + megdnn_assert_contiguous(rois); + megdnn_assert_contiguous(trans); + + auto errmsg = [&]() { + return std::string("data: ") + megdnn_layout_msg(data) + + ", rois: " + megdnn_layout_msg(rois) + + ", trans: " + megdnn_layout_msg(trans) + + ", out_data: " + megdnn_layout_msg(out_data) + + ", out_count: " + megdnn_layout_msg(out_count); + }; + + MEGDNN_MARK_USED_VAR(data); + MEGDNN_MARK_USED_VAR(rois); + MEGDNN_MARK_USED_VAR(trans); + MEGDNN_MARK_USED_VAR(out_data); + MEGDNN_MARK_USED_VAR(out_count); + MEGDNN_MARK_USED_VAR(out_count); + MEGDNN_MARK_USED_VAR(errmsg); + + megdnn_assert(data.dtype.enumv() == DTypeEnum::Float32, + "DeformablePSROIPooling only support float32 input"); + megdnn_assert(data.ndim == 4_z, "invalid data shape, %s", errmsg().c_str()); + megdnn_assert(rois.ndim == 2_z && rois[1] == 5, "invalid rois shape, %s", + errmsg().c_str()); + megdnn_assert(trans.ndim == 4_z, "invalid trans shape, %s", + errmsg().c_str()); + + if (!param().no_trans) { + megdnn_assert(trans[1] == 2_z && trans[2] == param().pooled_h && + trans[3] == param().pooled_w, + "invalid trans shape: %s", errmsg().c_str()); + } + + out_data = {{rois[0], data[1], param().pooled_h, param().pooled_w}, + data.dtype}; + out_count = out_data; +} + +void DeformablePSROIPoolingBase::check_layout_fwd(const TensorLayout& data, + const TensorLayout& rois, + const TensorLayout& trans, + const TensorLayout& out_data, + const TensorLayout& out_count, + size_t workspace_in_bytes) { + MEGDNN_MARK_USED_VAR(workspace_in_bytes); + + TensorLayout exp_out_data, exp_out_count; + deduce_layout_fwd(data, rois, trans, exp_out_data, exp_out_count); + + megdnn_assert_eq_layout(out_data, exp_out_data); + megdnn_assert_eq_layout(out_count, exp_out_count); +} + +void DeformablePSROIPoolingForward::deduce_layout(const TensorLayout& data, + const TensorLayout& rois, + const TensorLayout& trans, + TensorLayout& out_data, + TensorLayout& out_count) { + deduce_layout_fwd(data, rois, trans, out_data, out_count); +} + +void DeformablePSROIPoolingForward::check_exec(const TensorLayout& data, + const TensorLayout& rois, + const TensorLayout& trans, + const TensorLayout& out_data, + const TensorLayout& out_count, + size_t workspace_in_bytes) { + check_layout_fwd(data, rois, trans, out_data, out_count, + workspace_in_bytes); + auto required_workspace_in_bytes = + get_workspace_in_bytes(data, rois, trans, out_data, out_count); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void DeformablePSROIPoolingBackward::check_exec( + const TensorLayout& data, const TensorLayout& rois, + const TensorLayout& trans, const TensorLayout& out_diff, + const TensorLayout& out_count, const TensorLayout& data_diff, + const TensorLayout& trans_diff, size_t workspace_in_bytes) { + check_layout_fwd(data_diff, rois, trans_diff, out_diff, out_count, + workspace_in_bytes); + megdnn_assert_eq_layout(data, 
data_diff); + megdnn_assert_eq_layout(trans, trans_diff); + auto required_workspace_in_bytes = get_workspace_in_bytes( + data, rois, trans, out_diff, out_count, data_diff, trans_diff); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/dot.cpp b/dnn/src/common/dot.cpp new file mode 100644 index 00000000..993d50c9 --- /dev/null +++ b/dnn/src/common/dot.cpp @@ -0,0 +1,48 @@ +/** + * \file dnn/src/common/dot.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void DotForward::check_exec(const TensorLayout &A, + const TensorLayout &B, + const TensorLayout &C, + size_t workspace_in_bytes) +{ + auto errmsg = [&]() { + return megdnn_layout_msg(A) + + ", " + megdnn_layout_msg(B) + + ", " + megdnn_layout_msg(C); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert(A.ndim == 1_z && A.stride[0] >= 0, "%s", errmsg().c_str()); + megdnn_assert(B.ndim == 1_z && B.stride[0] >= 0, "%s", errmsg().c_str()); + megdnn_assert(A.shape[0] == B.shape[0], "%s", errmsg().c_str()); + megdnn_assert(C.is_scalar(), "%s", errmsg().c_str()); + + megdnn_assert(A.dtype == B.dtype && A.dtype == C.dtype); + + auto required_workspace_in_bytes = get_workspace_in_bytes(A, B, C); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void DotForward::deduce_layout(const TensorLayout &A, + const TensorLayout &, + TensorLayout &C) +{ + C = TensorLayout(TensorShape{1}, A.dtype); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/dtype.cpp b/dnn/src/common/dtype.cpp new file mode 100644 index 00000000..e0d0860f --- /dev/null +++ b/dnn/src/common/dtype.cpp @@ -0,0 +1,183 @@ +/** + * \file dnn/src/common/dtype.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/dtype.h" +#include "src/common/utils.h" + +#include +#include +#include + +using namespace megdnn; +using namespace dtype; + +#if MEGDNN_DISABLE_FLOAT16 +#pragma message "megdnn float16 disabled" +#endif + +#define IMPL(_name) \ +DType::Trait _name::sm_trait = { \ + DTypeTrait<_name>::name, \ + DTypeTrait<_name>::size_log, DTypeTrait<_name>::low_bit, \ + DTypeEnum::_name, \ + DTypeTrait<_name>::category, DTypeTrait<_name>::signedness, \ + DTypeTrait<_name>::has_param \ +}; +#define TEMPLATED_IMPL(_name) \ + template <> \ + IMPL(_name) + +MEGDNN_FOREACH_DTYPE_NAME(IMPL) +MEGDNN_FOREACH_PARAMETERIZED_DTYPE(TEMPLATED_IMPL) + +#undef TEMPLATED_IMPL +#undef IMPL + +void DType::on_assert_is_failed(const char *rname) const { + megdnn_throw(megdnn_mangle( + ssprintf("attempt to access dtype %s as %s", + name(), rname).c_str())); + MEGDNN_MARK_USED_VAR(rname); +} + +void DType::on_request_lowbit_size() const { + megdnn_throw(megdnn_mangle( + ssprintf("attempt to get size of lowbit dtype %s", name()))); +} + +DType DType::from_enum(DTypeEnum ev) { + switch (ev) { +#define cb(_dt) case DTypeEnum::_dt: return dtype::_dt(); + MEGDNN_FOREACH_DTYPE_NAME(cb) +#undef cb +#define cb(_dt) case DTypeEnum::_dt: + MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) + megdnn_throw(megdnn_mangle( + "cannot construct parameterized DType via DType::from_enum")); +#undef cb + } + megdnn_throw(megdnn_mangle("bad DTypeEnum value")); +} + +template +typename ParameterizedDType::Trait* +ParameterizedDType::make_from_param( + const DTypeParam& param) { + struct Hasher { + std::size_t operator()(const DTypeParam& key) const { + return key.hash(); + } + }; + static std::unordered_map, + std::unique_ptr, Hasher> + entries; + + auto it = entries.find(param); + if (it != entries.end()) { + return it->second.get(); + } + entries[param] = + std::make_unique(SelfType::sm_trait, param); + return entries[param].get(); +} + +// Instantize `make_from_param` for all parameterized DTypes. +#define inst(_name) \ + template _name::Trait* _name::make_from_param(const DTypeParam&); +MEGDNN_FOREACH_PARAMETERIZED_DTYPE(inst) +#undef inst + +DTypeParam::DTypeParamImpl(float scale, uint8_t zero_point) + : scale{scale}, zero_point{zero_point} { + //! As the nan is not equal to any value + megdnn_assert(!std::isnan(scale), "nan number compare is not support"); +} + +inline std::size_t DTypeParam::hash() const { + return std::hash()(scale) ^ std::hash()(zero_point); +} + +inline bool DTypeParam::operator==( + const DTypeParam& rhs) const { + return scale == rhs.scale && zero_point == rhs.zero_point; +} + +DTypeParam::DTypeParamImpl(float scale) : scale{scale} { + //! As the nan is not equal to any value + megdnn_assert(!std::isnan(scale), "nan number compare is not support"); +} + +inline std::size_t DTypeParam::hash() const { + return std::hash()(scale); +} + +inline bool DTypeParam::operator==( + const DTypeParam& rhs) const { + return scale == rhs.scale; +} + +DTypeParam::DTypeParamImpl(float scale) : scale{scale} { + //! As the nan is not equal to any value + megdnn_assert(!std::isnan(scale), "nan number compare is not support"); +} + +inline std::size_t DTypeParam::hash() const { + return std::hash()(scale); +} + +inline bool DTypeParam::operator==( + const DTypeParam& rhs) const { + return scale == rhs.scale; +} + +DTypeParam::DTypeParamImpl(float scale) : scale{scale} { + //! 
As the nan is not equal to any value + megdnn_assert(!std::isnan(scale), "nan number compare is not support"); +} + +inline std::size_t DTypeParam::hash() const { + return std::hash()(scale); +} + +inline bool DTypeParam::operator==( + const DTypeParam& rhs) const { + return scale == rhs.scale; +} + +DTypeParam::DTypeParamImpl(float scale, uint8_t zero_point) + : scale{scale}, zero_point{zero_point} { + //! As the nan is not equal to any value + megdnn_assert(!std::isnan(scale), "nan number compare is not support"); +} + +inline std::size_t DTypeParam::hash() const { + return std::hash()(scale) ^ std::hash()(zero_point); +} + +inline bool DTypeParam::operator==( + const DTypeParam& rhs) const { + return scale == rhs.scale && zero_point == rhs.zero_point; +} + +DTypeParam::DTypeParamImpl(float scale) : scale{scale} { + //! As the nan is not equal to any value + megdnn_assert(!std::isnan(scale), "nan number compare is not support"); +} + +inline std::size_t DTypeParam::hash() const { + return std::hash()(scale); +} + +inline bool DTypeParam::operator==( + const DTypeParam& rhs) const { + return scale == rhs.scale; +} +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/common/elemwise/each_mode.inl b/dnn/src/common/elemwise/each_mode.inl new file mode 100644 index 00000000..52fa48b9 --- /dev/null +++ b/dnn/src/common/elemwise/each_mode.inl @@ -0,0 +1,93 @@ +/** + * \file dnn/src/common/elemwise/each_mode.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_each_mode.py +#define MEGDNN_FOREACH_ELEMWISE_MODE_UNARY_FLOAT(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(EXP, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(EXPM1, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(LOG, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(LOG1P, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SIN, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ROUND, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ERF, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ERFINV, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ERFC, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb) \ + +#define MEGDNN_FOREACH_ELEMWISE_MODE_UNARY_INT(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) \ + +#define MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_FLOAT(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(TRUE_DIV, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(POW, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(LOG_SUM_EXP, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH_GRAD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH_GRAD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb) \ + +#define MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_INT(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SHL, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SHR, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(RMULH, cb) \ + +#define MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_FLOAT(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_MUL_ADD3, cb) \ + +#define MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_INT(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) \ + diff --git a/dnn/src/common/elemwise/erfinv.h 
b/dnn/src/common/elemwise/erfinv.h new file mode 100644 index 00000000..7cf0b565 --- /dev/null +++ b/dnn/src/common/elemwise/erfinv.h @@ -0,0 +1,417 @@ +/** + * Boost Software License - Version 1.0 - August 17th, 2003 + * + * Permission is hereby granted, free of charge, to any person or organization + * obtaining a copy of the software and accompanying documentation covered by + * this license (the "Software") to use, reproduce, display, distribute, + * execute, and transmit the Software, and to prepare derivative works of the + * Software, and to permit third-parties to whom the Software is furnished to + * do so, all subject to the following: + * + * The copyright notices in the Software and this entire statement, including + * the above license grant, this restriction and the following disclaimer, + * must be included in all copies of the Software, in whole or in part, and + * all derivative works of the Software, unless such copies or derivative + * works are solely in the form of machine-executable object code generated by + * a source language processor. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * -------------------------------------------------------------------------- + * \file dnn/src/common/elemwise/erfinv.h + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * -------------------------------------------------------------------------- + */ + +#ifndef __CUDACC__ + +#include + +#include "src/common/utils.h" + +// (C) Copyright John Maddock 2006. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +template +inline U evaluate_polynomial(const T_* poly, U const& z, std::size_t count) +{ + megdnn_assert(count > 0); + U sum = static_cast(poly[count - 1]); + for(int i = static_cast(count) - 2; i >= 0; --i) + { + sum *= z; + sum += static_cast(poly[i]); + } + return sum; +} + +template +inline V evaluate_polynomial(const T(&a)[N], const V& val) +{ + return evaluate_polynomial(a, val, N); +} + +// +// The inverse erf and erfc functions share a common implementation, +// this version is for 80-bit long double's and smaller: +// +inline double erfinv_imp(double p, double q) +{ + using namespace std; + + double result = 0; + + if(p <= 0.5) + { + // + // Evaluate inverse erf using the rational approximation: + // + // x = p(p+10)(Y+R(p)) + // + // Where Y is a constant, and R(p) is optimised for a low + // absolute error compared to |Y|. 
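evaluate_polynomial() above is Horner's rule: the polynomial is evaluated from the highest coefficient down, one multiply-add per term, which is cheap and numerically well behaved for the rational approximations that follow. A standalone sketch of the same recurrence (horner() is a hypothetical helper; assumes n >= 1):

#include <cstddef>

// P(z) = c[0] + c[1]*z + ... + c[n-1]*z^(n-1), evaluated highest term first.
inline double horner(const double* c, std::size_t n, double z) {
    double sum = c[n - 1];
    for (std::size_t i = n - 1; i-- > 0;) {
        sum = sum * z + c[i];
    }
    return sum;
}
// e.g. for c = {1.0, 2.0, 3.0} and z = 0.5:
//   sum = 3.0 -> 3.0*0.5 + 2.0 = 3.5 -> 3.5*0.5 + 1.0 = 2.75,
//   which equals 1.0 + 2.0*0.5 + 3.0*0.25 evaluated directly.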
+ // + // double: Max error found: 2.001849e-18 + // long double: Max error found: 1.017064e-20 + // Maximum Deviation Found (actual error term at infinite precision) 8.030e-21 + // + static const float Y = 0.0891314744949340820313f; + static const double P[] = { + -0.000508781949658280665617, + -0.00836874819741736770379, + 0.0334806625409744615033, + -0.0126926147662974029034, + -0.0365637971411762664006, + 0.0219878681111168899165, + 0.00822687874676915743155, + -0.00538772965071242932965 + }; + static const double Q[] = { + 1.0, + -0.970005043303290640362, + -1.56574558234175846809, + 1.56221558398423026363, + 0.662328840472002992063, + -0.71228902341542847553, + -0.0527396382340099713954, + 0.0795283687341571680018, + -0.00233393759374190016776, + 0.000886216390456424707504 + }; + double g = p * (p + 10); + double r = evaluate_polynomial(P, p) / evaluate_polynomial(Q, p); + result = g * Y + g * r; + } + else if(q >= 0.25) + { + // + // Rational approximation for 0.5 > q >= 0.25 + // + // x = sqrt(-2*log(q)) / (Y + R(q)) + // + // Where Y is a constant, and R(q) is optimised for a low + // absolute error compared to Y. + // + // double : Max error found: 7.403372e-17 + // long double : Max error found: 6.084616e-20 + // Maximum Deviation Found (error term) 4.811e-20 + // + static const float Y = 2.249481201171875f; + static const double P[] = { + -0.202433508355938759655, + 0.105264680699391713268, + 8.37050328343119927838, + 17.6447298408374015486, + -18.8510648058714251895, + -44.6382324441786960818, + 17.445385985570866523, + 21.1294655448340526258, + -3.67192254707729348546 + }; + static const double Q[] = { + 1.0, + 6.24264124854247537712, + 3.9713437953343869095, + -28.6608180499800029974, + -20.1432634680485188801, + 48.5609213108739935468, + 10.8268667355460159008, + -22.6436933413139721736, + 1.72114765761200282724 + }; + double g = sqrt(-2 * log(q)); + double xs = q - 0.25f; + double r = evaluate_polynomial(P, xs) / evaluate_polynomial(Q, xs); + result = g / (Y + r); + } + else + { + // + // For q < 0.25 we have a series of rational approximations all + // of the general form: + // + // let: x = sqrt(-log(q)) + // + // Then the result is given by: + // + // x(Y+R(x-B)) + // + // where Y is a constant, B is the lowest value of x for which + // the approximation is valid, and R(x-B) is optimised for a low + // absolute error compared to Y. + // + // Note that almost all code will really go through the first + // or maybe second approximation. After than we're dealing with very + // small input values indeed: 80 and 128 bit long double's go all the + // way down to ~ 1e-5000 so the "tail" is rather long... 
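A quick host-side way to sanity-check the piecewise approximations described above is to verify the round trip erf(erfinv(z)) ≈ z. A small sketch, using the erfinv() wrapper defined at the end of this header together with std::erf from <cmath>; illustration only:

#include <cmath>
#include <cstdio>

// Prints the round-trip error at a few sample points in (-1, 1).
inline void check_erfinv_roundtrip() {
    const double samples[] = {-0.999, -0.5, -1e-3, 0.0, 1e-3, 0.5, 0.999};
    for (double z : samples) {
        double err = std::fabs(std::erf(erfinv(z)) - z);
        std::printf("z=%+.3f  |erf(erfinv(z)) - z| = %.3e\n", z, err);
    }
}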
+ // + double x = sqrt(-log(q)); + if(x < 3) + { + // Max error found: 1.089051e-20 + static const float Y = 0.807220458984375f; + static const double P[] = { + -0.131102781679951906451, + -0.163794047193317060787, + 0.117030156341995252019, + 0.387079738972604337464, + 0.337785538912035898924, + 0.142869534408157156766, + 0.0290157910005329060432, + 0.00214558995388805277169, + -0.679465575181126350155e-6, + 0.285225331782217055858e-7, + -0.681149956853776992068e-9 + }; + static const double Q[] = { + 1.0, + 3.46625407242567245975, + 5.38168345707006855425, + 4.77846592945843778382, + 2.59301921623620271374, + 0.848854343457902036425, + 0.152264338295331783612, + 0.01105924229346489121 + }; + double xs = x - 1.125f; + double R = evaluate_polynomial(P, xs) / evaluate_polynomial(Q, xs); + result = Y * x + R * x; + } + else if(x < 6) + { + // Max error found: 8.389174e-21 + static const float Y = 0.93995571136474609375f; + static const double P[] = { + -0.0350353787183177984712, + -0.00222426529213447927281, + 0.0185573306514231072324, + 0.00950804701325919603619, + 0.00187123492819559223345, + 0.000157544617424960554631, + 0.460469890584317994083e-5, + -0.230404776911882601748e-9, + 0.266339227425782031962e-11 + }; + static const double Q[] = { + 1.0, + 1.3653349817554063097, + 0.762059164553623404043, + 0.220091105764131249824, + 0.0341589143670947727934, + 0.00263861676657015992959, + 0.764675292302794483503e-4 + }; + double xs = x - 3; + double R = evaluate_polynomial(P, xs) / evaluate_polynomial(Q, xs); + result = Y * x + R * x; + } + else if(x < 18) + { + // Max error found: 1.481312e-19 + static const float Y = 0.98362827301025390625f; + static const double P[] = { + -0.0167431005076633737133, + -0.00112951438745580278863, + 0.00105628862152492910091, + 0.000209386317487588078668, + 0.149624783758342370182e-4, + 0.449696789927706453732e-6, + 0.462596163522878599135e-8, + -0.281128735628831791805e-13, + 0.99055709973310326855e-16 + }; + static const double Q[] = { + 1.0, + 0.591429344886417493481, + 0.138151865749083321638, + 0.0160746087093676504695, + 0.000964011807005165528527, + 0.275335474764726041141e-4, + 0.282243172016108031869e-6 + }; + double xs = x - 6; + double R = evaluate_polynomial(P, xs) / evaluate_polynomial(Q, xs); + result = Y * x + R * x; + } + else if(x < 44) + { + // Max error found: 5.697761e-20 + static const float Y = 0.99714565277099609375f; + static const double P[] = { + -0.0024978212791898131227, + -0.779190719229053954292e-5, + 0.254723037413027451751e-4, + 0.162397777342510920873e-5, + 0.396341011304801168516e-7, + 0.411632831190944208473e-9, + 0.145596286718675035587e-11, + -0.116765012397184275695e-17 + }; + static const double Q[] = { + 1.0, + 0.207123112214422517181, + 0.0169410838120975906478, + 0.000690538265622684595676, + 0.145007359818232637924e-4, + 0.144437756628144157666e-6, + 0.509761276599778486139e-9 + }; + double xs = x - 18; + double R = evaluate_polynomial(P, xs) / evaluate_polynomial(Q, xs); + result = Y * x + R * x; + } + else + { + // Max error found: 1.279746e-20 + static const float Y = 0.99941349029541015625f; + static const double P[] = { + -0.000539042911019078575891, + -0.28398759004727721098e-6, + 0.899465114892291446442e-6, + 0.229345859265920864296e-7, + 0.225561444863500149219e-9, + 0.947846627503022684216e-12, + 0.135880130108924861008e-14, + -0.348890393399948882918e-21 + }; + static const double Q[] = { + 1.0, + 0.0845746234001899436914, + 0.00282092984726264681981, + 0.468292921940894236786e-4, + 
0.399968812193862100054e-6, + 0.161809290887904476097e-8, + 0.231558608310259605225e-11 + }; + double xs = x - 44; + double R = evaluate_polynomial(P, xs) / evaluate_polynomial(Q, xs); + result = Y * x + R * x; + } + } + return result; +} + +inline double erfcinv(double z) +{ + // + // Begin by testing for domain errors, and other special cases: + // + if((z < 0) || (z > 2)) + return NAN; + if(z == 0) + return INFINITY; + if(z == 2) + return -INFINITY; + // + // Normalise the input, so it's in the range [0,1], we will + // negate the result if z is outside that range. This is a simple + // application of the erfc reflection formula: erfc(-z) = 2 - erfc(z) + // + double p, q, s; + if(z > 1) + { + q = 2 - z; + p = 1 - q; + s = -1; + } + else + { + p = 1 - z; + q = z; + s = 1; + } + + // + // And get the result, negating where required: + // + return s * erfinv_imp(p, q); +} + +inline double erfinv(double z) +{ + // + // Begin by testing for domain errors, and other special cases: + // + if((z < -1) || (z > 1)) + return NAN; + if(z == 1) + return INFINITY; + if(z == -1) + return -INFINITY; + if(z == 0) + return 0; + // + // Normalise the input, so it's in the range [0,1], we will + // negate the result if z is outside that range. This is a simple + // application of the erf reflection formula: erf(-z) = -erf(z) + // + double p, q, s; + if(z < 0) + { + p = -z; + q = 1 - p; + s = -1; + } + else + { + p = z; + q = 1 - z; + s = 1; + } + + // + // And get the result, negating where required: + // + return s * erfinv_imp(p, q); +} + +inline float erfcinvf(float z) { + return erfcinv(z); +} + +inline float erfinvf(float z) { + return erfinv(z); +} + +#endif // ifndef __CUDACC__ + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise/kern_defs.cuh b/dnn/src/common/elemwise/kern_defs.cuh new file mode 100644 index 00000000..49bd21de --- /dev/null +++ b/dnn/src/common/elemwise/kern_defs.cuh @@ -0,0 +1,225 @@ +/** + * \file dnn/src/common/elemwise/kern_defs.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/common/opr_param_defs_enumv.cuh" +#include "src/common/elemwise_helper.cuh" +#include "src/common/utils.cuh" +#include "src/common/elemwise/erfinv.h" + +#include "megcore_cdefs.h" +#include "megdnn/dtype.h" + +#include +#include + +#if MEGDNN_CC_HOST +#include +using std::max; +using std::min; +#endif + +#ifndef MEGDNN_ELEMWISE_MODE_ENABLE +#define MEGDNN_ELEMWISE_MODE_ENABLE(_mode, _cb) _cb(_mode) +#define MEGDNN_ELEMWISE_MODE_ENABLE_ALL 1 +#endif + +#if MEGDNN_CC_HOST && !defined(__host__) +#define MEGDNN_HOST_DEVICE_SELF_DEFINE +#define __host__ +#define __device__ +#endif + +namespace megdnn { + + + template + __device__ __host__ inline T log_sum_exp(T x, T y) { + T a, b; + a = x < y ? x : y; + b = x < y ? y : x; + return T(b + log1pf(exp(a - b))); + } + + __device__ __host__ inline float fast_tanh(float x) { + return x * (27.f + x * x) / (27.f + 9.f * x * x); + } + + //! use multiplying (1.f / 6.f) to replace dividing 6.f, because we didn't + //! pass + //! --use_fast_math to nvcc to enable --prec_div optimization, which will + //! 
cause performance drop on Turing architecture + __device__ __host__ inline float fuse_add_hswish(float x, float y) { + float z = x + y; + return z * min(max(z + 3, 0.f), 6.f) * (1.f / 6.f); + } + + __device__ __host__ inline float fast_tanh_grad(float x, float dx) { + float x_pow2 = x * x; + float deno = 3.f + x_pow2; + return ((-48.f * x_pow2) / deno + 27.f + x_pow2) / (deno * 9.f) * dx; + } + +#include "src/common/elemwise/each_mode.inl" + + template + struct ElemwiseKern; + +//! define kernel for a single ctype +#define DEF_KERN(_ctype, _mode, _imp) \ + template \ + struct ElemwiseKern { \ + typedef _ctype ctype; \ + static __host__ __device__ _ctype apply(KERN_SIG) { \ + return ctype(_imp); \ + } \ + } + +//! define kernel for all float types +#define DEF_KERN_FLOAT(_mode, _imp) \ + DEF_KERN(dt_float32, _mode, _imp); \ + MEGDNN_INC_FLOAT16(DEF_KERN(dt_float16, _mode, _imp);) + +//! define kernel for all int types +#define DEF_KERN_INT(_mode, _imp) \ + DEF_KERN(dt_int32, _mode, _imp); \ + DEF_KERN(dt_int16, _mode, _imp); \ + DEF_KERN(dt_int8, _mode, _imp); \ + DEF_KERN(dt_uint8, _mode, _imp); \ + +//! define kernel for all ctypes +#define DEF_KERN_ALL(_mode, _imp) \ + DEF_KERN_INT(_mode, _imp); \ + DEF_KERN_FLOAT(_mode, _imp); \ + + /* ================== unary kernels ================== */ +#define KERN_SIG ctype x + + // int and float + DEF_KERN_ALL(NEGATE, -x); +#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__) + DEF_KERN_INT(RELU, x <= ctype(0) ? ctype(0) : x); + DEF_KERN_FLOAT(RELU, x <= 0.f ? ctype(0) : x); +#else + DEF_KERN_ALL(RELU, x <= ctype(0) ? ctype(0) : x); +#endif + DEF_KERN_INT(ABS, abs(int(x))); + // DEF_KERN_INT(ABS, x > ctype(0) ? x : -x); + DEF_KERN_FLOAT(ABS, fabsf(x)); + + // float only + DEF_KERN_FLOAT(ACOS, acosf(x)); + DEF_KERN_FLOAT(ASIN, asinf(x)); + DEF_KERN_FLOAT(CEIL, ceilf(x)); + DEF_KERN_FLOAT(COS, cosf(x)); + DEF_KERN_FLOAT(EXP, expf(x)); + DEF_KERN_FLOAT(EXPM1, expm1f(x)); + DEF_KERN_FLOAT(FLOOR, floorf(x)); + DEF_KERN_FLOAT(LOG, logf(x)); + DEF_KERN_FLOAT(LOG1P, log1pf(x)); + DEF_KERN_FLOAT(SIGMOID, 1.f / (expf(-x) + 1.f)); + DEF_KERN_FLOAT(SIN, sinf(x)); + DEF_KERN_FLOAT(TANH, tanhf(x)); + DEF_KERN_FLOAT(FAST_TANH, fast_tanh(x)); + DEF_KERN_FLOAT(ROUND, roundf(x)); + DEF_KERN_FLOAT(ERF, erff(x)); + DEF_KERN_FLOAT(ERFINV, erfinvf(x)); + DEF_KERN_FLOAT(ERFC, erfcf(x)); + DEF_KERN_FLOAT(ERFCINV, erfcinvf(x)); + DEF_KERN_FLOAT(H_SWISH, x * min(max(x + 3, 0.f), 6.f) * (1.f / 6.f)); + + // int only + +#undef KERN_SIG + + /* ================== binary kernels ================== */ +#define KERN_SIG ctype x, ctype y + + // int and float +#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__) + DEF_KERN_INT(ABS_GRAD, x > ctype(0) ? y : -y); + DEF_KERN_FLOAT(ABS_GRAD, x > 0.f ? y : -y); +#else + DEF_KERN_ALL(ABS_GRAD, x > ctype(0) ? y : -y); +#endif + DEF_KERN_ALL(ADD, x + y); + DEF_KERN_ALL(MAX, x > y ? x : y); + DEF_KERN_ALL(MIN, x < y ? x : y); + DEF_KERN_ALL(MUL, x* y); + DEF_KERN_INT(RMULH, round_mulh_saturate(x, y)); + DEF_KERN_ALL(SIGMOID_GRAD, x*(ctype(1) - x) * y); + DEF_KERN_ALL(SUB, x - y); +#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__) + DEF_KERN_INT(SWITCH_GT0, x > ctype(0) ? y : ctype(0)); + DEF_KERN_FLOAT(SWITCH_GT0, x > 0.f ? y : ctype(0)); +#else + DEF_KERN_ALL(SWITCH_GT0, x > ctype(0) ? 
y : ctype(0)); +#endif + DEF_KERN_ALL(TANH_GRAD, (ctype(1) - x * x) * y); + DEF_KERN_ALL(LT, x < y); + DEF_KERN_ALL(LEQ, x <= y); + DEF_KERN_ALL(EQ, x == y); + + DEF_KERN_INT(FLOOR_DIV, x / y); + DEF_KERN_FLOAT(FLOOR_DIV, floorf(x / y)); + + DEF_KERN_INT(MOD, x % y); + DEF_KERN_FLOAT(MOD, fmodf(x, y)); + + DEF_KERN_INT(SHL, x << y); + DEF_KERN_INT(SHR, x >> y); +#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__) + DEF_KERN_INT(FUSE_ADD_RELU, (x + y) <= ctype(0) ? ctype(0) : (x + y)); + DEF_KERN_FLOAT(FUSE_ADD_RELU, (x + y) <= 0.f ? ctype(0) : (x + y)); +#else + DEF_KERN_ALL(FUSE_ADD_RELU, + (x + y) <= ctype(0) ? ctype(0) : (x + y)); +#endif + + // float only + DEF_KERN_FLOAT(TRUE_DIV, x / y); + DEF_KERN_FLOAT(POW, powf(x, y)); + DEF_KERN_FLOAT(LOG_SUM_EXP, log_sum_exp(x, y)); + DEF_KERN_FLOAT(FAST_TANH_GRAD, fast_tanh_grad(x, y)); + + DEF_KERN_FLOAT(FUSE_ADD_TANH, tanhf(x+y)); + DEF_KERN_FLOAT(FUSE_ADD_SIGMOID, 1.f / (expf(-(x+y)) + 1.f)); + + DEF_KERN_FLOAT(ATAN2, atan2f(x, y)); + DEF_KERN_FLOAT(H_SWISH_GRAD, + x < -3.f ? 0.f : (x > 3.f ? y : (2.f * x + 3.f) / 6.f * y)); + + DEF_KERN_FLOAT(FUSE_ADD_H_SWISH, fuse_add_hswish(x, y)); +#undef KERN_SIG + + /* ================== ternary kernels ================== */ +#define KERN_SIG ctype x, ctype y, ctype z + + // int and float + DEF_KERN_ALL(COND_LEQ_MOV, x <= y ? z : ctype(0)); + DEF_KERN_ALL(FUSE_MUL_ADD3, x * y + z); + +#undef KERN_SIG + + +#undef DEF_KERN_AD +#undef DEF_KERN + +} // namespace megdnn + +#if MEGDNN_CC_HOST && defined(MEGDNN_HOST_DEVICE_SELF_DEFINE) +#undef MEGDNN_HOST_DEVICE_SELF_DEFINE +#undef __host__ +#undef __device__ +#endif + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise/opr_impl.cpp b/dnn/src/common/elemwise/opr_impl.cpp new file mode 100644 index 00000000..c8d30c26 --- /dev/null +++ b/dnn/src/common/elemwise/opr_impl.cpp @@ -0,0 +1,289 @@ +/** + * \file dnn/src/common/elemwise/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
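The LOG_SUM_EXP kernel relies on the log_sum_exp() helper defined earlier in this header, which rewrites log(exp(x) + exp(y)) as b + log1p(exp(a - b)) with b = max(x, y), so the exponential never receives a positive argument. A standalone host-only sketch contrasting it with the naive form (plain float; not the device kernel itself):

#include <cmath>

inline float naive_lse(float x, float y) {
    // overflows: exp(100.f) exceeds FLT_MAX, so naive_lse(100.f, 100.f) == inf
    return std::log(std::exp(x) + std::exp(y));
}

inline float stable_lse(float x, float y) {
    float a = x < y ? x : y;
    float b = x < y ? y : x;
    // exp(a - b) <= 1, so no overflow; stable_lse(100.f, 100.f) ≈ 100.693f
    return b + std::log1p(std::exp(a - b));
}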
+ */ + +#include "src/common/elemwise/kern_defs.cuh" +#include "src/common/utils.h" + +#include "megdnn/oprs.h" +#include "megdnn/tensor_format.h" + +#include "midout.h" +MIDOUT_DECL(megdnn_common_elemwise) + +#include +#include + +using namespace megdnn; + +namespace { +class FormatDeducer { + const TensorFormat m_default; + TensorFormat m_result = m_default; + +public: + inline void feed(TensorFormat cur); + bool is_default(TensorFormat f) const { return f == m_default; } + TensorFormat get() const { return m_result; } +}; +} // anonymous namespace + +using Mode = param::Elemwise::Mode; +using ModeTrait = ElemwiseForward::ModeTrait; + +const ModeTrait& ModeTrait::from_mode(Mode mode) { + static std::mutex mtx; + static std::vector traits; + + std::lock_guard _lock(mtx); + + if (traits.empty()) { + auto get = [&](Mode m) -> ModeTrait& { + auto im = static_cast(m); + if (im >= traits.size()) + traits.resize(im + 1); + return traits[im]; + }; + +#define cb(_m) \ + MIDOUT_BEGIN(megdnn_common_elemwise, midout_iv(Mode::_m)) { \ + get(Mode::_m).allow_int = true; \ + } \ + MIDOUT_END(); + MEGDNN_FOREACH_ELEMWISE_MODE_UNARY_INT(cb); + MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_INT(cb); + MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_INT(cb); +#undef cb + +#define cb(_m) \ + MIDOUT_BEGIN(megdnn_common_elemwise, midout_iv(Mode::_m)) { \ + get(Mode::_m).allow_float = true; \ + } \ + MIDOUT_END(); + MEGDNN_FOREACH_ELEMWISE_MODE_UNARY_FLOAT(cb); + MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_FLOAT(cb); + MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_FLOAT(cb); +#undef cb + +#define cb(_m) \ + MIDOUT_BEGIN(megdnn_common_elemwise, midout_iv(Mode::_m)) { \ + auto&& t = get(Mode::_m); \ + t.arity = _a; \ + t.name = megdnn_mangle(#_m); \ + } \ + MIDOUT_END(); +#define _a 1 + MEGDNN_FOREACH_ELEMWISE_MODE_UNARY_FLOAT(cb); + MEGDNN_FOREACH_ELEMWISE_MODE_UNARY_INT(cb); +#undef _a +#define _a 2 + MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_FLOAT(cb); + MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_INT(cb); +#undef _a +#define _a 3 + MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_FLOAT(cb); + MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_INT(cb); +#undef _a +#undef cb + +#define FUSE(_m, _arity) \ + MIDOUT_BEGIN(megdnn_common_elemwise, midout_iv(Mode::_m)) { \ + auto&& t = get(Mode::_m); \ + t.allow_int = true; \ + t.allow_float = true; \ + t.arity = _arity; \ + t.name = megdnn_mangle(#_m); \ + } \ + MIDOUT_END(); + FUSE(FUSE_MUL_ADD3, 3); + FUSE(FUSE_MUL_ADD4, 4); +#undef FUSE + +#define COMM_CB(_m) \ + MIDOUT_BEGIN(megdnn_common_elemwise, midout_iv(Mode::_m)) { \ + traits.at(static_cast(Mode::_m)).commutable = true; \ + } \ + MIDOUT_END() +#define COMM(_m) MEGDNN_ELEMWISE_MODE_ENABLE(_m, COMM_CB) + + COMM(ADD); + COMM(FUSE_ADD_RELU); + COMM(FUSE_ADD_SIGMOID); + COMM(FUSE_ADD_TANH); + COMM(MUL); + COMM(RMULH); + COMM(MAX); + COMM(MIN); + COMM(EQ); + COMM(LOG_SUM_EXP); + +#undef COMM +#undef COMM_CB + +#if MEGDNN_ELEMWISE_MODE_ENABLE_ALL + for (auto&& i : traits) { + megdnn_assert(i.arity && (i.allow_int || i.allow_float) && + (!i.commutable || i.arity == 2)); + } +#else +#pragma message "elemwise mode stripped" +#endif + } + + auto&& ret = traits.at(static_cast(mode)); +#if !MEGDNN_ELEMWISE_MODE_ENABLE_ALL + megdnn_assert(ret.arity); +#endif + return ret; +} + +void ElemwiseForward::deduce_shape(const TensorShapeArray& src, + TensorShape& dst) { + auto err = [&]() { + std::string msg( + megdnn_mangle("bad input shape for polyadic operator: ")); + bool first = true; + for (auto&& i : src) { + if (first) + first = false; + else + msg.append(megdnn_mangle(", ")); + 
msg.append(i.to_string()); + } + megdnn_throw(msg); + }; + + dst.ndim = 0; + for (auto&& cur : src) { + if (!cur.ndim) + err(); + if (!dst.ndim || dst.is_scalar()) + dst = cur; + else if (!cur.is_scalar()) { + int max_ndim = std::max(cur.ndim, dst.ndim); + for (int i = 0; i < max_ndim; ++i) { + int cur_idx = cur.ndim - i - 1; + int dst_idx = dst.ndim - i - 1; + if (cur_idx >= 0 && dst_idx >= 0) { + size_t v0 = dst.shape[dst_idx], v1 = cur.shape[cur_idx]; + if (v0 != v1) { + if (v0 != 1 && v1 != 1) + err(); + } + int final_idx = std::max(cur_idx, dst_idx); + dst.shape[final_idx] = std::max(v0, v1); + } else { + if (dst_idx < 0) { + dst.shape[cur_idx] = cur.shape[cur_idx]; + } + } + } + dst.ndim = max_ndim; + } + } +} + +void FormatDeducer::feed(TensorFormat cur) { + // only one kind of non-default format can exist; and in such case the + // layouts with default format must be scalar (checked in deduce_layout) + if (cur == m_default) + return; + + if (m_result == m_default) { + m_result = cur; + } else { + megdnn_assert(m_result == cur, + "different input layout formats in elemwise: %s vs %s", + m_result.impl()->to_string().c_str(), + cur.impl()->to_string().c_str()); + } +} + +void ElemwiseForward::deduce_format(const TensorFormatArray& src, + TensorFormat& dst) { + FormatDeducer d; + for (auto i : src) { + d.feed(i); + } + dst = d.get(); +} + +void ElemwiseForward::deduce_layout(const TensorLayoutArray& src, + TensorLayout& dst) { + megdnn_assert(src.size() == mode_trait().arity); + DType dtype; + FormatDeducer format_deducer; + for (auto&& i : src) { + if (!dtype.valid()) { + dtype = i.dtype; + dst.format = i.format; + } else { + megdnn_assert(dtype == i.dtype, + "input dtype not unique: get %s and %s", dtype.name(), + i.dtype.name()); + } + + format_deducer.feed(i.format); + } + dst.format = format_deducer.get(); + if (!format_deducer.is_default(dst.format)) { + for (auto&& i : src) { + if (format_deducer.is_default(i.format)) { + megdnn_assert( + i.collapse_contiguous().is_scalar(), + "default format can only be used on scalar, got %s", + i.to_string().c_str()); + } + } + } + + check_dtype(dtype); + TensorShapeArray src_shp; + for (auto&& i : src) + src_shp.push_back(i); + deduce_shape(src_shp, dst); + dst.dtype = dtype; + dst.init_contiguous_stride(); +} + +void ElemwiseForward::check_layout_and_broadcast( + const TensorLayoutPtrArray& src, const TensorLayout& dst) { + megdnn_assert(src.size() == mode_trait().arity); + DType dtype; + for (auto i : src) { + if (!dtype.valid()) { + dtype = i->dtype; + } else { + megdnn_assert(dtype == i->dtype); + } + *i = i->broadcast(dst); + } + check_dtype(dtype); + megdnn_assert(dtype == dst.dtype && dst.is_contiguous()); +} + +void ElemwiseForward::check_dtype(DType dtype) { + megdnn_assert(dtype.valid()); + auto&& trait = mode_trait(); + switch (dtype.category()) { + case DTypeCategory::FLOAT: + megdnn_assert(trait.allow_float, "unsupport mode %s for float\n", + trait.name); + break; + case DTypeCategory::INT: + megdnn_assert(trait.allow_int, "unsupport mode %s for int\n", + trait.name); + break; + default: + megdnn_throw("bad dtype"); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise/opr_impl_body.inl b/dnn/src/common/elemwise/opr_impl_body.inl new file mode 100644 index 00000000..7cbcaaa6 --- /dev/null +++ b/dnn/src/common/elemwise/opr_impl_body.inl @@ -0,0 +1,107 @@ +/** + * \file dnn/src/common/elemwise/opr_impl_body.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 
2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#ifndef on_arity_dispatched_cb_dtype +#error "on_arity_dispatched_cb_dtype and IMPL_MODE_DISPATCHER must be defined" +#endif + +template +void ElemwiseForwardImpl::on_arity_dispatched() { + auto src = make_elemwise_op_param(); + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(on_arity_dispatched_cb_dtype) + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(on_arity_dispatched_cb_dtype) + megdnn_throw("bad dtype"); +} + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_UNARY_INT +IMPL_MODE_DISPATCHER(1, DTypeCategory::INT); +#undef FOREACH + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_INT +IMPL_MODE_DISPATCHER(2, DTypeCategory::INT); +#undef FOREACH + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_INT +IMPL_MODE_DISPATCHER(3, DTypeCategory::INT); +#undef FOREACH + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_UNARY_FLOAT +IMPL_MODE_DISPATCHER(1, DTypeCategory::FLOAT); +#undef FOREACH + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_FLOAT +IMPL_MODE_DISPATCHER(2, DTypeCategory::FLOAT); +#undef FOREACH + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_FLOAT +IMPL_MODE_DISPATCHER(3, DTypeCategory::FLOAT); +#undef FOREACH + +void ElemwiseForwardImpl::exec( + const TensorNDArray &src, + _megdnn_tensor_out dst) { + m_src = &src; + m_dst = &dst; + +#define CB_CHK_MODE_ENABLE(_) 1 + if (m_param.mode == Mode::FUSE_MUL_ADD3) { +#if MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_MUL_ADD3, CB_CHK_MODE_ENABLE) +0 + ElemwiseOpParamN<3> param; + bool c_is_scalar; + prepare_fma3(param, c_is_scalar); + switch(m_dst->layout.dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + { \ + using ctype = DTypeTrait<_dt>::ctype; \ + if (c_is_scalar) { \ + return impl_fuse_mul_add3(param); \ + } else { \ + return impl_fuse_mul_add3(param); \ + } \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + default: + megdnn_throw("bad dtype"); + } +#endif // enable FUSE_MUL_ADD3 + } else if (m_param.mode == Mode::FUSE_MUL_ADD4) { +#if MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_MUL_ADD4, CB_CHK_MODE_ENABLE) +0 + ElemwiseOpParamN<4> param; + prepare_fma4(param); + + switch(m_dst->layout.dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return impl_fuse_mul_add4::ctype>(param); + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + default: + megdnn_throw("bad dtype"); + } +#endif // enable FUSE_MUL_ADD4 + } + +#undef CB_CHK_MODE_ENABLE + + switch(src.size()) { +#define D(_n) case _n: return on_arity_dispatched<_n>() + D(1); + D(2); + D(3); +#undef D + default: + megdnn_throw("bad size of input tensors"); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise/opr_impl_class_def.inl b/dnn/src/common/elemwise/opr_impl_class_def.inl new file mode 100644 index 00000000..cab89521 --- /dev/null +++ b/dnn/src/common/elemwise/opr_impl_class_def.inl @@ -0,0 +1,40 @@ +/** + * \file dnn/src/common/elemwise/opr_impl_class_def.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
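The dispatch code above (and in opr_impl.cpp) consumes the MEGDNN_FOREACH_ELEMWISE_MODE_* lists from each_mode.inl as X-macros: a local cb(_mode) macro is defined, the FOREACH macro stamps it out once per enabled mode, and cb is undefined again. A sketch of the pattern, assuming kern_defs.cuh (for the default MEGDNN_ELEMWISE_MODE_ENABLE) and each_mode.inl are in scope; mode_name_sketch is a hypothetical helper, not MegDNN API:

static const char* mode_name_sketch(megdnn::param::Elemwise::Mode mode) {
    using Mode = megdnn::param::Elemwise::Mode;
    switch (mode) {
#define cb(_m)     \
    case Mode::_m: \
        return #_m;
        // expands to: case Mode::ABS_GRAD: return "ABS_GRAD"; case Mode::ADD: ...
        MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_FLOAT(cb)
#undef cb
        default:
            return "unknown";
    }
}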
+ */ + + protected: + template + void on_arity_dispatched(); + + template + struct ModeDispatcher; + + /*! + * \brief special impl for FUSE_MUL_ADD3 mode + * \tparam c_is_scalar see ElemwiseForwardImplHelper::prepare_fma3 + */ + template + void impl_fuse_mul_add3(const ElemwiseOpParamN<3> ¶ms); + + /*! + * \brief special impl for FUSE_MUL_ADD4 mode + * \param[out] params see ElemwiseForwardImplHelper::prepare_fma4 + */ + template + void impl_fuse_mul_add4(const ElemwiseOpParamN<4> ¶ms); + + public: + using ElemwiseForwardImplHelper::ElemwiseForwardImplHelper; + + void exec( + const TensorNDArray &src, + _megdnn_tensor_out dst) override; + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise/opr_impl_helper.cpp b/dnn/src/common/elemwise/opr_impl_helper.cpp new file mode 100644 index 00000000..04a9de1f --- /dev/null +++ b/dnn/src/common/elemwise/opr_impl_helper.cpp @@ -0,0 +1,162 @@ +/** + * \file dnn/src/common/elemwise/opr_impl_helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./opr_impl_helper.h" +#include "src/common/utils.h" + +using namespace megdnn; + +template +ElemwiseOpParamN ElemwiseLayoutHelper::make_elemwise_op_param( + void* opr, + void (*check_layout_and_broadcast)(void*, const TensorLayoutPtrArray&, + const TensorLayout&), + const TensorNDArray& src, const TensorND& dst) { + megdnn_assert(src.size() == static_cast(arity)); + ElemwiseOpParamN ret; + TensorLayoutPtrArray src_layouts(arity); + for (int i = 0; i < arity; ++i) { + ret.param[i] = src[i]; + src_layouts[i] = &ret.param[i].layout; + } + check_layout_and_broadcast(opr, src_layouts, dst.layout); + ret.init_from_given_tensor(); + return ret; +} + +// explicit instantiation so subclasses can call this method +#define INST(n) \ + template ElemwiseOpParamN \ + ElemwiseLayoutHelper::make_elemwise_op_param( \ + void*, \ + void (*)(void*, const TensorLayoutPtrArray&, const TensorLayout&), \ + const TensorNDArray&, const TensorND&) +INST(1); +INST(2); +INST(3); +INST(4); +INST(5); +INST(6); +#undef INST + +void ElemwiseForwardImplHelper::prepare_fma3(ElemwiseOpParamN<3>& param, + bool& c_is_scalar) { + c_is_scalar = is_broadcasted_scalar(m_src->at(2).layout); + param = make_elemwise_op_param<3>(); + + if (!c_is_scalar && !param[2].layout.eq_layout(param[0].layout)) { + megdnn_assert_eq_layout(param[2].layout, param[1].layout); + std::swap(param[0], param[1]); + } + if (c_is_scalar && param[2].layout.eq_layout(param[0].layout)) { + std::swap(param[0], param[1]); + } +} + +void ElemwiseForwardImplHelper::prepare_fma4(ElemwiseOpParamN<4>& param) { + param = make_elemwise_op_param<4>(); + if (!param[0].layout.eq_layout(param[2].layout)) + std::swap(param[0], param[1]); + + megdnn_assert_eq_layout(param[0].layout, param[2].layout); + megdnn_assert_eq_layout(param[1].layout, param[3].layout); +} + +bool ElemwiseLayoutHelper::is_broadcasted_scalar(const TensorLayout& layout) { + if (layout.format.type() != TensorFormat::Type::DEFAULT) + return false; + for (size_t i = 0; i < layout.ndim; ++i) { + if (layout.shape[i] != 1 && layout.stride[i] != 0) + return false; + } + return true; +} +bool ElemwiseLayoutHelper::is_broadcastedx_channel_like( + const TensorLayout& layout, 
BroadcastChannelInfo& info) { + if (layout.format.type() == TensorFormat::Type::DEFAULT && + layout.ndim == 3 && layout.stride[0] == 8 && layout.stride[1] == 0 && + layout.stride[2] == 1) { + info.x = layout.shape[0]; + info.y = layout.shape[1]; + info.z = layout.shape[2]; + return true; + } else if (layout.format.type() == TensorFormat::Type::DEFAULT && + layout.ndim == 4 && layout.stride[0] == 0 && + layout.stride[1] == 8 && layout.stride[2] == 0 && + layout.stride[3] == 1) { + info.x = layout.shape[1]; + info.y = layout.shape[2]; + info.z = layout.shape[3]; + return true; + } + return false; +} + +bool ElemwiseLayoutHelper::is_broadcasted_channel_like( + const TensorLayout& layout, BroadcastChannelInfo& info) { + if (layout.format.type() == TensorFormat::Type::DEFAULT) { + if (layout.ndim == 3 && layout.stride[0] == 0 && + layout.stride[2] == 0 && layout.stride[1] == 1) { + info.x = layout.shape[0]; + info.y = layout.shape[1]; + info.z = layout.shape[2]; + return true; + } else if (layout.ndim == 2 && layout.stride[1] == 0 && + layout.stride[0] == 1) { + info.x = 1; + info.y = layout.shape[0]; + info.z = layout.shape[1]; + return true; + } + } else { + if (Image2DPack4TensorFormat::is_valid_image(layout)) { + auto align_axis = layout.format.as_impl() + .align_axis(); + if (layout.ndim == 4 && align_axis == 1 && + (layout.stride[0] == 0 || layout.shape[0] == 1) && + layout.stride[1] == 4 && layout.stride[2] == 0 && + layout.stride[3] == 1) { + info.x = 1; + info.y = 1; + info.z = layout.shape[2]; + return true; + } else if (layout.ndim == 3 && align_axis == 1 && + (layout.stride[0] == 0 || layout.shape[0] == 1) && + layout.stride[1] == 0 && layout.shape[2] == 4 && + layout.stride[2] == 1) { + //! [1, 1, 1, 1, 4] + [N, H, 1, W, 4] + info.x = 1; + info.y = 1; + info.z = layout.shape[1]; + return true; + } + return false; + } + } + return false; +} + +bool ElemwiseLayoutHelper::is_broadcasted_1x(const TensorLayout& layout, + Broadcast1xInfo& binfo) { + if (layout.ndim == 2 && layout.stride[0] == 0 && layout.stride[1] == 1) { + binfo.x = layout[0]; + binfo.y = layout[1]; + return true; + } + if (layout.ndim == 1 && layout.stride[0] == 1) { + binfo.x = 1; + binfo.y = layout[0]; + return true; + } + return false; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise/opr_impl_helper.h b/dnn/src/common/elemwise/opr_impl_helper.h new file mode 100644 index 00000000..eb31983f --- /dev/null +++ b/dnn/src/common/elemwise/opr_impl_helper.h @@ -0,0 +1,138 @@ +/** + * \file dnn/src/common/elemwise/opr_impl_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/oprs/general.h" +#include "megdnn/tensor_format.h" + +#include "src/common/elemwise_helper.cuh" +#include "src/common/utils.h" + +namespace megdnn { +class ElemwiseLayoutHelper { +public: + //! describe broadcasted [1, y, 1] to [x, y, z] + struct BroadcastChannelInfo { + size_t x, y, z; + + bool operator==(const BroadcastChannelInfo& rhs) const { + return x == rhs.x && y == rhs.y && z == rhs.z; + } + }; + + //! 
describe broadcasted [1, y] to [x, y] + struct Broadcast1xInfo { + size_t x, y; + + bool operator==(const Broadcast1xInfo& rhs) const { + return x == rhs.x && y == rhs.y; + } + }; + + /*! + * \brief check layout and get canonized op param + * \param opr operator pointer + * \param check_layout_and_broadcast function pointer to implement + * check_layout_and_broadcast(); operator pointer would be passed + * to it + */ + template + static ElemwiseOpParamN make_elemwise_op_param( + void* opr, + void (*check_layout_and_broadcast)(void*, + const TensorLayoutPtrArray&, + const TensorLayout&), + const TensorNDArray& src, const TensorND& dst); + + //! check whether given layout is 1D contig + static bool is_vector(const TensorLayout& layout) { + if (layout.format.type() != TensorFormat::Type::DEFAULT) { + return layout.is_contiguous(); + } + return layout.ndim == 1 && layout.stride[0] == 1; + } + + /*! + * \brief check whether it is compatible with (1, x) broadcasted into (y, x) + * + * Note: input can be one-dimensional. + */ + static bool is_broadcasted_1x(const TensorLayout& layout, + Broadcast1xInfo& binfo); + + //! check whether given layout is broadcasted scalar + static bool is_broadcasted_scalar(const TensorLayout& layout); + + /*! + * \brief check whether layout matches BroadcastChannelInfo + * + * Note that Input can also be 2-dimensional, and must be [y, 1] broadacsted + * into [y, z]; in such case x would be set to 1. + */ + static bool is_broadcasted_channel_like(const TensorLayout& layout, + BroadcastChannelInfo& info); + + /*! + * \brief check whether layout matches BroadcastChannelInfo + * + * Note that Input can also be 3-dimensional, and must be [x, 1, z] + * broadacsted into [x, y, z] + */ + static bool is_broadcastedx_channel_like(const TensorLayout& layout, + BroadcastChannelInfo& info); +}; + +class ElemwiseForwardImplHelper : public ElemwiseForward, + protected ElemwiseLayoutHelper { + static void call_check_layout_and_broadcast(void* opr, + const TensorLayoutPtrArray& src, + const TensorLayout& dst) { + return static_cast(opr) + ->check_layout_and_broadcast(src, dst); + } + +protected: + const TensorNDArray* m_src = nullptr; + const TensorND* m_dst = nullptr; + + /*! + * \brief check layout and get canonized op param + * + * Require that m_src and m_dst have been setup + */ + template + ElemwiseOpParamN make_elemwise_op_param() { + return ElemwiseLayoutHelper::make_elemwise_op_param( + this, call_check_layout_and_broadcast, *m_src, *m_dst); + } + + /*! + * \brief canonize params for FMA3 + * \param[out] c_is_scalar if true, params[2] has same layout as + * params[0]; otherwise params[2] is scalar + */ + void prepare_fma3(ElemwiseOpParamN<3>& param, bool& c_is_scalar); + + /*! + * \brief canonize params for FMA4 + * \param[out] guaranteed that params[2] has same layout as + * params[0], and params[3] same with params[1]. + */ + void prepare_fma4(ElemwiseOpParamN<4>& param); + +public: + using ElemwiseForward::ElemwiseForward; +}; + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise_helper.cpp b/dnn/src/common/elemwise_helper.cpp new file mode 100644 index 00000000..6e57d2bb --- /dev/null +++ b/dnn/src/common/elemwise_helper.cpp @@ -0,0 +1,52 @@ +/** + * \file dnn/src/common/elemwise_helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
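The predicates above describe the stride patterns produced by TensorLayout::broadcast(). For example, a per-channel operand stored as [1, C, 1] and broadcast against an [N, C, H*W] operand ends up with strides {0, 1, 0}, which is exactly what is_broadcasted_channel_like() looks for, while a fully broadcast scalar ends up with all strides zero, matching is_broadcasted_scalar(). A sketch, assuming the megdnn host headers are available; the expected values follow from the stride rules above:

#include "megdnn/basic_types.h"
#include "megdnn/dtype.h"
#include "src/common/elemwise/opr_impl_helper.h"

void broadcast_predicate_sketch() {
    using namespace megdnn;
    TensorLayout bias(TensorShape({1, 16, 1}), dtype::Float32());
    TensorLayout out(TensorShape({8, 16, 1024}), dtype::Float32());

    // shape {8, 16, 1024}, stride {0, 1, 0}: dims that were 1 get stride 0
    TensorLayout bcast = bias.broadcast(out);

    ElemwiseLayoutHelper::BroadcastChannelInfo info;
    bool channel_like =
            ElemwiseLayoutHelper::is_broadcasted_channel_like(bcast, info);
    // expected: channel_like == true, info = {x: 8, y: 16, z: 1024}
    (void)channel_like;
}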
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/elemwise_helper.cuh" +#include "src/common/utils.h" + +namespace megdnn { + + template + void ElemwiseOpParamN::init_from_given_tensor() { + megdnn_assert(!size && max_ndim == -1); + max_ndim = 0; + for (int i = 0; i < arity; ++ i) { + TensorLayout &layout = param[i].layout; + layout = layout.collapse_contiguous(); + auto cur = layout.total_nr_elems(); + if (!i) { + size = cur; + } else { + megdnn_assert(size == cur); + } + max_ndim = std::max(max_ndim, layout.ndim); + } + megdnn_assert(size > 0 && max_ndim > 0); + } + + template + void ElemwiseOpParamN::assert_initialized() const { + megdnn_assert(size, "uninitialized ElemwiseOpParamN"); + } + + template struct ElemwiseOpParamN<6>; + template struct ElemwiseOpParamN<5>; + template struct ElemwiseOpParamN<4>; + template struct ElemwiseOpParamN<3>; + template struct ElemwiseOpParamN<2>; + template struct ElemwiseOpParamN<1>; + + void ElemwiseOpParamN<0>::assert_initialized() const { + megdnn_assert(size, "uninitialized ElemwiseOpParamN"); + } +} + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise_helper.cuh b/dnn/src/common/elemwise_helper.cuh new file mode 100644 index 00000000..3b1a5668 --- /dev/null +++ b/dnn/src/common/elemwise_helper.cuh @@ -0,0 +1,116 @@ +/** + * \file dnn/src/common/elemwise_helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/basic_types.h" + +namespace { + +template +struct MulType {}; +template<> struct MulType { typedef int16_t type; }; +template<> struct MulType { typedef int32_t type; }; +template<> struct MulType { typedef int64_t type; }; +template<> struct MulType { typedef uint16_t type; }; + +} // namespace + +namespace megdnn { + +/*! + * \brief packed param for elemwise operators + * \tparam arity number of operands for this operator + */ +template +struct ElemwiseOpParamN { + int max_ndim; //!< max ndim of all params + size_t size; //!< total number of elements (i.e. size of each param) + + TensorND param[arity]; + + ElemwiseOpParamN(): + max_ndim(-1), size(0) + {} + + const TensorND& operator [](int idx) const { + return param[idx]; + } + + TensorND& operator [](int idx) { + return param[idx]; + } + + /*! + * \brief initialize from current *param* + * + * *size* and *max_ndim* would be computed; params would be collapsed + * + * Each param must have the same number of elements. + */ + void init_from_given_tensor(); + + void assert_initialized() const; +}; + +/*! + * \brief for elemwise opr without tensor arguments (i.e. 
only need index input) + */ +template<> +struct ElemwiseOpParamN<0> { + size_t size; //!< total number of elements + + ElemwiseOpParamN(size_t s = 0): + size(s) + { + } + + void assert_initialized() const; +}; + +template +MEGDNN_DEVICE MEGDNN_HOST inline T rounding_shift_right_away_from_zero(T x, + int k) { + T mask = (T(1) << k) - 1; + T threshold = (mask >> 1) + (x < 0); + return (x >> k) + ((x & mask) > threshold); +} + +template +MEGDNN_DEVICE MEGDNN_HOST inline T rounding_shift_right_upward(T x, int k) { + T mask = (T(1) << k) - 1; + T threshold = mask >> 1; + return (x >> k) + ((x & mask) > threshold); +} + +template +MEGDNN_DEVICE MEGDNN_HOST inline T round_mulh_saturate(T a, T b) { + MEGDNN_STATIC_ASSERT(std::numeric_limits::digits <= 32, + "Portable RMULH is not supported for integer " + "types larger than 32 bits."); + MEGDNN_STATIC_ASSERT(std::numeric_limits::is_integer, + "Input types should be integer for RMULH"); + bool overflow = a == b && a == DTypeTrait::min(); + // TODO: This really should be + // rounding_shift_right_away_from_zero, but we haven't yet found a fast way + // to implement it on ARM NEON. For now, we just try to align with NEON's + // VQRDMULH and hope that it does not harm our NN badly. + return overflow ? DTypeTrait::max() + : static_cast(rounding_shift_right_upward( + typename MulType::type(a) * + typename MulType::type(b), + std::numeric_limits::digits)); +} + +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/common/elemwise_multi_type/kern_defs.cuh b/dnn/src/common/elemwise_multi_type/kern_defs.cuh new file mode 100644 index 00000000..5527c602 --- /dev/null +++ b/dnn/src/common/elemwise_multi_type/kern_defs.cuh @@ -0,0 +1,46 @@ +/** + * \file dnn/src/common/elemwise_multi_type/kern_defs.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/dtype.h" +#include "src/common/utils.cuh" +#include "src/common/elemwise_helper.cuh" + +#include + +namespace megdnn { +namespace elemwise_multi_type { + +template +struct Fma3iXxf32xf32xiYOp { + MEGDNN_HOST MEGDNN_DEVICE dtype operator()(stype x, float k, float b) { + const float MIN = static_cast(DTypeTrait::min()); + const float MAX = static_cast(DTypeTrait::max()); + float fv = rint(k * static_cast(x) + b); + return static_cast(fv >= MIN ? (fv <= MAX ? fv : MAX) : MIN); + } +}; + +template +MEGDNN_HOST MEGDNN_DEVICE dtype round_shr_saturate(stype x, int k) { + stype result = rounding_shift_right_away_from_zero(x, k); + if (!is_same::value) { + result = std::min(result, std::numeric_limits::max()); + result = std::max(result, std::numeric_limits::min()); + } + return static_cast(result); +} + +} // namespace elemwise_multi_type +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise_multi_type/opr_impl.cpp b/dnn/src/common/elemwise_multi_type/opr_impl.cpp new file mode 100644 index 00000000..edc749d8 --- /dev/null +++ b/dnn/src/common/elemwise_multi_type/opr_impl.cpp @@ -0,0 +1,261 @@ +/** + * \file dnn/src/common/elemwise_multi_type/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
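round_mulh_saturate() above computes the rounded high half of the Q15 product for 16-bit inputs: (int32(a) * int32(b)) rounding-shifted right by 15 bits, saturating the single overflow case a == b == -32768, where the exact answer (+1.0 in Q15) is not representable. A host-side worked example, assuming the usual megdnn include set; illustration only:

#include <cstdint>
#include <cstdio>

#include "src/common/elemwise_helper.cuh"

inline void rmulh_q15_example() {
    using megdnn::round_mulh_saturate;
    // 16384 is 0.5 in Q15: 0.5 * 0.5 = 0.25 -> 8192
    //   (16384 * 16384) = 268435456; 268435456 >> 15 = 8192, no rounding needed
    std::printf("%d\n", int(round_mulh_saturate<int16_t>(16384, 16384)));
    // -1.0 * -1.0 = +1.0 is out of range for Q15, so the result saturates
    std::printf("%d\n", int(round_mulh_saturate<int16_t>(INT16_MIN, INT16_MIN)));  // 32767
}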
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +#include "midout.h" +MIDOUT_DECL(megdnn_common_elemwise_multi_type) + +using namespace megdnn; + +using Mode = ElemwiseMultiType::Mode; +using ModeTrait = ElemwiseMultiType::ModeTrait; + +namespace { +void check_dtype(const ModeTrait& trait, size_t i, const TensorLayout& src) { + trait.check_inp[i](src.dtype); +} +} // anonymous namespace + +const ModeTrait& ModeTrait::from_mode(Mode mode) { + static std::mutex mtx; + static std::vector traits; + + std::lock_guard _lock(mtx); + + auto make_check_dtype_func = [](DType expected) { + auto func = [expected](DType dtype) { + megdnn_assert(expected.enumv() == dtype.enumv(), + "expected %s, but got %s", expected.name(), + dtype.name()); + }; + return func; + }; + + auto make_check_category = [](DTypeCategory expected) { + auto func = [expected](DType dtype) { + megdnn_assert(expected == dtype.category()); + }; + return func; + }; + + auto make_out_dtype_func = [](DType expected) { + auto func = [expected](DType& dtype, bool check) { + if (check) { + megdnn_assert(expected.enumv() == dtype.enumv(), + "expected %s, but got %s", expected.name(), + dtype.name()); + } else { + dtype = expected; + } + }; + return func; + }; + + auto make_out_category_func = [](DTypeCategory expected) { + auto func = [expected](DType& dtype, bool) { + megdnn_assert(expected == dtype.category()); + }; + return func; + }; + + if (traits.empty()) { + traits.resize(Param::MODE_NR_MEMBER); + auto init_fma3_int16x32x32x32 = [&](ModeTrait& dst, const char* name) { + dst.arity = 3; + dst.check_inp[0] = make_check_dtype_func(dtype::Int16()); + dst.check_inp[1] = make_check_dtype_func(dtype::Int32()); + dst.check_inp[2] = make_check_dtype_func(dtype::Int32()); + dst.check_out = make_out_dtype_func(dtype::Int32()); + dst.name = name; + }; + auto init_fma3_iXxf32xf32xi8 = [&](ModeTrait& dst, const char* name) { + dst.arity = 3; + dst.check_inp[0] = make_check_category(DTypeCategory::INT); + dst.check_inp[1] = make_check_dtype_func(dtype::Float32()); + dst.check_inp[2] = make_check_dtype_func(dtype::Float32()); + dst.check_out = make_out_dtype_func(dtype::Int8()); + dst.name = name; + }; + auto init_rshrs_iXxi8xi8 = [&](ModeTrait& dst, const char* name) { + dst.arity = 2; + dst.check_inp[0] = make_check_category(DTypeCategory::INT); + dst.check_inp[1] = make_check_dtype_func(dtype::Int8()); + dst.check_out = make_out_dtype_func(dtype::Int8()); + dst.name = name; + }; + auto init_fuse_add_rmulh_rshr_int16x16x16x8 = [&](ModeTrait& dst, + const char* name) { + // TODO: This is stupid, we should parameterize shift + // offset, minv and maxv. 
+ dst.arity = 6; + + dst.check_inp[0] = make_check_dtype_func(dtype::Int16()); + dst.check_inp[1] = make_check_dtype_func(dtype::Int16()); + dst.check_inp[2] = make_check_dtype_func(dtype::Int16()); + dst.check_inp[3] = make_check_dtype_func(dtype::Int8()); + dst.check_inp[4] = make_check_dtype_func(dtype::Int8()); + dst.check_inp[5] = make_check_dtype_func(dtype::Int8()); + dst.check_out = make_out_dtype_func(dtype::Int8()); + dst.name = name; + }; + auto init_fuse_add_rmulh_rshr_int32x32x32x8 = [&](ModeTrait& dst, + const char* name) { + dst.arity = 6; + dst.check_inp[0] = make_check_dtype_func(dtype::Int32()); + dst.check_inp[1] = make_check_dtype_func(dtype::Int32()); + dst.check_inp[2] = make_check_dtype_func(dtype::Int32()); + dst.check_inp[3] = make_check_dtype_func(dtype::Int8()); + dst.check_inp[4] = make_check_dtype_func(dtype::Int8()); + dst.check_inp[5] = make_check_dtype_func(dtype::Int8()); + dst.check_out = make_out_dtype_func(dtype::Int8()); + dst.name = name; + }; + auto init_rshrs_iXxi8xi16 = [&](ModeTrait& dst, const char* name) { + dst.arity = 2; + dst.check_inp[0] = make_check_category(DTypeCategory::INT); + dst.check_inp[1] = make_check_dtype_func(dtype::Int8()); + dst.check_out = make_out_dtype_func(dtype::Int16()); + dst.name = name; + }; + + auto init_quantized_unary_op = [&](ModeTrait& dst, const char* name) { + dst.arity = 1; + dst.check_inp[0] = make_check_category(DTypeCategory::QUANTIZED); + dst.check_out = make_out_category_func(DTypeCategory::QUANTIZED); + dst.name = name; + dst.need_specify_out_dtype = true; + }; + + auto init_quantized_binary_op = [&](ModeTrait& dst, const char* name) { + dst.arity = 2; + dst.check_inp[0] = make_check_category(DTypeCategory::QUANTIZED); + dst.check_inp[1] = make_check_category(DTypeCategory::QUANTIZED); + dst.check_out = make_out_category_func(DTypeCategory::QUANTIZED); + dst.name = name; + dst.need_specify_out_dtype = true; + }; + + auto init_quantized_ternary_op = [&](ModeTrait& dst, const char* name) { + dst.arity = 3; + dst.check_inp[0] = make_check_category(DTypeCategory::QUANTIZED); + dst.check_inp[1] = make_check_category(DTypeCategory::QUANTIZED); + dst.check_inp[2] = make_check_category(DTypeCategory::QUANTIZED); + dst.check_out = make_out_category_func(DTypeCategory::QUANTIZED); + dst.name = name; + dst.need_specify_out_dtype = true; + }; + +#define SET(f, m) \ + MIDOUT_BEGIN(megdnn_common_elemwise_multi_type, midout_iv(Mode::m)) { \ + f(traits[static_cast(Mode::m)], megdnn_mangle(#m)); \ + } \ + MIDOUT_END(); + SET(init_fma3_int16x32x32x32, FUSE_MUL_ADD3_INT16x32x32x32); + SET(init_fma3_iXxf32xf32xi8, FUSE_MUL_ADD3_IXxF32xF32xI8); + SET(init_rshrs_iXxi8xi8, ROUND_SHR_SATURATE_IXxI8xI8); + SET(init_fuse_add_rmulh_rshr_int16x16x16x8, + FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT16x16x16x8); + SET(init_fuse_add_rmulh_rshr_int32x32x32x8, + FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT32x32x32x8); + SET(init_rshrs_iXxi8xi16, ROUND_SHR_SATURATE_IXxI8xI16); + + //! quantized opr, with specified dtype. + //! 
dispatch elemwise mode internally + SET(init_quantized_unary_op, QRELU); + SET(init_quantized_unary_op, QABS); + SET(init_quantized_unary_op, QACOS); + SET(init_quantized_unary_op, QASIN); + SET(init_quantized_unary_op, QCEIL); + SET(init_quantized_unary_op, QCOS); + SET(init_quantized_unary_op, QEXP); + SET(init_quantized_unary_op, QEXPM1); + SET(init_quantized_unary_op, QFLOOR); + SET(init_quantized_unary_op, QLOG); + SET(init_quantized_unary_op, QLOG1P); + SET(init_quantized_unary_op, QNEGATE); + SET(init_quantized_unary_op, QSIGMOID); + SET(init_quantized_unary_op, QSIN); + SET(init_quantized_unary_op, QTANH); + SET(init_quantized_unary_op, QFAST_TANH); + SET(init_quantized_unary_op, QROUND); + SET(init_quantized_unary_op, QERF); + SET(init_quantized_unary_op, QERFINV); + SET(init_quantized_unary_op, QERFC); + SET(init_quantized_unary_op, QERFCINV); + SET(init_quantized_unary_op, QH_SWISH); + + SET(init_quantized_binary_op, QABS_GRAD); + SET(init_quantized_binary_op, QADD); + SET(init_quantized_binary_op, QFLOOR_DIV); + SET(init_quantized_binary_op, QMAX); + SET(init_quantized_binary_op, QMIN); + SET(init_quantized_binary_op, QMOD); + SET(init_quantized_binary_op, QMUL); + SET(init_quantized_binary_op, QPOW); + SET(init_quantized_binary_op, QSIGMOID_GRAD); + SET(init_quantized_binary_op, QSUB); + SET(init_quantized_binary_op, QSWITCH_GT0); + SET(init_quantized_binary_op, QTANH_GRAD); + SET(init_quantized_binary_op, QTRUE_DIV); + SET(init_quantized_binary_op, QLOG_SUM_EXP); + + SET(init_quantized_binary_op, QLT); + SET(init_quantized_binary_op, QLEQ); + SET(init_quantized_binary_op, QEQ); + + SET(init_quantized_binary_op, QFUSE_ADD_RELU); + SET(init_quantized_binary_op, QFUSE_ADD_SIGMOID); + SET(init_quantized_binary_op, QFUSE_ADD_TANH); + SET(init_quantized_binary_op, QFAST_TANH_GRAD); + SET(init_quantized_binary_op, QATAN2); + SET(init_quantized_binary_op, QH_SWISH_GRAD); + SET(init_quantized_binary_op, QFUSE_ADD_H_SWISH); + + SET(init_quantized_ternary_op, QFUSE_MUL_ADD3); + SET(init_quantized_ternary_op, QCOND_LEQ_MOV); +#undef SET + } + + return traits.at(static_cast(mode)); +} + +void ElemwiseMultiType::deduce_layout(const TensorLayoutArray& src, + TensorLayout& dst) { + auto trait = mode_trait(); + megdnn_assert(src.size() == trait.arity); + for (size_t i = 0; i < trait.arity; ++i) { + check_dtype(trait, i, src[i]); + } + TensorShapeArray src_shp; + for (auto&& i : src) + src_shp.push_back(i); + Elemwise::deduce_shape(src_shp, dst); + dst.init_contiguous_stride(); + trait.check_out(dst.dtype, false); +} + +void ElemwiseMultiType::check_layout_and_broadcast( + const TensorLayoutPtrArray& src, const TensorLayout& dst) { + auto trait = mode_trait(); + megdnn_assert(src.size() == trait.arity); + for (size_t i = 0; i < trait.arity; ++i) { + check_dtype(trait, i, *src[i]); + *src[i] = src[i]->broadcast(dst); + } + auto dtype = dst.dtype; + trait.check_out(dtype, true); + megdnn_assert(dst.is_contiguous()); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp b/dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp new file mode 100644 index 00000000..34e44f1b --- /dev/null +++ b/dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp @@ -0,0 +1,109 @@ +/** + * \file dnn/src/common/elemwise_multi_type/opr_impl_helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
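deduce_layout() above forwards shape inference to the static Elemwise::deduce_shape(), which applies the usual trailing-dimension broadcast rule: shapes are aligned from the last dimension, and each aligned pair of sizes must be equal or contain a 1. A sketch of a call and the expected result (deduce_shape_sketch is illustrative only):

#include "megdnn/oprs.h"

void deduce_shape_sketch() {
    using namespace megdnn;
    TensorShapeArray srcs;
    srcs.push_back(TensorShape{4, 1, 9});
    srcs.push_back(TensorShape{7, 1});
    TensorShape dst;
    Elemwise::deduce_shape(srcs, dst);
    // expected dst == {4, 7, 9}:
    //   last dim   9 vs 1 -> 9
    //   middle dim 1 vs 7 -> 7
    //   first dim  4 vs (absent) -> 4
}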
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./opr_impl_helper.h" +#include "src/common/utils.h" + +using namespace megdnn; + +#define ON_QUANTIZED_MODE(_MODE, _n) \ + case Mode::Q##_MODE: \ + on_quantized_mode(make_elemwise_op_param<_n>(src, dst), dst, \ + Elemwise::Mode::_MODE); \ + break + +void ElemwiseMultiTypeImplHelper::exec(_megdnn_in const TensorNDArray& src, + _megdnn_tensor_out dst) { + switch (m_param.mode) { + case Mode::FUSE_MUL_ADD3_INT16x32x32x32: + on_fuse_mul_add3_int16x32x32x32(make_elemwise_op_param<3>(src, dst), + dst.ptr()); + break; + case Mode::FUSE_MUL_ADD3_IXxF32xF32xI8: + on_fuse_mul_add3_iXxf32xf32xi8(make_elemwise_op_param<3>(src, dst), + dst.ptr()); + break; + case Mode::ROUND_SHR_SATURATE_IXxI8xI8: + on_round_shr_saturate_iXxi8xi8(make_elemwise_op_param<2>(src, dst), + dst.ptr()); + break; + case Mode::FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT16x16x16x8: + on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( + make_elemwise_op_param<6>(src, dst), dst.ptr()); + break; + case Mode::FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT32x32x32x8: + on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( + make_elemwise_op_param<6>(src, dst), dst.ptr()); + break; + case Mode::ROUND_SHR_SATURATE_IXxI8xI16: + on_round_shr_saturate_iXxi8xi16(make_elemwise_op_param<2>(src, dst), + dst.ptr()); + break; + ON_QUANTIZED_MODE(RELU, 1); + ON_QUANTIZED_MODE(ABS, 1); + ON_QUANTIZED_MODE(ACOS, 1); + ON_QUANTIZED_MODE(ASIN, 1); + ON_QUANTIZED_MODE(CEIL, 1); + ON_QUANTIZED_MODE(COS, 1); + ON_QUANTIZED_MODE(EXP, 1); + ON_QUANTIZED_MODE(EXPM1, 1); + ON_QUANTIZED_MODE(FLOOR, 1); + ON_QUANTIZED_MODE(LOG, 1); + ON_QUANTIZED_MODE(LOG1P, 1); + ON_QUANTIZED_MODE(NEGATE, 1); + ON_QUANTIZED_MODE(SIGMOID, 1); + ON_QUANTIZED_MODE(SIN, 1); + ON_QUANTIZED_MODE(TANH, 1); + ON_QUANTIZED_MODE(FAST_TANH, 1); + ON_QUANTIZED_MODE(ROUND, 1); + ON_QUANTIZED_MODE(ERF, 1); + ON_QUANTIZED_MODE(ERFINV, 1); + ON_QUANTIZED_MODE(ERFC, 1); + ON_QUANTIZED_MODE(ERFCINV, 1); + ON_QUANTIZED_MODE(H_SWISH, 1); + + ON_QUANTIZED_MODE(ABS_GRAD, 2); + ON_QUANTIZED_MODE(ADD, 2); + ON_QUANTIZED_MODE(FLOOR_DIV, 2); + ON_QUANTIZED_MODE(MAX, 2); + ON_QUANTIZED_MODE(MIN, 2); + ON_QUANTIZED_MODE(MOD, 2); + ON_QUANTIZED_MODE(MUL, 2); + ON_QUANTIZED_MODE(POW, 2); + ON_QUANTIZED_MODE(SIGMOID_GRAD, 2); + ON_QUANTIZED_MODE(SUB, 2); + ON_QUANTIZED_MODE(SWITCH_GT0, 2); + ON_QUANTIZED_MODE(TANH_GRAD, 2); + ON_QUANTIZED_MODE(TRUE_DIV, 2); + ON_QUANTIZED_MODE(LOG_SUM_EXP, 2); + + ON_QUANTIZED_MODE(LT, 2); + ON_QUANTIZED_MODE(LEQ, 2); + ON_QUANTIZED_MODE(EQ, 2); + + ON_QUANTIZED_MODE(FUSE_ADD_RELU, 2); + ON_QUANTIZED_MODE(FUSE_ADD_SIGMOID, 2); + ON_QUANTIZED_MODE(FUSE_ADD_TANH, 2); + ON_QUANTIZED_MODE(FAST_TANH_GRAD, 2); + ON_QUANTIZED_MODE(ATAN2, 2); + ON_QUANTIZED_MODE(H_SWISH_GRAD, 2); + ON_QUANTIZED_MODE(FUSE_ADD_H_SWISH, 2); + + ON_QUANTIZED_MODE(FUSE_MUL_ADD3, 3); + ON_QUANTIZED_MODE(COND_LEQ_MOV, 3); + default: + megdnn_throw("invalid mode"); + } +} + +#undef ON_QUANTIZED_MODE + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/common/elemwise_multi_type/opr_impl_helper.h b/dnn/src/common/elemwise_multi_type/opr_impl_helper.h new file mode 100644 index 00000000..8646175a --- /dev/null +++ b/dnn/src/common/elemwise_multi_type/opr_impl_helper.h @@ -0,0 +1,85 @@ +/** + * \file dnn/src/common/elemwise_multi_type/opr_impl_helper.h + * MegEngine is Licensed 
under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/oprs/nn_int.h" +#include "src/common/elemwise/opr_impl_helper.h" + +namespace megdnn { + +class ElemwiseMultiTypeImplHelper : public ElemwiseMultiType, + protected ElemwiseLayoutHelper { + static void call_check_layout_and_broadcast(void* opr, + const TensorLayoutPtrArray& src, + const TensorLayout& dst) { + return static_cast(opr) + ->check_layout_and_broadcast(src, dst); + } + + template + ElemwiseOpParamN make_elemwise_op_param(const TensorNDArray& src, + const TensorND& dst) { + return ElemwiseLayoutHelper::make_elemwise_op_param( + this, call_check_layout_and_broadcast, src, dst); + } + +protected: + virtual void on_fuse_mul_add3_int16x32x32x32( + const ElemwiseOpParamN<3>& param, dt_int32* dst) = 0; + + virtual void on_fuse_mul_add3_iXxf32xf32xi8( + const ElemwiseOpParamN<3>& param, dt_int8* dst) = 0; + + virtual void on_round_shr_saturate_iXxi8xi8( + const ElemwiseOpParamN<2>& param, dt_int8* dst) = 0; + + virtual void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( + const ElemwiseOpParamN<6>& param, dt_int8* dst) = 0; + + virtual void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( + const ElemwiseOpParamN<6>& param, dt_int8* dst) = 0; + + virtual void on_round_shr_saturate_iXxi8xi16( + const ElemwiseOpParamN<2>& param, dt_int16* dst) = 0; + + virtual void on_quantized_mode(const ElemwiseOpParamN<1>& param, + const TensorND& dst, + Elemwise::Mode mode) { + MEGDNN_MARK_USED_VAR(param); + MEGDNN_MARK_USED_VAR(dst); + MEGDNN_MARK_USED_VAR(mode); + megdnn_throw("Unrealized except arm_common"); + } + + virtual void on_quantized_mode(const ElemwiseOpParamN<2>& param, + const TensorND& dst, + Elemwise::Mode mode) = 0; + + virtual void on_quantized_mode(const ElemwiseOpParamN<3>& param, + const TensorND& dst, + Elemwise::Mode mode) { + MEGDNN_MARK_USED_VAR(param); + MEGDNN_MARK_USED_VAR(dst); + MEGDNN_MARK_USED_VAR(mode); + megdnn_throw("Unrealized except arm_common"); + } + +public: + using ElemwiseMultiType::ElemwiseMultiType; + + void exec(_megdnn_in const TensorNDArray& src, + _megdnn_tensor_out dst) override final; +}; + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/eye.cpp b/dnn/src/common/eye.cpp new file mode 100644 index 00000000..fcec541f --- /dev/null +++ b/dnn/src/common/eye.cpp @@ -0,0 +1,28 @@ +/** + * \file dnn/src/common/eye.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
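+ * Eye expects a contiguous 2-D dst whose dtype matches param().dtype; check_exec() below enforces this.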
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void Eye::check_exec(const TensorLayout &dst, size_t workspace_in_bytes) +{ + megdnn_assert(dst.ndim == 2 && dst.dtype.enumv() == param().dtype); + megdnn_assert_contiguous(dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/common/flag_warn.cpp b/dnn/src/common/flag_warn.cpp new file mode 100644 index 00000000..06e81b3e --- /dev/null +++ b/dnn/src/common/flag_warn.cpp @@ -0,0 +1,19 @@ +/** + * \file dnn/src/common/flag_warn.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/config/config.h" + +#if !MEGDNN_ENABLE_MANGLING + #pragma message "Mangling is disabled." +#endif // MEGDNN_ENABLE_MANGLING + + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/flip.cpp b/dnn/src/common/flip.cpp new file mode 100644 index 00000000..c9e08c7e --- /dev/null +++ b/dnn/src/common/flip.cpp @@ -0,0 +1,57 @@ +/** + * \file dnn/src/common/flip.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void FlipBase::deduce_layout_fwd(const TensorLayout &src, TensorLayout &dst) +{ + auto errmsg = [&]() { return megdnn_layout_msg(src); }; + MEGDNN_MARK_USED_VAR(errmsg); + + megdnn_assert(src.ndim == 4_z && (src.shape[3] == 1_z || + src.shape[3] == 3_z), "%s", errmsg().c_str()); + + size_t in = src.shape[0]; + size_t ih = src.shape[1]; + size_t iw = src.shape[2]; + size_t ic = src.shape[3]; + + dst = TensorLayout(TensorShape({in, ih, iw, ic}), src.dtype); +} + +void FlipBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &dst) +{ + TensorLayout dst_expected; + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, dst_expected); + megdnn_assert_eq_shape(dst_expected, dst); +} + +void Flip::deduce_layout(const TensorLayout &src, TensorLayout &dst) +{ + deduce_layout_fwd(src, dst); +} + +void Flip::check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/gaussian_blur.cpp b/dnn/src/common/gaussian_blur.cpp new file mode 100644 index 00000000..6aa1ce35 --- /dev/null +++ b/dnn/src/common/gaussian_blur.cpp @@ -0,0 +1,58 @@ +/** + * \file dnn/src/common/gaussian_blur.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" +#include "src/common/utils.h" +#include "src/common/cv/common.h" +#include "src/common/cv/helper.h" + +namespace megdnn { + +void GaussianBlurBase::deduce_layout_fwd(const TensorLayout &src, TensorLayout &dst) +{ + auto errmsg = [&]() { return megdnn_layout_msg(src); }; + MEGDNN_MARK_USED_VAR(errmsg); + + megdnn_assert(src.ndim == 4_z && (src.shape[3] == 1_z || + src.shape[3] == 3_z), "%s", errmsg().c_str()); + + size_t in = src.shape[0]; + size_t ih = src.shape[1]; + size_t iw = src.shape[2]; + size_t ic = src.shape[3]; + + dst = TensorLayout(TensorShape({in, ih, iw, ic}), src.dtype); +} + +void GaussianBlurBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &dst) +{ + TensorLayout dst_expected; + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, dst_expected); + megdnn_assert_eq_shape(dst_expected, dst); +} + +void GaussianBlur::deduce_layout(const TensorLayout &src, TensorLayout &dst) +{ + deduce_layout_fwd(src, dst); +} + +void GaussianBlur::check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/gaussian_blur_helper.h b/dnn/src/common/gaussian_blur_helper.h new file mode 100644 index 00000000..06e63e68 --- /dev/null +++ b/dnn/src/common/gaussian_blur_helper.h @@ -0,0 +1,100 @@ +/** + * \file dnn/src/common/gaussian_blur_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/cv/common.h" +#include "src/common/utils.h" + +#pragma once + +namespace megdnn { +namespace megcv { +namespace gaussian_blur { + +template +inline static Mat getGaussianKernel(size_t n, double sigma) { + const int SMALL_GAUSSIAN_SIZE = 7; + static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] = { + {1.f}, + {0.25f, 0.5f, 0.25f}, + {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f}, + {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, + 0.03125f}}; + + const float* fixed_kernel = + n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 + ? small_gaussian_tab[n >> 1] + : 0; + + Mat kernel(1, n, 1); + + T* c = kernel.ptr(); + + double sigmaX = sigma > 0 ? sigma : ((n - 1) * 0.5 - 1) * 0.3 + 0.8; + double scale2X = -0.5 / (sigmaX * sigmaX); + double sum = 0; + + int i; + for (i = 0; i < (int)n; i++) { + double x = i - (n - 1) * 0.5; + double t = fixed_kernel ? (double)fixed_kernel[i] + : std::exp(scale2X * x * x); + { + c[i] = (T)t; + sum += c[i]; + } + } + + sum = 1. / sum; + for (i = 0; i < (int)n; i++) + c[i] = (T)(c[i] * sum); + + return kernel; +} + +template +inline static void createGaussianKernels(Mat& kx, Mat& ky, Size ksize, + double sigma1, double sigma2) { + if (sigma2 <= 0) + sigma2 = sigma1; + + if (ksize.cols() <= 0 && sigma1 > 0) { + double num = + sigma1 * (std::is_same::value ? 
3 : 4) * 2 + + 1; + num = (int)(num + (num >= 0 ? 0.5 : -0.5)); + ksize.cols() = ((int)num) | 1; + } + if (ksize.rows() <= 0 && sigma2 > 0) { + double num = + sigma2 * (std::is_same::value ? 3 : 4) * 2 + + 1; + num = (int)(num + (num >= 0 ? 0.5 : -0.5)); + ksize.rows() = ((int)num) | 1; + } + + megdnn_assert(ksize.cols() > 0 && ksize.cols() % 2 == 1 && + ksize.rows() > 0 && ksize.rows() % 2 == 1); + + sigma1 = std::max(sigma1, 0.); + sigma2 = std::max(sigma2, 0.); + + kx = getGaussianKernel(ksize.cols(), sigma1); + if (ksize.rows() == ksize.cols() && std::abs(sigma1 - sigma2) < DBL_EPSILON) + ky = kx; + else + ky = getGaussianKernel(ksize.rows(), sigma2); +} + +} // namespace gaussian_blur +} // namespace megcv +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/group_local.cpp b/dnn/src/common/group_local.cpp new file mode 100644 index 00000000..ca668c20 --- /dev/null +++ b/dnn/src/common/group_local.cpp @@ -0,0 +1,103 @@ +/** + * \file dnn/src/common/group_local.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs/nn.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void GroupLocalBase::deduce_layout_fwd(const TensorLayout &src, + const TensorLayout &filter, + TensorLayout &dst) +{ + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + + megdnn_layout_msg(filter) + ", " + + megdnn_layout_msg(dst) + ", " + + megdnn_mangle("pad_h=") + std::to_string(param().pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param().pad_w) + ", " + + megdnn_mangle("stride_h=") + std::to_string(param().stride_h) + ", " + + megdnn_mangle("stride_w=") + std::to_string(param().stride_w); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter); + megdnn_assert(param().mode == Mode::CROSS_CORRELATION, + "only CROSS_CORRELATION mode is supported for glocal."); + + megdnn_assert(param().sparse == Param::Sparse::DENSE && + param().dilate_h == 1 && param().dilate_w == 1 && + src.dtype.category() == DTypeCategory::FLOAT && + src.dtype == dst.dtype, + "unsupported conv param for Local opr"); + megdnn_assert(src.ndim == 4_z, "%s", errmsg().c_str()); + megdnn_assert(filter.ndim == 7_z, "%s", errmsg().c_str()); + size_t group = filter[0]; + size_t n = src[0]; + size_t ic = src[1]; + size_t ih = src[2]; + size_t iw = src[3]; + size_t oc = filter[6]*group; + size_t oh = filter[1], ow = filter[2]; + megdnn_assert_eq_size_t(filter[0], group); + megdnn_assert_eq_size_t(filter[3]*group, ic); + size_t fh = filter[4], fw = filter[5]; + // (group, oh, ow, ic/group, fh, fw, oc/group) + infer_conv_shape2d(ih, iw, fh, fw, + param().stride_h, param().stride_w, + param().pad_h, param().pad_w, oh, ow); + dst = TensorLayout(TensorShape({n, oc, oh, ow}), src.dtype); +} + +void GroupLocalBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) +{ + TensorLayout dst_expected{dst.dtype}; + megdnn_assert_eq_dtype(src, filter); + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, filter, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + megdnn_assert(src.dtype == dtype::Float32() || MEGDNN_FLOAT16_SELECT(src.dtype == 
dtype::Float16(), true)); +} + +void GroupLocalForward::check_exec(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, filter, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void GroupLocalBackwardData::check_exec(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_layout_fwd(grad, filter, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(filter, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void GroupLocalBackwardFilter::check_exec(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, grad, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/handle.cpp b/dnn/src/common/handle.cpp new file mode 100644 index 00000000..d9333a9a --- /dev/null +++ b/dnn/src/common/handle.cpp @@ -0,0 +1,159 @@ +/** + * \file dnn/src/common/handle.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/basic_types.h" + +#include "src/common/handle_impl.h" +#include "src/common/utils.h" +#include "src/fallback/handle.h" +#include "src/naive/handle.h" + +#include "midout.h" + +#if MEGDNN_X86 +#include "src/x86/handle.h" +#endif + + +#if MEGDNN_WITH_CUDA +#include "src/cuda/handle.h" +#endif + + +using namespace megdnn; + +MIDOUT_DECL(HandlePlatform); +MIDOUT_DECL(HandleOpr); + +Handle::Handle(megcoreComputingHandle_t computing_handle, HandleType type) + : m_computing_handle(computing_handle), m_handle_type(type) {} + +std::unique_ptr Handle::make(megcoreComputingHandle_t computing_handle, + int debug_level) { + (void)debug_level; + megcoreDeviceHandle_t device_handle; + megcorePlatform_t platform; + megcoreGetDeviceHandle(computing_handle, &device_handle); + + megcoreGetPlatform(device_handle, &platform); + if (platform == megcorePlatformCPU) { + // only enable midout for CPU, becuase CPU might be unused when some + // other platforms are used + MIDOUT_BEGIN(HandlePlatform, midout_iv(megcorePlatformCPU)) { + // CPU +#if MEGDNN_NAIVE + return make_unique(computing_handle); +#else + if (debug_level == 0) { +#if MEGDNN_X86 + // Because of ICC bug, we cannot use make_unique here. It will + // trigger an internal compiler error. 
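+                // The plain new + std::unique_ptr construction below is what the commented-out make_unique call would have produced.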
+ return std::unique_ptr( + new x86::HandleImpl(computing_handle)); + // return make_unique(computing_handle); +#else + return make_unique(computing_handle); +#endif + } else if (debug_level == 1) { + return make_unique(computing_handle); + } else if (debug_level == 2) { + return make_unique(computing_handle); + } else { + megdnn_throw(megdnn_mangle("Debug level must be 0/1/2.")); + } + } + MIDOUT_END(); +#endif + } + else { + // CUDA + megdnn_assert_internal(platform == megcorePlatformCUDA); +#if MEGDNN_WITH_CUDA + return make_unique(computing_handle); +#else + return nullptr; +#endif + } + } + + + void Handle::set_destructor(const thin_function& d) { + megdnn_assert(!m_destructor, "destructor can be set only once"); + m_destructor = d; + } + + Handle::~Handle() { + if (m_destructor) + m_destructor(); + m_alive_magic = 0; + } + + size_t Handle::alignment_requirement() const { + // default to 32 + return 32; + } + + size_t Handle::image2d_pitch_alignment() const { + megdnn_throw("image2d tensor format not supported on this handle"); + } + + bool Handle::check_cross_dev_copy_constraint(const TensorLayout& src) { + return src.is_contiguous(); + } + + void Handle::on_opr_destructed(OperatorBase * opr) { + if (m_alive_magic != ALIVE_MAGIC) { + megdnn_log_error( + "Handle is destructed before opr gets destructed. " + "Please fix the destruction order as this would cause " + "undefined memory access. " + "Abort now to avoid further problems."); + abort(); + } + if (m_on_opr_destructed) { + m_on_opr_destructed(opr); + } + } + + OperatorBase::~OperatorBase() { m_handle->on_opr_destructed(this); } + + template + std::unique_ptr Handle::create_operator() { +#define CASE(etype, nm) \ + case HandleType::etype: { \ + MIDOUT_BEGIN(HandleOpr, Opr, midout_iv(HandleType::etype)) { \ + return static_cast(this)->create_operator(); \ + } \ + MIDOUT_END(); \ + } + + switch (m_handle_type) { + CASE(NAIVE, naive); +#if !MEGDNN_NAIVE + CASE(FALLBACK, fallback); +#if MEGDNN_X86 + CASE(X86, x86); +#endif +#endif // !MEGDNN_NAIVE +#if MEGDNN_WITH_CUDA + CASE(CUDA,cuda); +#endif + default: + megdnn_throw(megdnn_mangle("bad handle type")); + } +#undef CASE + } + +#define INST(opr) template std::unique_ptr Handle::create_operator(); + MEGDNN_FOREACH_OPR_CLASS(INST) +#undef INST +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/common/handle_impl.h b/dnn/src/common/handle_impl.h new file mode 100644 index 00000000..2e6ec73f --- /dev/null +++ b/dnn/src/common/handle_impl.h @@ -0,0 +1,209 @@ +/** + * \file dnn/src/common/handle_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/handle.h" +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +#include + +namespace megdnn { + +class HandleImplHelper : public Handle { +public: + using Handle::Handle; + + //! global matmul opr + virtual MatrixMul* matmul_opr() { + megdnn_throw("Unimplement matmul opr.\n"); + } + + //! global matmul opr with first operand transposed + virtual MatrixMul* matmul_aT_opr() { + megdnn_throw("Unimplement matmul_aT opr.\n"); + } + + //! global matmul opr with second operand transposed + virtual MatrixMul* matmul_bT_opr() { + megdnn_throw("Unimplement matmul_bT opr.\n"); + } + + //! 
global matmul opr with both operand transposed + virtual MatrixMul* matmul_aT_bT_opr() { + megdnn_throw("Unimplement matmul_aT_bT opr.\n"); + } + + //! global relayout opr + virtual Relayout* relayout_opr() { + megdnn_throw("Unimplement Relayout opr.\n"); + } + + virtual Checksum* checksum_opr() { + megdnn_throw("Unimplement Checksum opr.\n"); + } + + virtual MaxTensorDiff* max_tensor_diff_opr() { + megdnn_throw("Unimplement MaxTensorDiff opr.\n"); + } + +protected: + static constexpr size_t NR_HELPER_OPRS = 7; + + template + static Opr* get_helper_opr(Self self, + const typename Opr::Param& param = {}) { + static_assert(idx < NR_HELPER_OPRS, "invalid idx"); + if (!self->m_helper_oprs[idx]) { + std::lock_guard lg{self->m_helper_oprs_mtx}; + if (!self->m_helper_oprs[idx]) { + self->m_helper_oprs[idx] = + self->template create_operator(); + auto ret = static_cast(self->m_helper_oprs[idx].get()); + ret->param() = param; + megdnn_assert(ret->is_thread_safe()); + return ret; + } + } + return static_cast(self->m_helper_oprs[idx].get()); + } + +private: + std::array, NR_HELPER_OPRS> m_helper_oprs; + std::mutex m_helper_oprs_mtx; +}; + +} // namespace megdnn +/*! + * \brief iterate though each operator class name; useful for explicit + * instantialization of create_operator<> templates + */ +#define MEGDNN_FOREACH_OPR_CLASS(cb) \ + cb(ConvolutionForward) \ + cb(ConvolutionBackwardData) \ + cb(ConvolutionBackwardFilter) \ + cb(ConvPoolingForward) \ + cb(ConvBiasForward) \ + cb(Images2NeibsForward) \ + cb(Images2NeibsBackward) \ + cb(ElemwiseForward) \ + cb(ElemwiseMultiType) \ + cb(AddUpdateForward) \ + cb(RelayoutForward) \ + cb(PoolingForward) \ + cb(PoolingBackward) \ + cb(LocalForward) \ + cb(LocalBackwardData) \ + cb(LocalBackwardFilter) \ + cb(LRNForward) \ + cb(LRNBackward) \ + cb(ROIPoolingForward) \ + cb(ROIPoolingBackward) \ + cb(WarpPerspectiveForward) \ + cb(WarpPerspectiveBackwardData) \ + cb(WarpPerspectiveBackwardMat) \ + cb(DotForward) \ + cb(MatrixInverse) \ + cb(MatrixMulForward) \ + cb(BatchedMatrixMulForward) \ + cb(SVDForward) \ + cb(ReduceForward) \ + cb(CondTake) \ + cb(CumsumForward) \ + cb(ArgmaxForward) \ + cb(ArgminForward) \ + cb(TransposeForward) \ + cb(ConcatForward) \ + cb(SplitForward) \ + cb(TileForward) \ + cb(TileBackward) \ + cb(RepeatForward) \ + cb(RepeatBackward) \ + cb(ArgsortForward) \ + cb(ArgsortBackward) \ + cb(TypeCvt) \ + cb(IndexingRemapForward) \ + cb(IndexingRemapBackward) \ + cb(ChecksumForward) \ + cb(IndexingOneHotForward) \ + cb(IndexingSetOneHotForward) \ + cb(IndexingMultiAxisVec) \ + cb(IndexingSetMultiAxisVec) \ + cb(IndexingIncrMultiAxisVec) \ + cb(MeshIndexing) \ + cb(IncrMeshIndexing) \ + cb(SetMeshIndexing) \ + cb(BatchedMeshIndexing) \ + cb(BatchedIncrMeshIndexing) \ + cb(BatchedSetMeshIndexing) \ + cb(Linspace) \ + cb(Eye) \ + cb(SleepForward) \ + cb(UniformRNG) \ + cb(GaussianRNG) \ + cb(SeparableConvForward) \ + cb(SeparableFilterForward) \ + cb(BNForward) \ + cb(BNBackward) \ + cb(GroupLocalForward) \ + cb(GroupLocalBackwardData) \ + cb(GroupLocalBackwardFilter) \ + cb(Flip) \ + cb(Rotate) \ + cb(ROICopy) \ + cb(CvtColor) \ + cb(WarpAffine) \ + cb(GaussianBlur) \ + cb(Resize) \ + cb(ResizeBackward) \ + cb(ParamPackConcat) \ + cb(ParamPackSplit) \ + cb(MaxTensorDiff) \ + cb(MaskConvForward) \ + cb(MaskPropagate) \ + cb(Convolution3DForward) \ + cb(Convolution3DBackwardData) \ + cb(Convolution3DBackwardFilter) \ + cb(DeformableConvForward) \ + cb(DeformableConvBackwardFilter) \ + cb(DeformableConvBackwardData) \ + 
cb(DeformablePSROIPoolingForward) \ + cb(DeformablePSROIPoolingBackward) \ + cb(RelayoutFormat) \ + cb(TopK) \ + cb(PowC) \ + cb(WinogradFilterPreprocess) \ + cb(LocalShareForward) \ + cb(LocalShareBackwardData) \ + cb(LocalShareBackwardFilter) \ + cb(ROIAlignForward) \ + cb(ROIAlignBackward) \ + cb(BatchConvBiasForward) \ + +/*! + * \brief specialize HandleImpl::create_operator for a single opr type; + * implemented by Impl class + */ +#define MEGDNN_SPECIALIZE_CREATE_OPERATOR(opr) \ + template <> \ + std::unique_ptr HandleImpl::create_operator() { \ + return megdnn::make_unique(this); \ + } + +/*! + * \brief for explicit instantiation for HandleImpl::create_operator methods + */ +#define MEGDNN_INST_CREATE_OPERATOR(opr) \ + template std::unique_ptr HandleImpl::create_operator(); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/images2neibs.cpp b/dnn/src/common/images2neibs.cpp new file mode 100644 index 00000000..c80ab893 --- /dev/null +++ b/dnn/src/common/images2neibs.cpp @@ -0,0 +1,83 @@ +/** + * \file dnn/src/common/images2neibs.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void Images2NeibsBase::deduce_layout_fwd(const TensorLayout &src, + TensorLayout &dst) +{ + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + + megdnn_mangle("pad_h=") + std::to_string(param().pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param().pad_w) + ", " + + megdnn_mangle("stride_h=") + + std::to_string(param().stride_h) + ", " + + megdnn_mangle("stride_w=") + + std::to_string(param().stride_w) + ", " + + megdnn_mangle("window_h=") + + std::to_string(param().window_h) + ", " + + megdnn_mangle("window_w=") + + std::to_string(param().window_w); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert(src.ndim == 4_z, "%s", errmsg().c_str()); + size_t n = src[0], ic = src[1], ih = src[2], iw = src[3]; + size_t ph = this->param().pad_h; + size_t pw = this->param().pad_w; + size_t sh = this->param().stride_h; + size_t sw = this->param().stride_w; + size_t wh = this->param().window_h; + size_t ww = this->param().window_w; + size_t oh, ow; + + infer_conv_shape2d(ih, iw, wh, ww, sh, sw, ph, pw, oh, ow); + dst = TensorLayout(TensorShape({n, ic, oh, ow, wh, ww}), src.dtype); +} + +void Images2NeibsBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &dst) +{ + TensorLayout dst_expected; + deduce_layout_fwd(src, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); +} + +void Images2NeibsForward::deduce_layout(const TensorLayout &src, + TensorLayout &dst) +{ + deduce_layout_fwd(src, dst); +} + +void Images2NeibsForward::check_exec(const TensorLayout &src, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void Images2NeibsBackward::check_exec(const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_layout_fwd(grad, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(grad, diff); + 
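+    // the caller must hand in at least as much workspace as get_workspace_in_bytes reported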
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/indexing_multi_axis_vec.cpp b/dnn/src/common/indexing_multi_axis_vec.cpp new file mode 100644 index 00000000..31173241 --- /dev/null +++ b/dnn/src/common/indexing_multi_axis_vec.cpp @@ -0,0 +1,228 @@ +/** + * \file dnn/src/common/indexing_multi_axis_vec.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +using namespace megdnn; + +namespace { + size_t get_index_size_for_workspace( + const TensorShape &shp, const size_t *axes, size_t nr_axes) { + size_t idx_axis = axes[0]; + megdnn_assert(shp.ndim && nr_axes); + for (size_t i = 1; i < nr_axes; ++ i) { + megdnn_assert(axes[i] > axes[i - 1]); + if (axes[i] != axes[i - 1] + 1) { + idx_axis = 0; + break; + } + } + megdnn_assert(shp.ndim > idx_axis, + "index on the %zuth axis; but shape is %s", + idx_axis, shp.to_string().c_str()); + return shp.shape[idx_axis]; + } +} // anonymous namespace + +IndexingMultiAxisVecBase::IndexDescLayoutOnly +IndexingMultiAxisVecBase::extract_index_layout(const IndexDesc &index) { + IndexDescLayoutOnly ret(index.size()); + for (size_t i = 0; i < index.size(); ++ i) { + ret[i].layout = index[i].vec.layout; + ret[i].axis = index[i].axis; + } + return ret; +} + +size_t IndexingMultiAxisVecBase::deduce_layout_fwd( + const TensorLayout &data, + const IndexDescLayoutOnly &index, + TensorLayout &dst) { + megdnn_assert(!index.empty()); + megdnn_assert(data.ndim >= index.size()); + dst.ndim = data.ndim - index.size() + 1; + dst.shape[0] = 1; + dst.dtype = data.dtype; + + auto brdcast = [&](const TensorLayout &ly) { + if (ly.ndim != 1) + return false; + if (dst.shape[0] == ly.shape[0]) + return true; + if (dst.shape[0] == 1) { + dst.shape[0] = ly.shape[0]; + return true; + } + return ly.shape[0] == 1; + }; + + size_t dst_axis = 1; + ptrdiff_t prev_axis = -1; + for (size_t axis = 0; axis < index.size(); ++ axis) { + auto &&idx = index[axis]; + megdnn_assert(idx.layout.dtype == dtype::Int32(), + "invalid index dtype: %s", idx.layout.dtype.name()); + megdnn_assert(idx.axis < data.ndim && + static_cast(idx.axis) > prev_axis, + "index %zu requests invalid axis %zu", axis, idx.axis); + auto brd_succ = brdcast(idx.layout); + megdnn_assert(brd_succ, "invalid layout at index %zu: %s", + axis, idx.layout.to_string().c_str()); + + for (size_t i = prev_axis + 1; i < idx.axis; ++ i) { + dst.shape[dst_axis ++] = data.shape[i]; + } + prev_axis = idx.axis; + } + for (size_t i = prev_axis + 1; i < data.ndim; ++ i) { + dst.shape[dst_axis ++] = data.shape[i]; + } + megdnn_assert(dst_axis == dst.ndim); + + size_t idx_axis = 0; + { + // fix idx_axis if index contains consecutive axes + bool contig_idx = true; + for (size_t i = 1; i < index.size(); ++ i) { + if (index[i].axis != index[i - 1].axis + 1) { + contig_idx = false; + break; + } + } + if (contig_idx) { + auto shp0 = dst.shape[0]; + idx_axis = index[0].axis; + for (size_t i = 0; i < idx_axis; ++ i) { + dst.shape[i] = dst.shape[i + 1]; + } + dst.shape[idx_axis] = shp0; + } + } + + dst.init_contiguous_stride(); + return idx_axis; +} + +size_t 
IndexingMultiAxisVecBase::get_nonindex_axes( + size_t src_ndim, const IndexDesc &index, size_t *out) { + auto iter = index.begin(); + size_t nr = 0; + for (size_t i = 0; i < src_ndim; ++ i) { + if (iter != index.end() && i == iter->axis) { + ++ iter; + } else { + out[nr ++] = i; + } + } + megdnn_assert(nr + index.size() == src_ndim && iter == index.end()); + return nr; +} + +IndexingMultiAxisVecBase::ExecInfo +IndexingMultiAxisVecBase::check_exec_noworkspace( + const TensorLayout &data, const TensorLayout &value, + const IndexDesc &index, IndexDescLayoutOnly &index_layout) { + + ExecInfo ret; + index_layout = extract_index_layout(index); + TensorLayout value_expect; + ret.idx_axis = deduce_layout_fwd(data, index_layout, value_expect); + megdnn_assert_eq_shape(value_expect, value); + + auto value_contig = value.collapse_contiguous(); + megdnn_assert(value_contig.ndim == 1, + "value layout must be 1-dim contiguous; got %s", + value.to_string().c_str()); + + ret.value_stride = value_contig.stride[0]; + return ret; +} + +std::pair +IndexingMultiAxisVecBase::get_value_iter_optimized_layout( + const TensorLayout &data, const TensorLayout &value, + const IndexDesc &index, size_t idx_axis) { + size_t data_axes[TensorLayout::MAX_NDIM], + nr_axes = get_nonindex_axes(data.ndim, index, data_axes); + + megdnn_assert(nr_axes == value.ndim - 1 && idx_axis < value.ndim && + nr_axes + index.size() == data.ndim); + + TensorLayout ret; + if (idx_axis) { + ret.ndim = idx_axis; + for (size_t i = 0; i < idx_axis; ++ i) { + ret.shape[i] = data.shape[data_axes[i]]; + ret.stride[i] = data.stride[data_axes[i]]; + } + ret = ret.collapse_contiguous(); + } + ret.shape[ret.ndim] = value.shape[idx_axis]; + ret.stride[ret.ndim] = 0; + size_t ret_idx_axis = ret.ndim; + ++ ret.ndim; + + if (idx_axis < nr_axes) { + TensorLayout tail; + tail.ndim = nr_axes - idx_axis; + for (size_t i = idx_axis; i < nr_axes; ++ i) { + tail.shape[i - idx_axis] = data.shape[data_axes[i]]; + tail.stride[i - idx_axis] = data.stride[data_axes[i]]; + } + tail = tail.collapse_contiguous(); + for (size_t i = 0; i < tail.ndim; ++ i) { + ret.shape[ret.ndim] = tail.shape[i]; + ret.stride[ret.ndim] = tail.stride[i]; + ++ ret.ndim; + } + } + + return {ret, ret_idx_axis}; +} + +size_t IndexingMultiAxisVec::get_workspace_in_bytes( + const TensorShape &dst, const size_t *axes, size_t nr_axes) { + return get_workspace_in_bytes( + get_index_size_for_workspace(dst, axes, nr_axes)); +} + +IndexingMultiAxisVec::ExecInfo IndexingMultiAxisVec::check_exec( + const TensorLayout &src, const IndexDesc &index, + const TensorLayout &dst, size_t workspace_in_bytes) { + IndexDescLayoutOnly index_layout; + auto ret = check_exec_noworkspace(src, dst, index, index_layout); + megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes( + dst.shape[ret.idx_axis])); + megdnn_assert(ret.value_stride, "dst must be non-overlapping"); + return ret; +} + +size_t IndexingModifyMultiAxisVecBase::get_workspace_in_bytes( + const TensorShape &value, const size_t *axes, size_t nr_axes) { + return get_workspace_in_bytes( + get_index_size_for_workspace(value, axes, nr_axes)); +} + +IndexingModifyMultiAxisVecBase::ExecInfo +IndexingModifyMultiAxisVecBase::check_exec( + const TensorLayout &data, const TensorLayout &value, + const IndexDesc &index, size_t workspace_in_bytes) { + megdnn_assert(data.is_non_overlapping_strong(), + "data layout should not overlap: %s", data.to_string().c_str()); + IndexDescLayoutOnly index_layout; + auto ret = check_exec_noworkspace(data, value, index, 
index_layout); + megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes( + value.shape[ret.idx_axis])); + return ret; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/indexing_multi_axis_vec_kdef.h b/dnn/src/common/indexing_multi_axis_vec_kdef.h new file mode 100644 index 00000000..ddf5c960 --- /dev/null +++ b/dnn/src/common/indexing_multi_axis_vec_kdef.h @@ -0,0 +1,51 @@ +/** + * \file dnn/src/common/indexing_multi_axis_vec_kdef.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/arch.h" + +#if MEGDNN_CC_HOST && !defined(__device__) +#define __device__ +#define def_device 1 +#endif + +namespace megdnn { +namespace indexing_multi_axis_vec_kdef { + +struct OprFwd { + template + __device__ static void apply(ctype data, ctype &value) { + value = data; + } +}; + +struct OprSet { + template + __device__ static void apply(ctype &data, ctype value) { + data = value; + } +}; + +struct OprIncr { + template + __device__ static void apply(ctype &data, ctype value) { + data += value; + } +}; + +} +} + +#if def_device +#undef __device__ +#undef def_device +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/indexing_one_hot.cpp b/dnn/src/common/indexing_one_hot.cpp new file mode 100644 index 00000000..78a3b7f9 --- /dev/null +++ b/dnn/src/common/indexing_one_hot.cpp @@ -0,0 +1,83 @@ +/** + * \file dnn/src/common/indexing_one_hot.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
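+ * IndexingOneHot keeps the shape of src except that the indexed axis is reduced to 1 (see deduce_layout_fwd below).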
+ */ + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +using namespace megdnn; + +void IndexingOneHotBase::deduce_layout_fwd( + const TensorLayout &src, const TensorLayout &index, + TensorLayout &dst) { + megdnn_assert( + m_param.axis < static_cast(src.ndim) && src.ndim >= 2, + "IndexingOneHot on axis %u, but input has only %zu dims", + m_param.axis, src.ndim); + MEGDNN_MARK_USED_VAR(index); + dst = src; + dst.shape[m_param.axis] = 1; + dst.init_contiguous_stride(); +} + +void IndexingOneHotBase::check_layout_fwd( + const TensorLayout &src, const TensorLayout &index, + const TensorLayout &dst) { + auto errmsg = [&]() -> std::string { + return megdnn_mangle(ssprintf("bad layout for IndexingOneHot: " + "src=%s index=%s dst=%s axis=%d", + src.to_string().c_str(), index.to_string().c_str(), + dst.to_string().c_str(), m_param.axis)); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_eq_dtype(src, dst); + megdnn_assert(index.dtype == dtype::Int32(), "%s", errmsg().c_str()); + megdnn_assert(src.is_contiguous() && index.is_contiguous() && + dst.is_contiguous(), "%s", errmsg().c_str()); + + // check index + TensorShape idx_shp{src}; + -- idx_shp.ndim; + megdnn_assert(m_param.axis >= 0, "%s", errmsg().c_str()); + for (auto i = static_cast(m_param.axis); i < idx_shp.ndim; ++i) + idx_shp[i] = idx_shp[i + 1]; + megdnn_assert(index.eq_shape(idx_shp), "%s idx_shp=%s", errmsg().c_str(), idx_shp.to_string().c_str()); + + // check dst + megdnn_assert( + m_param.axis < static_cast(src.ndim) && src.ndim >= 2, + "%s", errmsg().c_str()); + TensorShape dst_shp{src}; + dst_shp.shape[m_param.axis] = 1; + megdnn_assert(dst.eq_shape(dst_shp), "%s dst_shp=%s", errmsg().c_str(), dst_shp.to_string().c_str()); +} + +void IndexingOneHotForward::check_exec(const TensorLayout &src, + const TensorLayout &index, const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, index, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes( + src, index, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void IndexingSetOneHotForward::check_exec(const TensorLayout &data, + const TensorLayout &index, const TensorLayout &sub, + size_t workspace_in_bytes) +{ + check_layout_fwd(data, index, sub); + auto required_workspace_in_bytes = get_workspace_in_bytes( + data, index, sub); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/linspace.cpp b/dnn/src/common/linspace.cpp new file mode 100644 index 00000000..d716237d --- /dev/null +++ b/dnn/src/common/linspace.cpp @@ -0,0 +1,27 @@ +/** + * \file dnn/src/common/linspace.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
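+ * Linspace expects a contiguous 1-D dst with at least one element (enforced in check_exec below).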
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void Linspace::check_exec(const TensorLayout &dst, size_t workspace_in_bytes) +{ + megdnn_assert(dst.ndim == 1 && dst.shape[0] > 0); + megdnn_assert_contiguous(dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/local/local_decl.inl b/dnn/src/common/local/local_decl.inl new file mode 100644 index 00000000..71a0e86d --- /dev/null +++ b/dnn/src/common/local/local_decl.inl @@ -0,0 +1,34 @@ +/** + * \file dnn/src/common/local/local_decl.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// simd_macro/*_helper.h should be included before including this file. +// +// The following functions would be declared in this file: +// +// void local_xcorr_MEGDNN_SIMD_NAME(const LocalKParam &kparam); +// void local_conv_MEGDNN_SIMD_NAME(const LocalKParam &kparam); +// +#include "src/naive/local/opr_impl.h" + +#include "src/common/macro_helper.h" + +namespace megdnn { + +using LocalKParam = naive::LocalForwardImpl::FloatNoncontigBatchKernParam; + +void WITH_SIMD_SUFFIX(local_xcorr)( + const LocalKParam ¶m) MEGDNN_SIMD_ATTRIBUTE_TARGET; + +void WITH_SIMD_SUFFIX(local_conv)( + const LocalKParam ¶m) MEGDNN_SIMD_ATTRIBUTE_TARGET; + +} // namespace megdnn + +#include "src/common/macro_helper_epilogue.h" diff --git a/dnn/src/common/local/local_def.inl b/dnn/src/common/local/local_def.inl new file mode 100644 index 00000000..13b8759c --- /dev/null +++ b/dnn/src/common/local/local_def.inl @@ -0,0 +1,425 @@ +/** + * \file dnn/src/common/local/local_def.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// simd_macro/*_helper.h should be included before including this file. 
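+// Each simd_macro helper supplies MEGDNN_SIMD_NAME, MEGDNN_SIMD_WIDTH,
+// MEGDNN_SIMD_TYPE and the SET1/LOADU/STOREU/FMADD wrappers used below, so
+// including this .inl once per helper stamps out one local_xcorr_* /
+// local_conv_* specialization per SIMD instruction set. Illustrative include
+// sequence (the helper file names here are hypothetical, not part of this
+// commit):
+//
+//     #include "src/x86/simd_macro/sse_helper.h"   // defines the MEGDNN_SIMD_* macros for SSE
+//     #include "src/common/local/local_def.inl"    // emits local_xcorr_sse / local_conv_sse
+//     #include "src/x86/simd_macro/sse_helper_epilogue.h"   // undefines them again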
+// +// The following functions would be defined in this file: +// +// void local_xcorr_MEGDNN_SIMD_NAME(const LocalKParam &kparam); +// void local_conv_MEGDNN_SIMD_NAME(const LocalKParam &kparam); +// + +#include "src/common/local/local_decl.inl" + +#include "src/common/utils.h" +#include "src/common/macro_helper.h" + +namespace { + +using namespace megdnn; + +template +void local_xcorr_tpl(const LocalKParam &kparam) MEGDNN_SIMD_ATTRIBUTE_TARGET; +template +void local_xcorr_tpl(const LocalKParam &kparam) +{ + const float* src = static_cast(kparam.src); + const float* filter = static_cast(kparam.filter); + float* dst = static_cast(kparam.dst); + float* workspace = static_cast(kparam.workspace); + const int IC = kparam.ic, IH = kparam.ih, IW = kparam.iw, OH = kparam.oh, + OW = kparam.ow, FH = kparam.fh, FW = kparam.fw; + const uint32_t PH = kparam.ph, PW = kparam.pw, SH = kparam.sh, + SW = kparam.sw; + const ptrdiff_t INP_BS = kparam.inp_bs, OUT_BS = kparam.out_bs; + + float *dst2 = workspace; + const int width = MEGDNN_SIMD_WIDTH; + // dst2 is (H, W, N, C) + memset(dst2, 0, sizeof(float) * OH*OW*N*OC); + float *dst2_hwnc = dst2; + rep(oh, OH) rep(ow, OW) { + const float *src_bak = src; + rep(ic, IC) { + rep(fh, FH) for (int fw = 0; fw < FW; ++fw, filter += OC) { + int ih = -PH + oh*SH + fh; + int iw = -PW + ow*SW + fw; + if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) continue; + float *dst2_bak = dst2; + rep(n, N) { + float s = src[n*INP_BS + ih*IW + iw]; + const float *filter_bak = filter; + MEGDNN_SIMD_TYPE vs = MEGDNN_SIMD_SET1(s); + int oc = 0; + for (; oc+4*width <= OC; oc += 4*width, filter += 4*width) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vf1 = MEGDNN_SIMD_LOADU(filter + 1*width); + MEGDNN_SIMD_TYPE vf2 = MEGDNN_SIMD_LOADU(filter + 2*width); + MEGDNN_SIMD_TYPE vf3 = MEGDNN_SIMD_LOADU(filter + 3*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + MEGDNN_SIMD_TYPE vd1 = MEGDNN_SIMD_LOADU(dst2 + oc + 1*width); + MEGDNN_SIMD_TYPE vd2 = MEGDNN_SIMD_LOADU(dst2 + oc + 2*width); + MEGDNN_SIMD_TYPE vd3 = MEGDNN_SIMD_LOADU(dst2 + oc + 3*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + vd1 = MEGDNN_SIMD_FMADD(vf1, vs, vd1); + vd2 = MEGDNN_SIMD_FMADD(vf2, vs, vd2); + vd3 = MEGDNN_SIMD_FMADD(vf3, vs, vd3); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 1*width, vd1); + MEGDNN_SIMD_STOREU(dst2 + oc + 2*width, vd2); + MEGDNN_SIMD_STOREU(dst2 + oc + 3*width, vd3); + } + if (oc+2*width <= OC) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vf1 = MEGDNN_SIMD_LOADU(filter + 1*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + MEGDNN_SIMD_TYPE vd1 = MEGDNN_SIMD_LOADU(dst2 + oc + 1*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + vd1 = MEGDNN_SIMD_FMADD(vf1, vs, vd1); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 1*width, vd1); + oc += 2*width; + filter += 2*width; + } + if (oc+1*width <= OC) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + oc += 1*width; + filter += 1*width; + } + for (; oc < OC; ++oc, ++filter) { + dst2[oc] += s * (*filter); + } + filter = filter_bak; + dst2 += OC; + } + dst2 = dst2_bak; + } + src += IH*IW; + } + src = src_bak; + dst2 += N*OC; + } + transpose_knc2nsck(dst2_hwnc, dst, OH * OW, N, OC, OUT_BS); +} +void 
local_xcorr_generic(const LocalKParam &kparam) MEGDNN_SIMD_ATTRIBUTE_TARGET; +void local_xcorr_generic(const LocalKParam &kparam) { + UNPACK_LOCAL_FLOAT_NONCONTIG_BATCH_KERN_PARAM(kparam, float); + + float *dst2 = workspace; + const int width = MEGDNN_SIMD_WIDTH; + // dst2 is (H, W, N, C) + memset(dst2, 0, sizeof(float) * OH*OW*N*OC); + float *dst2_hwnc = dst2; + rep(oh, OH) rep(ow, OW) { + const float *src_bak = src; + rep(ic, IC) { + rep(fh, FH) for (int fw = 0; fw < FW; ++fw, filter += OC) { + int ih = -PH + oh*SH + fh; + int iw = -PW + ow*SW + fw; + if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) continue; + float *dst2_bak = dst2; + rep(n, N) { + float s = src[n*INP_BS + ih*IW + iw]; + const float *filter_bak = filter; + MEGDNN_SIMD_TYPE vs = MEGDNN_SIMD_SET1(s); + int oc = 0; + for (; oc+4*width <= OC; oc += 4*width, filter += 4*width) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vf1 = MEGDNN_SIMD_LOADU(filter + 1*width); + MEGDNN_SIMD_TYPE vf2 = MEGDNN_SIMD_LOADU(filter + 2*width); + MEGDNN_SIMD_TYPE vf3 = MEGDNN_SIMD_LOADU(filter + 3*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + MEGDNN_SIMD_TYPE vd1 = MEGDNN_SIMD_LOADU(dst2 + oc + 1*width); + MEGDNN_SIMD_TYPE vd2 = MEGDNN_SIMD_LOADU(dst2 + oc + 2*width); + MEGDNN_SIMD_TYPE vd3 = MEGDNN_SIMD_LOADU(dst2 + oc + 3*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + vd1 = MEGDNN_SIMD_FMADD(vf1, vs, vd1); + vd2 = MEGDNN_SIMD_FMADD(vf2, vs, vd2); + vd3 = MEGDNN_SIMD_FMADD(vf3, vs, vd3); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 1*width, vd1); + MEGDNN_SIMD_STOREU(dst2 + oc + 2*width, vd2); + MEGDNN_SIMD_STOREU(dst2 + oc + 3*width, vd3); + } + if (oc+2*width <= OC) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vf1 = MEGDNN_SIMD_LOADU(filter + 1*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + MEGDNN_SIMD_TYPE vd1 = MEGDNN_SIMD_LOADU(dst2 + oc + 1*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + vd1 = MEGDNN_SIMD_FMADD(vf1, vs, vd1); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 1*width, vd1); + oc += 2*width; + filter += 2*width; + } + if (oc+1*width <= OC) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + oc += 1*width; + filter += 1*width; + } + for (; oc < OC; ++oc, ++filter) { + dst2[oc] += s * (*filter); + } + filter = filter_bak; + dst2 += OC; + } + dst2 = dst2_bak; + } + src += IH*IW; + } + src = src_bak; + dst2 += N*OC; + } + transpose_knc2nsck(dst2_hwnc, dst, OH * OW, N, OC, OUT_BS); +} + +template +void local_conv_tpl(const LocalKParam &kparam) MEGDNN_SIMD_ATTRIBUTE_TARGET; +template +void local_conv_tpl(const LocalKParam &kparam) +{ + const float* src = static_cast(kparam.src); + const float* filter = static_cast(kparam.filter); + float* dst = static_cast(kparam.dst); + float* workspace = static_cast(kparam.workspace); + const int IC = kparam.ic, IH = kparam.ih, IW = kparam.iw, OH = kparam.oh, + OW = kparam.ow, FH = kparam.fh, FW = kparam.fw; + const uint32_t PH = kparam.ph, PW = kparam.pw, SH = kparam.sh, + SW = kparam.sw; + const ptrdiff_t INP_BS = kparam.inp_bs, OUT_BS = kparam.out_bs; + + float *dst2 = workspace; + const int width = MEGDNN_SIMD_WIDTH; + // dst2 is (H, W, N, C) + memset(dst2, 0, sizeof(float) * OH*OW*N*OC); + float *dst2_hwnc = dst2; + rep(oh, 
OH) rep(ow, OW) { + const float *src_bak = src; + rep(ic, IC) { + rep(fh, FH) for (int fw = 0; fw < FW; ++fw, filter += OC) { + int ih = -PH + oh*SH + (FH-fh-1); + int iw = -PW + ow*SW + (FW-fw-1); + if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) continue; + float *dst2_bak = dst2; + rep(n, N) { + float s = src[n*INP_BS + ih*IW + iw]; + const float *filter_bak = filter; + MEGDNN_SIMD_TYPE vs = MEGDNN_SIMD_SET1(s); + int oc = 0; + for (; oc+4*width <= OC; oc += 4*width, filter += 4*width) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vf1 = MEGDNN_SIMD_LOADU(filter + 1*width); + MEGDNN_SIMD_TYPE vf2 = MEGDNN_SIMD_LOADU(filter + 2*width); + MEGDNN_SIMD_TYPE vf3 = MEGDNN_SIMD_LOADU(filter + 3*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + MEGDNN_SIMD_TYPE vd1 = MEGDNN_SIMD_LOADU(dst2 + oc + 1*width); + MEGDNN_SIMD_TYPE vd2 = MEGDNN_SIMD_LOADU(dst2 + oc + 2*width); + MEGDNN_SIMD_TYPE vd3 = MEGDNN_SIMD_LOADU(dst2 + oc + 3*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + vd1 = MEGDNN_SIMD_FMADD(vf1, vs, vd1); + vd2 = MEGDNN_SIMD_FMADD(vf2, vs, vd2); + vd3 = MEGDNN_SIMD_FMADD(vf3, vs, vd3); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 1*width, vd1); + MEGDNN_SIMD_STOREU(dst2 + oc + 2*width, vd2); + MEGDNN_SIMD_STOREU(dst2 + oc + 3*width, vd3); + } + if (oc+2*width <= OC) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vf1 = MEGDNN_SIMD_LOADU(filter + 1*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + MEGDNN_SIMD_TYPE vd1 = MEGDNN_SIMD_LOADU(dst2 + oc + 1*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + vd1 = MEGDNN_SIMD_FMADD(vf1, vs, vd1); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 1*width, vd1); + oc += 2*width; + filter += 2*width; + } + if (oc+1*width <= OC) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + oc += 1*width; + filter += 1*width; + } + for (; oc < OC; ++oc, ++filter) { + dst2[oc] += s * (*filter); + } + filter = filter_bak; + dst2 += OC; + } + dst2 = dst2_bak; + } + src += IH*IW; + } + src = src_bak; + dst2 += N*OC; + } + transpose_knc2nsck(dst2_hwnc, dst, OH * OW, N, OC, OUT_BS); +} + +void local_conv_generic(const LocalKParam &kparam) MEGDNN_SIMD_ATTRIBUTE_TARGET; +void local_conv_generic(const LocalKParam &kparam) { + UNPACK_LOCAL_FLOAT_NONCONTIG_BATCH_KERN_PARAM(kparam, float); + + float *dst2 = workspace; + const int width = MEGDNN_SIMD_WIDTH; + // dst2 is (H, W, N, C) + memset(dst2, 0, sizeof(float) * OH*OW*N*OC); + float *dst2_hwnc = dst2; + rep(oh, OH) rep(ow, OW) { + const float *src_bak = src; + rep(ic, IC) { + rep(fh, FH) for (int fw = 0; fw < FW; ++fw, filter += OC) { + int ih = -PH + oh*SH + (FH-fh-1); + int iw = -PW + ow*SW + (FW-fw-1); + if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) continue; + float *dst2_bak = dst2; + rep(n, N) { + float s = src[n*INP_BS + ih*IW + iw]; + const float *filter_bak = filter; + MEGDNN_SIMD_TYPE vs = MEGDNN_SIMD_SET1(s); + int oc = 0; + for (; oc+4*width <= OC; oc += 4*width, filter += 4*width) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vf1 = MEGDNN_SIMD_LOADU(filter + 1*width); + MEGDNN_SIMD_TYPE vf2 = MEGDNN_SIMD_LOADU(filter + 2*width); + MEGDNN_SIMD_TYPE vf3 = MEGDNN_SIMD_LOADU(filter + 3*width); + MEGDNN_SIMD_TYPE vd0 = 
MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + MEGDNN_SIMD_TYPE vd1 = MEGDNN_SIMD_LOADU(dst2 + oc + 1*width); + MEGDNN_SIMD_TYPE vd2 = MEGDNN_SIMD_LOADU(dst2 + oc + 2*width); + MEGDNN_SIMD_TYPE vd3 = MEGDNN_SIMD_LOADU(dst2 + oc + 3*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + vd1 = MEGDNN_SIMD_FMADD(vf1, vs, vd1); + vd2 = MEGDNN_SIMD_FMADD(vf2, vs, vd2); + vd3 = MEGDNN_SIMD_FMADD(vf3, vs, vd3); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 1*width, vd1); + MEGDNN_SIMD_STOREU(dst2 + oc + 2*width, vd2); + MEGDNN_SIMD_STOREU(dst2 + oc + 3*width, vd3); + } + if (oc+2*width <= OC) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vf1 = MEGDNN_SIMD_LOADU(filter + 1*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + MEGDNN_SIMD_TYPE vd1 = MEGDNN_SIMD_LOADU(dst2 + oc + 1*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + vd1 = MEGDNN_SIMD_FMADD(vf1, vs, vd1); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 1*width, vd1); + oc += 2*width; + filter += 2*width; + } + if (oc+1*width <= OC) { + MEGDNN_SIMD_TYPE vf0 = MEGDNN_SIMD_LOADU(filter + 0*width); + MEGDNN_SIMD_TYPE vd0 = MEGDNN_SIMD_LOADU(dst2 + oc + 0*width); + vd0 = MEGDNN_SIMD_FMADD(vf0, vs, vd0); + MEGDNN_SIMD_STOREU(dst2 + oc + 0*width, vd0); + oc += 1*width; + filter += 1*width; + } + for (; oc < OC; ++oc, ++filter) { + dst2[oc] += s * (*filter); + } + filter = filter_bak; + dst2 += OC; + } + dst2 = dst2_bak; + } + src += IH*IW; + } + src = src_bak; + dst2 += N*OC; + } + transpose_knc2nsck(dst2_hwnc, dst, OH * OW, N, OC, OUT_BS); +} + +} // anonymous namespace + +namespace megdnn { + +#define FUNC_NAME CONCAT_STR(local_xcorr_, MEGDNN_SIMD_NAME) + +void FUNC_NAME(const LocalKParam &kparam) { + auto N = kparam.n, OC = kparam.oc; +#define DISPATCH_WITH_N_OC(N, OC) do { \ + local_xcorr_tpl(kparam); \ + return; \ +} while (0) + +#define DISPATCH_WITH_N(N) \ + switch (OC) { \ + case 16: DISPATCH_WITH_N_OC(N, 16); break; \ + case 32: DISPATCH_WITH_N_OC(N, 32); break; \ + case 48: DISPATCH_WITH_N_OC(N, 48); break; \ + case 64: DISPATCH_WITH_N_OC(N, 64); break; \ + } +#define DISPATCH() \ + switch (N) { \ + case 1: DISPATCH_WITH_N(1); break; \ + case 2: DISPATCH_WITH_N(2); break; \ + } + + DISPATCH(); + +#undef DISPATCH +#undef DISPATCH_WITH_N +#undef DISPATCH_WITH_N_OC + local_xcorr_generic(kparam); +} + +#undef FUNC_NAME + + + +#define FUNC_NAME CONCAT_STR(local_conv_, MEGDNN_SIMD_NAME) + +void FUNC_NAME(const LocalKParam &kparam) { + auto N = kparam.n, OC = kparam.oc; +#define DISPATCH_WITH_N_OC(N, OC) do { \ + local_conv_tpl(kparam); \ + return; \ +} while (0) + +#define DISPATCH_WITH_N(N) \ + switch (OC) { \ + case 16: DISPATCH_WITH_N_OC(N, 16); break; \ + case 32: DISPATCH_WITH_N_OC(N, 32); break; \ + case 48: DISPATCH_WITH_N_OC(N, 48); break; \ + case 64: DISPATCH_WITH_N_OC(N, 64); break; \ + } +#define DISPATCH() \ + switch (N) { \ + case 1: DISPATCH_WITH_N(1); break; \ + case 2: DISPATCH_WITH_N(2); break; \ + } + + DISPATCH(); + +#undef DISPATCH +#undef DISPATCH_WITH_N +#undef DISPATCH_WITH_N_OC + local_conv_generic(kparam); +} + +#undef FUNC_NAME + +} // namespace megdnn + +#include "src/common/macro_helper_epilogue.h" diff --git a/dnn/src/common/local/opr_impl.cpp b/dnn/src/common/local/opr_impl.cpp new file mode 100644 index 00000000..355cb465 --- /dev/null +++ b/dnn/src/common/local/opr_impl.cpp @@ -0,0 +1,118 @@ +/** + * \file dnn/src/common/local/opr_impl.cpp + * MegEngine is Licensed under the Apache License, 
Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void LocalBase::deduce_layout_fwd(const TensorLayout &src, + const TensorLayout &filter, TensorLayout &dst) +{ + auto errmsg = megdnn_layout_msg(src) + ", " + + megdnn_layout_msg(filter) + ", " + + megdnn_layout_msg(dst) + ", " + + megdnn_mangle("is_xcorr=") + + std::to_string((param().mode == Mode::CROSS_CORRELATION)) + ", " + + megdnn_mangle("pad_h=") + std::to_string(param().pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param().pad_w) + ", " + + megdnn_mangle("stride_h=") + std::to_string(param().stride_h) + ", " + + megdnn_mangle("stride_w=") + std::to_string(param().stride_w) ; + auto errmsg_c = errmsg.c_str(); + MEGDNN_MARK_USED_VAR(errmsg_c); + + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter); + megdnn_assert(src.ndim == 4_z, "%s", errmsg_c); + megdnn_assert(filter.ndim == 6_z, "%s", errmsg_c); + megdnn_assert(param().dilate_h == 1 && param().dilate_w == 1, + "dilation in local not supported"); + + megdnn_assert(param().sparse == Param::Sparse::DENSE && + param().dilate_h == 1 && param().dilate_w == 1 && + src.dtype.category() == DTypeCategory::FLOAT && + dst.dtype == src.dtype && + "unsupported conv param for Local opr"); + + size_t n = src[0]; + size_t ic = src[1]; + size_t ih = src[2]; + size_t iw = src[3]; + megdnn_assert_eq_size_t(filter[2], ic); + size_t fh = filter[3]; + size_t fw = filter[4]; + size_t oc = filter[5]; + size_t sh = param().stride_h; + size_t sw = param().stride_w; + size_t ph = param().pad_h; + size_t pw = param().pad_w; + size_t oh, ow; + infer_conv_shape2d(ih, iw, fh, fw, sh, sw, ph, pw, oh, ow); + dst = TensorLayout(TensorShape({n, oc, oh, ow}), src.dtype); +} + +void LocalBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) +{ + TensorLayout dst_expected{dst.dtype}; + megdnn_assert_eq_dtype(src, filter); + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, filter, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + + megdnn_assert(src.dtype == filter.dtype && src.dtype == dst.dtype); + megdnn_assert(src.dtype == dtype::Float32() || + MEGDNN_FLOAT16_SELECT(src.dtype == dtype::Float16(), true)); +} + +void LocalForward::deduce_layout(const TensorLayout &src, + const TensorLayout &filter, + TensorLayout &dst) +{ + deduce_layout_fwd(src, filter, dst); +} + +void LocalForward::check_exec(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, filter, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void LocalBackwardData::check_exec(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_layout_fwd(grad, filter, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(filter, + diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void LocalBackwardFilter::check_exec(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad, + size_t 
workspace_in_bytes) +{ + check_layout_fwd(src, grad, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, + diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/local_share/opr_impl.cpp b/dnn/src/common/local_share/opr_impl.cpp new file mode 100644 index 00000000..d4851e3f --- /dev/null +++ b/dnn/src/common/local_share/opr_impl.cpp @@ -0,0 +1,228 @@ +/** + * \file dnn/src/common/local_share/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +namespace megdnn { + +void LocalShareBase::deduce_layout_fwd(const TensorLayout& src, + const TensorLayout& filter, + TensorLayout& dst) { + using Mode = LocalShare::Param::Mode; + auto errmsg = + megdnn_layout_msg(src) + ", " + megdnn_layout_msg(filter) + ", " + + megdnn_layout_msg(dst) + ", " + megdnn_mangle("is_xcorr=") + + std::to_string((param().mode == Mode::CROSS_CORRELATION)) + ", " + + megdnn_mangle("pad_h=") + std::to_string(param().pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param().pad_w) + ", " + + megdnn_mangle("stride_h=") + std::to_string(param().stride_h) + + ", " + megdnn_mangle("stride_w=") + + std::to_string(param().stride_w) + ", " + + megdnn_mangle("dilate_h=") + std::to_string(param().dilate_h) + + ", " + megdnn_mangle("dilate_w=") + + std::to_string(param().dilate_w) + ", " + + megdnn_mangle("spatial_groups_h=") + + std::to_string(param().spatial_groups_h) + ", " + + megdnn_mangle("spatial_groups_w=") + + std::to_string(param().spatial_groups_w); + auto errmsg_c = errmsg.c_str(); + MEGDNN_MARK_USED_VAR(errmsg_c); + + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter); + using Param = LocalShare::Param; + using Sparse = Param::Sparse; + using Format = Param::Format; + using ComputeMode = Param::ComputeMode; + megdnn_assert(param().format == Format::NCHW, + "local shared only support NCHW format"); + megdnn_assert(src.ndim == 4_z, "%s", errmsg_c); + megdnn_assert( + (filter.ndim == 6_z && param().sparse == Sparse::DENSE) || + (filter.ndim == 7_z && param().sparse == Sparse::GROUP), + "%s", errmsg_c); + megdnn_assert(param().dilate_h == 1 && param().dilate_w == 1, + "dilated local shared is not supported"); + megdnn_assert(src.dtype == dtype::Float32() && + param().computeMode == ComputeMode::DEFAULT, + "local shared only support float32"); + + size_t n = src[0], ci = src[1], hi = src[2], wi = src[3]; + size_t sgh = param().spatial_groups_h, sgw = param().spatial_groups_w; + size_t groups = 1; + size_t weights_shp_pos = 0; + if (param().sparse == Sparse::GROUP) { + groups = filter[0]; + weights_shp_pos = 1; + } + megdnn_assert(sgh == filter[weights_shp_pos] && + sgw == filter[weights_shp_pos + 1], + "spatial groups in filter tensor mismatch with those " + "provided in parameter %s", + errmsg_c); + size_t fh = filter[weights_shp_pos + 3], fw = filter[weights_shp_pos + 4], + co = filter[weights_shp_pos + 5] * groups; + megdnn_assert(filter[weights_shp_pos + 2] * groups == ci, + "input channels of src and filter mismatch %s", errmsg_c); + size_t sh = param().stride_h; + size_t sw = 
param().stride_w; + size_t ph = param().pad_h; + size_t pw = param().pad_w; + size_t ho = infer_conv_shape(hi, fh, sh, ph), + wo = infer_conv_shape(wi, fw, sw, pw); + megdnn_assert( + ho % sgh == 0 && wo % sgw == 0, + "height and width of output cannot be divided by spatial groups %s", + errmsg_c); + dst = TensorLayout{{n, co, ho, wo}, src.dtype}; +} + +void LocalShareBase::check_layout_fwd(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) { + TensorLayout dst_expected; + megdnn_assert_eq_dtype(src, filter); + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, filter, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + + megdnn_assert(src.dtype == dtype::Float32()); +} + +void LocalShareForward::deduce_layout(const TensorLayout& src, + const TensorLayout& filter, + TensorLayout& dst) { + deduce_layout_fwd(src, filter, dst); +} + +void LocalShareForward::check_exec(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_in_bytes) { + check_layout_fwd(src, filter, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void LocalShareBackwardData::deduce_layout(const TensorLayout& filter, + const TensorLayout& diff, + TensorLayout& grad) { + using Mode = LocalShare::Param::Mode; + auto errmsg = + megdnn_layout_msg(filter) + ", " + megdnn_layout_msg(diff) + ", " + + megdnn_layout_msg(grad) + ", " + megdnn_mangle("is_xcorr=") + + std::to_string((param().mode == Mode::CROSS_CORRELATION)) + ", " + + megdnn_mangle("pad_h=") + std::to_string(param().pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param().pad_w) + ", " + + megdnn_mangle("stride_h=") + std::to_string(param().stride_h) + + ", " + megdnn_mangle("stride_w=") + + std::to_string(param().stride_w) + ", " + + megdnn_mangle("dilate_h=") + std::to_string(param().dilate_h) + + ", " + megdnn_mangle("dilate_w=") + + std::to_string(param().dilate_w) + ", " + + megdnn_mangle("spatial_groups_h=") + + std::to_string(param().spatial_groups_h) + ", " + + megdnn_mangle("spatial_groups_w=") + + std::to_string(param().spatial_groups_w); + auto errmsg_c = errmsg.c_str(); + MEGDNN_MARK_USED_VAR(errmsg_c); + + megdnn_assert_contiguous(filter); + megdnn_assert_contiguous(diff); + using Param = LocalShare::Param; + using Sparse = Param::Sparse; + using Format = Param::Format; + using ComputeMode = Param::ComputeMode; + megdnn_assert(param().format == Format::NCHW, + "local shared only support NCHW format"); + megdnn_assert( + (filter.ndim == 6_z && param().sparse == Sparse::DENSE) || + (filter.ndim == 7_z && param().sparse == Sparse::GROUP), + "%s", errmsg_c); + megdnn_assert(diff.ndim == 4_z, "%s", errmsg_c); + megdnn_assert(param().dilate_h == 1 && param().dilate_w == 1, + "dilated local shared is not supported"); + megdnn_assert(diff.dtype == dtype::Float32() && + param().computeMode == ComputeMode::DEFAULT, + "local shared only support float32"); + + size_t n = diff[0], co = diff[1], ho = diff[2], wo = diff[3]; + size_t sgh = param().spatial_groups_h, sgw = param().spatial_groups_w; + megdnn_assert( + ho % sgh == 0 && wo % sgw == 0, + "height and width of output cannot be divided by spatial groups %s", + errmsg_c); + size_t groups = 1; + size_t weights_shp_pos = 0; + if (param().sparse == Sparse::GROUP) { + groups = filter[0]; + weights_shp_pos = 1; + } + megdnn_assert(sgh == filter[weights_shp_pos] && + sgw == filter[weights_shp_pos + 1], + 
"spatial groups in filter tensor mismatch with those " + "provided in parameter %s", + errmsg_c); + size_t ci = filter[weights_shp_pos + 2] * groups, + fh = filter[weights_shp_pos + 3], fw = filter[weights_shp_pos + 4]; + megdnn_assert(filter[weights_shp_pos + 5] * groups == co, + "input channels of src and filter mismatch %s", errmsg_c); + size_t sh = param().stride_h; + size_t sw = param().stride_w; + size_t ph = param().pad_h; + size_t pw = param().pad_w; + + auto deduce = [&errmsg_c](size_t out, size_t filter, size_t stride, + size_t pad) { + MEGDNN_MARK_USED_VAR(errmsg_c); + auto i = (out - 1) * stride + filter; + megdnn_assert(i > pad * 2, "%s", errmsg_c); + return i - pad * 2; + }; + grad.ndim = diff.ndim; + grad[0] = n; + grad[1] = ci; + grad[2] = deduce(ho, fh, sh, ph); + grad[3] = deduce(wo, fw, sw, pw); + grad.init_contiguous_stride(); + grad.dtype = diff.dtype; +} + +void LocalShareBackwardData::check_exec(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes) { + auto filter_dtype = filter.dtype, diff_dtype = diff.dtype, + grad_dtype = grad.dtype; + megdnn_assert(filter_dtype == dtype::Float32() && + filter_dtype == diff_dtype && filter_dtype == grad_dtype); + check_layout_fwd(grad, filter, diff); + auto required_workspace_in_bytes = + get_workspace_in_bytes(filter, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void LocalShareBackwardFilter::check_exec(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes) { + auto src_dtype = src.dtype, diff_dtype = diff.dtype, + grad_dtype = grad.dtype; + megdnn_assert(src_dtype == dtype::Float32() && src_dtype == diff_dtype && + src_dtype == grad_dtype); + check_layout_fwd(src, grad, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/lrn.cpp b/dnn/src/common/lrn.cpp new file mode 100644 index 00000000..c8d9d286 --- /dev/null +++ b/dnn/src/common/lrn.cpp @@ -0,0 +1,57 @@ +/** + * \file dnn/src/common/lrn.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void LRNBase::check_param() +{ + megdnn_assert(param().n & 1); +} + +void LRNForward::deduce_layout(const TensorLayout &src, TensorLayout &dst) +{ + dst = src; +} + +void LRNForward::check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_param(); + megdnn_assert_contiguous(src); + megdnn_assert_eq_layout(src, dst); + + megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void LRNBackward::check_exec(const TensorLayout &src, + const TensorLayout &dst, + const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_param(); + megdnn_assert_contiguous(src); + megdnn_assert_eq_layout(src, dst); + megdnn_assert_eq_layout(src, diff); + megdnn_assert_eq_layout(src, grad); + megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst, + diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/macro_helper.h b/dnn/src/common/macro_helper.h new file mode 100644 index 00000000..5c356c79 --- /dev/null +++ b/dnn/src/common/macro_helper.h @@ -0,0 +1,24 @@ +/** + * \file dnn/src/common/macro_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#ifdef MAKE_STR +#error "macro_helper.h not used with macro_helper_epilogue.h" +#endif + +#define MAKE_STR0(v) #v +#define MAKE_STR(v) MAKE_STR0(v) + +#define CONCAT_STR0(a, b) a ## b +#define CONCAT_STR(a, b) CONCAT_STR0(a, b) + +//! add _MEGDNN_SIMD_NAME to given prefix +#define WITH_SIMD_SUFFIX(prefix) CONCAT_STR(prefix##_, MEGDNN_SIMD_NAME) + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/macro_helper_epilogue.h b/dnn/src/common/macro_helper_epilogue.h new file mode 100644 index 00000000..df52d8d0 --- /dev/null +++ b/dnn/src/common/macro_helper_epilogue.h @@ -0,0 +1,19 @@ +/** + * \file dnn/src/common/macro_helper_epilogue.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#ifndef MAKE_STR +#error "macro_helper_epilogue.h must be used after macro_helper.h" +#endif + +#undef MAKE_STR +#undef MAKE_STR0 +#undef CONCAT_STR +#undef CONCAT_STR0 +#undef WITH_SIMD_SUFFIX diff --git a/dnn/src/common/mask_conv.cpp b/dnn/src/common/mask_conv.cpp new file mode 100644 index 00000000..ee2f1ff8 --- /dev/null +++ b/dnn/src/common/mask_conv.cpp @@ -0,0 +1,52 @@ +/** + * \file dnn/src/common/mask_conv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/nn.h" +#include "src/common/utils.h" + +using namespace megdnn; + +void MaskConvForward::deduce_dtype(DType src, DType filter, DType, DType& dst) { + check_or_deduce_dtype_fwd(src, filter, dst); +} + +void MaskConvForward::deduce_layout(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& mask, + TensorLayout& dst) { + deduce_layout_fwd(src, filter, dst); + megdnn_assert(dst[2] == mask[0]); + megdnn_assert(dst[3] == mask[1]); +} + +MaskConvForward::CanonizedFilterMeta +MaskConvForward::check_exec(const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& mask, const TensorLayout& dst, + size_t workspace_in_bytes) { + auto ret = check_layout_fwd(src, filter, dst); + megdnn_assert(dst[2] == mask[0]); + megdnn_assert(dst[3] == mask[1]); + auto required_workspace_in_bytes = + get_workspace_in_bytes(src, filter, mask, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + return ret; +} + +void MaskPropagate::deduce_layout(const TensorLayout& src, TensorLayout& dst) { + size_t oh, ow; + auto p = param(); + infer_conv_shape2d(src[0], src[1], (p.kernel_h - 1) * p.dilate_h + 1, + (p.kernel_w - 1) * p.dilate_w + 1, p.stride_h, + p.stride_w, p.pad_h, p.pad_w, oh, ow); + dst = TensorLayout{{oh, ow}, src.dtype}; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/matrix_inverse.cpp b/dnn/src/common/matrix_inverse.cpp new file mode 100644 index 00000000..d90a25b2 --- /dev/null +++ b/dnn/src/common/matrix_inverse.cpp @@ -0,0 +1,62 @@ +/** + * \file dnn/src/common/matrix_inverse.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
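MaskPropagate::deduce_layout above folds the dilation into an effective kernel extent, (k - 1) * dilate + 1, before running the usual shape inference, and MaskConvForward then insists that the mask matches the resulting output height and width. A short sketch of that arithmetic with local helper names (not megdnn functions):

#include <cstddef>
#include <cstdio>

static size_t dilated_extent(size_t k, size_t dilate) {
    return (k - 1) * dilate + 1;  // effective kernel size used above
}

static size_t conv_out(size_t in, size_t k, size_t stride, size_t pad) {
    return (in + 2 * pad - k) / stride + 1;
}

int main() {
    // A 3x3 kernel with dilation 2 acts like a 5x5 kernel; on a 32x32 input
    // with stride 1 and padding 2 the spatial size is kept, so the mask
    // passed to MaskConvForward must be exactly 32x32.
    size_t oh = conv_out(32, dilated_extent(3, 2), 1, 2);
    size_t ow = conv_out(32, dilated_extent(3, 2), 1, 2);
    std::printf("mask shape = {%zu, %zu}\n", oh, ow);
}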
+ */ +#include "megdnn/oprs/linalg.h" + +#include "src/common/utils.h" + +using namespace megdnn; + +void MatrixInverse::deduce_layout(const TensorLayout& src, TensorLayout& dst) { + canonize_params(src, nullptr, nullptr); + dst = src; +} + +size_t MatrixInverse::get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) { + size_t batch, n; + canonize_params(src, &batch, &n); + megdnn_assert(src.eq_layout(dst), "src and dst unequal: %s vs %s", + src.to_string().c_str(), dst.to_string().c_str()); + return get_workspace_in_bytes(batch, n, src.dtype.size()); +} + +void MatrixInverse::canonize_params(const TensorLayout& layout, size_t* batch, + size_t* n) { + megdnn_assert(layout.is_contiguous() && layout.ndim >= 2 && + layout[layout.ndim - 2] == layout[layout.ndim - 1], + "invalid MatrixInverse layout: %s", + layout.to_string().c_str()); + megdnn_assert( + MEGDNN_FLOAT16_SELECT(layout.dtype == dtype::Float16(), false) || + layout.dtype == dtype::Float32(), + "MatrixInverse only supports f16 & f32"); + if (batch) { + *batch = 1; + for (size_t i = 0; i < layout.ndim - 2; ++i) { + *batch *= layout[i]; + } + } + if (n) { + *n = layout[layout.ndim - 1]; + } +} + +void MatrixInverse::check_exec(const TensorLayout& src, const TensorLayout& dst, + _megdnn_workspace workspace, size_t* batch, + size_t* n) { + canonize_params(src, batch, n); + megdnn_assert(src.eq_layout(dst), "src and dst unequal: %s vs %s", + src.to_string().c_str(), dst.to_string().c_str()); + megdnn_assert(workspace.size >= + get_workspace_in_bytes(*batch, *n, src.dtype.size())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/matrix_mul.cpp b/dnn/src/common/matrix_mul.cpp new file mode 100644 index 00000000..f96c2b16 --- /dev/null +++ b/dnn/src/common/matrix_mul.cpp @@ -0,0 +1,196 @@ +/** + * \file dnn/src/common/matrix_mul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void MatrixMulForward::deduce_dtype(DType A, DType B, DType& C) { + // Expect that the user specifies output dtype (C), we then do sanity + // check on the dtype supplied by the user. C_dtype and C_dtype2 are the + // expected dtypes. If the user does not specify an output dtype by setting + // C = {}, we deduce one (C_dtype) and return it to the user. 
+ DType C_candi, C_candi2; + if (A.category() == DTypeCategory::FLOAT) { + C_candi = A; + } else if (A.enumv() == DTypeEnum::Int8) { + C_candi = dtype::Int32(); + C_candi2 = dtype::Int16(); + } else if (A.enumv() == DTypeEnum::Int16) { + C_candi = dtype::Int32(); + } else if (A.enumv() == DTypeEnum::QuantizedS8) { + C_candi = dtype::QuantizedS32(mul_scale(A, B)); + } else if (A.enumv() == DTypeEnum::Quantized8Asymm) { + C_candi = dtype::QuantizedS32(mul_scale(A, B)); + } else if (A.enumv() == DTypeEnum::Quantized4Asymm) { + C_candi = dtype::QuantizedS32(mul_scale(A, B)); + } + if (!C.valid()) { + C = C_candi; + } + megdnn_assert(C.valid() && (C == C_candi || C == C_candi2), + "unsupported MatMul(%s, %s) -> %s", A.name(), B.name(), + C.name()); +} + +void MatrixMulForward::deduce_layout(const TensorLayout& A, + const TensorLayout& B, TensorLayout& C) { + megdnn_assert(A.dtype.enumv() == B.dtype.enumv(), + "matmul input should be of same dtype, got %s and %s", + A.dtype.name(), B.dtype.name()); + deduce_dtype(A.dtype, B.dtype, C.dtype); + size_t A0, A1, B0, B1; + if (param().format == param::MatrixMul::Format::DEFAULT) { + megdnn_assert(A.ndim == 2 && B.ndim == 2, + "matmul requires input to be 2-dimensional; get: %s %s", + A.TensorShape::to_string().c_str(), + B.TensorShape::to_string().c_str()); + A0 = A.shape[0]; + A1 = A.shape[1]; + B0 = B.shape[0]; + B1 = B.shape[1]; + if (m_param.transposeA) + std::swap(A0, A1); + if (m_param.transposeB) + std::swap(B0, B1); + megdnn_assert(A1 == B0, + "shape mismatch in matmal: (transposed) A is (%zu,%zu), " + "(transposed) B is (%zu,%zu)", + A0, A1, B0, B1); + C = TensorLayout(TensorShape({A0, B1}), C.dtype); + } else { + auto do_deduce = [&](size_t pack_size) { + megdnn_assert( + A.ndim == 4 && B.ndim == 3, + "matmul requires input dimension to be A(4), B(3); get: %s %s", + A.TensorShape::to_string().c_str(), + B.TensorShape::to_string().c_str()); + A0 = A.shape[0]; + A1 = A.shape[1]; + B0 = B.shape[0]; + B1 = B.shape[1]; + if (m_param.transposeA) + std::swap(A0, A1); + if (m_param.transposeB) + std::swap(B0, B1); + megdnn_assert( + A1 == B0, + "shape mismatch in matmal: (transposed) A is (%zu,%zu,4,4), " + "(transposed) B is (%zu,%zu,4)", + A0, A1, B0, B1); + C = TensorLayout(TensorShape({A0, B1, pack_size}), C.dtype); + }; + do_deduce(pack_size(param().format)); + } +} + +void MatrixMulForward::check_exec(const TensorLayout& A, const TensorLayout& B, + const TensorLayout& C, + size_t workspace_in_bytes) { + auto errmsg = [&]() { + std::string msg; + msg.append(megdnn_mangle("A=")); + msg.append(A.to_string()); + msg.append(megdnn_mangle(", B=")); + msg.append(B.to_string()); + msg.append(megdnn_mangle(", C=")); + msg.append(C.to_string()); + msg.append(megdnn_mangle(", transposeA=")); + msg.append(std::to_string(param().transposeA)); + msg.append(megdnn_mangle(", transposeB=")); + msg.append(std::to_string(param().transposeB)); + return msg; + }; + MEGDNN_MARK_USED_VAR(errmsg); + if (param().format == param::MatrixMul::Format::DEFAULT) { + megdnn_assert_eq_size_t(A.ndim, 2_z); + megdnn_assert_eq_size_t(B.ndim, 2_z); + megdnn_assert_eq_size_t(C.ndim, 2_z); + + megdnn_assert(A.stride[1] == 1); + megdnn_assert(A.stride[0] >= static_cast(A.shape[1])); + megdnn_assert(B.stride[1] == 1); + megdnn_assert(B.stride[0] >= static_cast(B.shape[1])); + megdnn_assert(C.stride[1] == 1); + megdnn_assert(C.stride[0] >= static_cast(C.shape[1])); + size_t A0, A1, B0, B1, C0, C1; + A0 = A.shape[0]; + A1 = A.shape[1]; + B0 = B.shape[0]; + B1 = B.shape[1]; + C0 = 
C.shape[0]; + C1 = C.shape[1]; + if (m_param.transposeA) + std::swap(A0, A1); + if (m_param.transposeB) + std::swap(B0, B1); + megdnn_assert(A0 == C0, "%s", errmsg().c_str()); + megdnn_assert(B1 == C1, "%s", errmsg().c_str()); + megdnn_assert(A1 == B0, "%s", errmsg().c_str()); + } else { + megdnn_assert_eq_size_t(A.ndim, 4_z); + megdnn_assert_eq_size_t(B.ndim, 3_z); + megdnn_assert_eq_size_t(C.ndim, 3_z); + + megdnn_assert_contiguous(A); + megdnn_assert_contiguous(B); + megdnn_assert_contiguous(C); + size_t A0, A1, B0, B1, C0, C1; + A0 = A.shape[0]; + A1 = A.shape[1]; + B0 = B.shape[0]; + B1 = B.shape[1]; + C0 = C.shape[0]; + C1 = C.shape[1]; + if (m_param.transposeA) + std::swap(A0, A1); + if (m_param.transposeB) + std::swap(B0, B1); + megdnn_assert(A0 == C0, "%s", errmsg().c_str()); + megdnn_assert(B1 == C1, "%s", errmsg().c_str()); + megdnn_assert(A1 == B0, "%s", errmsg().c_str()); + } + + megdnn_assert(A.dtype.enumv() == B.dtype.enumv()); + if (A.dtype.category() == DTypeCategory::FLOAT) { + megdnn_assert(A.dtype == C.dtype); + } else if (A.dtype == dtype::Int8()) { + megdnn_assert(C.dtype == dtype::Int16() || C.dtype == dtype::Int32()); + } else if (A.dtype.enumv() == DTypeEnum::QuantizedS8 || + A.dtype.enumv() == DTypeEnum::Quantized8Asymm || + A.dtype.enumv() == DTypeEnum::Quantized4Asymm) { + megdnn_assert(C.dtype.enumv() == DTypeEnum::QuantizedS32); + } + megdnn_assert(param().compute_mode != + Param::ComputeMode::FLOAT32 MEGDNN_INC_FLOAT16( + || A.dtype == dtype::Float16()), + "ComputeMode::FLOAT32 is only available for Float16 " + "input / output."); + auto required_workspace_in_bytes = get_workspace_in_bytes(A, B, C); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +size_t MatrixMulForward::pack_size(const Param::Format format) { + switch (format) { + case Param::Format::DEFAULT: + return 1; + case Param::Format::MK4: + return 4; + case Param::Format::MK8: + return 8; + default: + megdnn_throw(megdnn_mangle("Unknown matmul format.")); + } +} + +} // namespace megdnn + // vim: syntax=cpp.doxygen diff --git a/dnn/src/common/max_tensor_diff.cpp b/dnn/src/common/max_tensor_diff.cpp new file mode 100644 index 00000000..47e765fa --- /dev/null +++ b/dnn/src/common/max_tensor_diff.cpp @@ -0,0 +1,39 @@ +/** + * \file dnn/src/common/max_tensor_diff.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
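deduce_layout and check_exec above encode the matmul shape rules: with Format::DEFAULT both operands are 2-D and C is (M, N) after the optional transposes; with MK4/MK8, A is 4-D, B is 3-D and C carries a trailing pack dimension of pack_size(). A small sketch of the DEFAULT bookkeeping; Shape2/matmul_default are local names for this sketch, not the megdnn API.

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <utility>

struct Shape2 {
    size_t rows, cols;
};

// Shape of C for Format::DEFAULT, mirroring the swap-then-match logic above.
static std::array<size_t, 2> matmul_default(Shape2 a, bool trans_a, Shape2 b,
                                            bool trans_b) {
    size_t A0 = a.rows, A1 = a.cols, B0 = b.rows, B1 = b.cols;
    if (trans_a) std::swap(A0, A1);
    if (trans_b) std::swap(B0, B1);
    assert(A1 == B0);  // what deduce_layout reports as "shape mismatch"
    return {A0, B1};
}

int main() {
    auto c = matmul_default({64, 32}, false, {128, 32}, true);  // transposeB
    std::printf("C = (%zu, %zu)\n", c[0], c[1]);  // (64, 128)
    // For Format::MK4 the same M/K/N bookkeeping runs on the packed shapes:
    // A (M/4, K/4, 4, 4) x B (K/4, N, 4) -> C (M/4, N, 4).
}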
+ */ + +#include "megdnn/oprs.h" +#include "megdnn/tensor_format.h" +#include "src/common/utils.h" + +using namespace megdnn; + +void megdnn::MaxTensorDiff::check_exec(const TensorLayout& layout1, + const TensorLayout& layout2, + size_t workspace_in_bytes) { + megdnn_assert(layout1.eq_layout(layout2), "layout1: %s, layout2: %s", + layout1.to_string().c_str(), layout2.to_string().c_str()); + if (Image2DPack4TensorFormat::is_valid_image(layout1)) { + megdnn_assert(layout1.is_contiguous() && layout1.ndim == 2 && + layout1.shape[0] && layout1.eq_layout(layout2), + "layout1: %s, layout2: %s", layout1.to_string().c_str(), + layout2.to_string().c_str()); + } else { + megdnn_assert(layout1.is_contiguous() && + (layout1.ndim == 1 || layout1.ndim == 2) && + layout1.shape[0] && layout1.eq_layout(layout2), + "layout1: %s, layout2: %s", layout1.to_string().c_str(), + layout2.to_string().c_str()); + } + auto required_workspace_in_bytes = get_workspace_in_bytes(layout1, layout2); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/common/computing_context.cpp b/dnn/src/common/megcore/common/computing_context.cpp new file mode 100644 index 00000000..89a129c1 --- /dev/null +++ b/dnn/src/common/megcore/common/computing_context.cpp @@ -0,0 +1,43 @@ +/** + * \file dnn/src/common/megcore/common/computing_context.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/utils.h" + +#include "./computing_context.hpp" +#include "../cpu/default_computing_context.hpp" +#if MEGDNN_WITH_CUDA +#include "src/cuda/megcore/cuda_computing_context.hpp" +#endif + + +using namespace megcore; +using namespace megdnn; + +std::unique_ptr ComputingContext::make( + megcoreDeviceHandle_t dev_handle, unsigned int flags) +{ + megcorePlatform_t platform; + megcoreGetPlatform(dev_handle, &platform); + switch (platform) { + case megcorePlatformCPU: + return make_unique(dev_handle, flags); +#if MEGDNN_WITH_CUDA + case megcorePlatformCUDA: + return make_unique(dev_handle, flags); +#endif + default: + megdnn_throw("bad platform"); + } +} + +ComputingContext::~ComputingContext() noexcept = default; + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/common/computing_context.hpp b/dnn/src/common/megcore/common/computing_context.hpp new file mode 100644 index 00000000..cab1e52e --- /dev/null +++ b/dnn/src/common/megcore/common/computing_context.hpp @@ -0,0 +1,52 @@ +/** + * \file dnn/src/common/megcore/common/computing_context.hpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once + +#include "./device_context.hpp" + +namespace megcore { + +class ComputingContext { + public: + static std::unique_ptr make( + megcoreDeviceHandle_t dev_handle, unsigned int flags); + + virtual ~ComputingContext() noexcept; + + megcoreDeviceHandle_t dev_handle() const noexcept { + return dev_handle_; + } + + unsigned int flags() const noexcept { + return flags_; + } + + virtual void memcpy(void *dst, const void *src, + size_t size_in_bytes, + megcoreMemcpyKind_t kind) = 0; + virtual void memset(void *dst, int value, size_t size_in_bytes) = 0; + virtual void synchronize() = 0; + + protected: + ComputingContext(megcoreDeviceHandle_t dev_handle, unsigned int flags): + dev_handle_{dev_handle}, + flags_{flags} + {} + + private: + megcoreDeviceHandle_t dev_handle_; + unsigned int flags_; +}; + +} // namespace megcore + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/common/device_context.cpp b/dnn/src/common/megcore/common/device_context.cpp new file mode 100644 index 00000000..f66da9a7 --- /dev/null +++ b/dnn/src/common/megcore/common/device_context.cpp @@ -0,0 +1,41 @@ +/** + * \file dnn/src/common/megcore/common/device_context.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./device_context.hpp" + +#include "src/common/utils.h" +#include "../cpu/default_device_context.hpp" +#if MEGDNN_WITH_CUDA +#include "src/cuda/megcore/cuda_device_context.hpp" +#endif + + +using namespace megcore; +using namespace megdnn; + +std::unique_ptr DeviceContext::make(megcorePlatform_t platform, + int deviceID, unsigned int flags) +{ + switch (platform) { + case megcorePlatformCPU: + return make_unique(deviceID, flags); +#if MEGDNN_WITH_CUDA + case megcorePlatformCUDA: + return make_unique(deviceID, flags); +#endif + default: + megdnn_throw("bad platform"); + } +} + +DeviceContext::~DeviceContext() noexcept = default; + + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/common/device_context.hpp b/dnn/src/common/megcore/common/device_context.hpp new file mode 100644 index 00000000..765132be --- /dev/null +++ b/dnn/src/common/megcore/common/device_context.hpp @@ -0,0 +1,61 @@ +/** + * \file dnn/src/common/megcore/common/device_context.hpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "megcore.h" + +#include + +namespace megcore { + +class DeviceContext { + public: + static std::unique_ptr make(megcorePlatform_t platform, + int deviceID, unsigned int flags); + + virtual ~DeviceContext() noexcept; + + megcorePlatform_t platform() const noexcept { + return platform_; + } + + int device_id() const noexcept { + return device_id_; + } + + unsigned int flags() const noexcept { + return flags_; + } + + virtual size_t mem_alignment_in_bytes() const noexcept = 0; + + virtual void activate() = 0; + virtual void *malloc(size_t size_in_bytes) = 0; + virtual void free(void *ptr) = 0; + + protected: + DeviceContext(megcorePlatform_t platform, + int device_id, unsigned int flags): + platform_(platform), + device_id_(device_id), + flags_(flags) + { + } + + private: + megcorePlatform_t platform_; + int device_id_; + unsigned int flags_; +}; + +} // namespace megcore + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/cpu/api.cpp b/dnn/src/common/megcore/cpu/api.cpp new file mode 100644 index 00000000..5cc92e45 --- /dev/null +++ b/dnn/src/common/megcore/cpu/api.cpp @@ -0,0 +1,49 @@ +/** + * \file dnn/src/common/megcore/cpu/api.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megcore.h" +#include "src/common/utils.h" + +#include "./default_computing_context.hpp" +#include "../common/computing_context.hpp" +#include "../public_api/computing.hpp" + +using namespace megcore; + +CPUDispatcher::~CPUDispatcher() noexcept = default; + +megcoreStatus_t megcoreCreateComputingHandleWithCPUDispatcher( + megcoreComputingHandle_t *compHandle, + megcoreDeviceHandle_t devHandle, + const std::shared_ptr& dispatcher, + unsigned int flags) { + auto content = megdnn::make_unique< + megcore::cpu::DefaultComputingContext>(devHandle, flags); + auto &H = *compHandle; + content->set_dispatcher(dispatcher); + H = new megcoreComputingContext; + H->content = std::move(content); + return megcoreSuccess; +} + +CPUDispatcher* megcoreGetCPUDispatcher(megcoreComputingHandle_t handle) { + auto &&H = handle; + megdnn_assert(H); + // Check device handle. + megcoreDeviceHandle_t dev_handle = H->content->dev_handle(); + megcorePlatform_t platform; + megcoreGetPlatform(dev_handle, &platform); + megdnn_assert(platform &megcorePlatformCPU); + auto context = static_cast( + H->content.get()); + return context->get_dispatcher(); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/cpu/default_computing_context.cpp b/dnn/src/common/megcore/cpu/default_computing_context.cpp new file mode 100644 index 00000000..b49fc167 --- /dev/null +++ b/dnn/src/common/megcore/cpu/default_computing_context.cpp @@ -0,0 +1,66 @@ +/** + * \file dnn/src/common/megcore/cpu/default_computing_context.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/common/utils.h" +#include "./default_computing_context.hpp" + +#include + +namespace { +class InplaceDispatcher final : public MegcoreCPUDispatcher { +public: + void dispatch(Task&& task) override { task(); } + + void dispatch(MultiThreadingTask&& task, size_t parallelism) override { + for (size_t i = 0; i < parallelism; i++) { + task(i, 0); + } + } + + void sync() override {} + + size_t nr_threads() override { return 1; }; +}; +} // namespace + +using namespace megcore; +using namespace cpu; + +DefaultComputingContext::DefaultComputingContext( + megcoreDeviceHandle_t dev_handle, unsigned int flags): + ComputingContext(dev_handle, flags), + m_dispatcher{megdnn::make_unique()} +{ + megcorePlatform_t platform; + megcoreGetPlatform(dev_handle, &platform); + megdnn_assert(platform & megcorePlatformCPU); +} + +DefaultComputingContext::~DefaultComputingContext() noexcept = default; + +void DefaultComputingContext::memcpy(void *dst, const void *src, + size_t size_in_bytes, + megcoreMemcpyKind_t /* kind */) +{ + ::memcpy(dst, src, size_in_bytes); +} + +void DefaultComputingContext::memset(void *dst, int value, size_t size_in_bytes) +{ + ::memset(dst, value, size_in_bytes); +} + +void DefaultComputingContext::synchronize() +{ + m_dispatcher->sync(); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/cpu/default_computing_context.hpp b/dnn/src/common/megcore/cpu/default_computing_context.hpp new file mode 100644 index 00000000..9a3e507e --- /dev/null +++ b/dnn/src/common/megcore/cpu/default_computing_context.hpp @@ -0,0 +1,49 @@ +/** + * \file dnn/src/common/megcore/cpu/default_computing_context.hpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "../common/computing_context.hpp" + +namespace megcore { +namespace cpu { + +/** + * \brief A thin wrapper over memcpy and memset. + * + * No magic thing happens here. + */ +class DefaultComputingContext: public ComputingContext { + std::shared_ptr m_dispatcher; + + public: + DefaultComputingContext(megcoreDeviceHandle_t dev_handle, + unsigned int flags); + ~DefaultComputingContext() noexcept; + + void set_dispatcher( + const std::shared_ptr& dispatcher) { + m_dispatcher = dispatcher; + } + + MegcoreCPUDispatcher* get_dispatcher() const { + return m_dispatcher.get(); + } + + void memcpy(void *dst, const void *src, size_t size_in_bytes, + megcoreMemcpyKind_t kind) override; + void memset(void *dst, int value, size_t size_in_bytes) override; + void synchronize() override; +}; + +} // namespace cpu +} // namespace megcore + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/cpu/default_device_context.cpp b/dnn/src/common/megcore/cpu/default_device_context.cpp new file mode 100644 index 00000000..b843849b --- /dev/null +++ b/dnn/src/common/megcore/cpu/default_device_context.cpp @@ -0,0 +1,44 @@ +/** + * \file dnn/src/common/megcore/cpu/default_device_context.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/utils.h" + +#include "./default_device_context.hpp" +#include + +using namespace megcore; +using namespace megcore::cpu; +using namespace megdnn; + +DefaultDeviceContext::DefaultDeviceContext(int device_id, unsigned int flags): + DeviceContext(megcorePlatformCPU, device_id, flags) +{ + megdnn_assert(device_id == -1); +} + +DefaultDeviceContext::~DefaultDeviceContext() noexcept = default; + +size_t DefaultDeviceContext::mem_alignment_in_bytes() const noexcept { + return 1; +} + +void DefaultDeviceContext::activate() noexcept { +} + +void *DefaultDeviceContext::malloc(size_t size_in_bytes) { + return new uint8_t[size_in_bytes]; +} + +void DefaultDeviceContext::free(void *ptr) { + delete []static_cast(ptr); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/cpu/default_device_context.hpp b/dnn/src/common/megcore/cpu/default_device_context.hpp new file mode 100644 index 00000000..e425ec33 --- /dev/null +++ b/dnn/src/common/megcore/cpu/default_device_context.hpp @@ -0,0 +1,38 @@ +/** + * \file dnn/src/common/megcore/cpu/default_device_context.hpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "../common/device_context.hpp" + +namespace megcore { +namespace cpu { + +/** + * \brief A thin wrapper class over malloc and free. + * + * No magic thing happens here. + */ +class DefaultDeviceContext: public DeviceContext { + public: + DefaultDeviceContext(int device_id, unsigned int flags); + ~DefaultDeviceContext() noexcept; + + size_t mem_alignment_in_bytes() const noexcept override; + + void activate() noexcept override; + void *malloc(size_t size_in_bytes) override; + void free(void *ptr) override; +}; + +} // namespace cpu +} // namespace megcore + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/public_api/computing.cpp b/dnn/src/common/megcore/public_api/computing.cpp new file mode 100644 index 00000000..4ff00170 --- /dev/null +++ b/dnn/src/common/megcore/public_api/computing.cpp @@ -0,0 +1,82 @@ +/** + * \file dnn/src/common/megcore/public_api/computing.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megcore.h" +#include "src/common/utils.h" + +#include "./computing.hpp" +#include "../common/computing_context.hpp" + +using namespace megcore; + +megcoreStatus_t megcoreCreateComputingHandle( + megcoreComputingHandle_t *compHandle, + megcoreDeviceHandle_t devHandle, + unsigned int flags) +{ + auto ctx = ComputingContext::make(devHandle, flags); + auto &H = *compHandle; + H = new megcoreComputingContext; + H->content = std::move(ctx); + return megcoreSuccess; +} + +megcoreStatus_t megcoreDestroyComputingHandle( + megcoreComputingHandle_t handle) +{ + megdnn_assert(handle); + delete handle; + return megcoreSuccess; +} + +megcoreStatus_t megcoreGetDeviceHandle( + megcoreComputingHandle_t compHandle, + megcoreDeviceHandle_t *devHandle) +{ + megdnn_assert(compHandle); + *devHandle = compHandle->content->dev_handle(); + return megcoreSuccess; +} + +megcoreStatus_t megcoreGetComputingFlags( + megcoreComputingHandle_t handle, + unsigned int *flags) +{ + megdnn_assert(handle); + *flags = handle->content->flags(); + return megcoreSuccess; +} + +megcoreStatus_t megcoreMemcpy(megcoreComputingHandle_t handle, + void *dst, const void *src, size_t sizeInBytes, + megcoreMemcpyKind_t kind) +{ + megdnn_assert(handle); + handle->content->memcpy(dst, src, sizeInBytes, kind); + return megcoreSuccess; +} + +megcoreStatus_t megcoreMemset(megcoreComputingHandle_t handle, + void *dst, int value, size_t sizeInBytes) +{ + megdnn_assert(handle); + handle->content->memset(dst, value, sizeInBytes); + return megcoreSuccess; +} + +megcoreStatus_t megcoreSynchronize(megcoreComputingHandle_t handle) +{ + megdnn_assert(handle); + handle->content->synchronize(); + return megcoreSuccess; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/public_api/computing.hpp b/dnn/src/common/megcore/public_api/computing.hpp new file mode 100644 index 00000000..a264723c --- /dev/null +++ b/dnn/src/common/megcore/public_api/computing.hpp @@ -0,0 +1,21 @@ +/** + * \file dnn/src/common/megcore/public_api/computing.hpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megcore.h" +#include "../common/computing_context.hpp" +#include + +struct megcoreComputingContext { + std::unique_ptr content; +}; + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/public_api/device.cpp b/dnn/src/common/megcore/public_api/device.cpp new file mode 100644 index 00000000..96dfaa76 --- /dev/null +++ b/dnn/src/common/megcore/public_api/device.cpp @@ -0,0 +1,92 @@ +/** + * \file dnn/src/common/megcore/public_api/device.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megcore.h" +#include "src/common/utils.h" + +#include "./device.hpp" +#include "../common/device_context.hpp" + +using namespace megcore; + +megcoreStatus_t megcoreCreateDeviceHandle( + megcoreDeviceHandle_t *handle, + megcorePlatform_t platform, int deviceID, unsigned int flags) +{ + auto ctx = DeviceContext::make(platform, deviceID, flags); + auto &H = *handle; + H = new megcoreDeviceContext; + H->content = std::move(ctx); + return megcoreSuccess; +} + +megcoreStatus_t megcoreDestroyDeviceHandle( + megcoreDeviceHandle_t handle) +{ + megdnn_assert(handle); + delete handle; + return megcoreSuccess; +} + +megcoreStatus_t megcoreGetPlatform(megcoreDeviceHandle_t handle, + megcorePlatform_t *platform) +{ + megdnn_assert(handle); + *platform = handle->content->platform(); + return megcoreSuccess; +} + +megcoreStatus_t megcoreGetDeviceID(megcoreDeviceHandle_t handle, + int *deviceID) +{ + megdnn_assert(handle); + *deviceID = handle->content->device_id(); + return megcoreSuccess; +} + +megcoreStatus_t megcoreGetDeviceFlags(megcoreDeviceHandle_t handle, + unsigned int *flags) +{ + megdnn_assert(handle); + *flags = handle->content->flags(); + return megcoreSuccess; +} + +megcoreStatus_t megcoreGetMemAlignment(megcoreDeviceHandle_t handle, + size_t *memAlignmentInBytes) +{ + megdnn_assert(handle); + *memAlignmentInBytes = handle->content->mem_alignment_in_bytes(); + return megcoreSuccess; +} + +megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle) +{ + megdnn_assert(handle); + handle->content->activate(); + return megcoreSuccess; +} + +megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle, + void **devPtr, size_t sizeInBytes) +{ + megdnn_assert(handle); + *devPtr = handle->content->malloc(sizeInBytes); + return megcoreSuccess; +} + +megcoreStatus_t megcoreFree(megcoreDeviceHandle_t handle, void *devPtr) +{ + megdnn_assert(handle); + handle->content->free(devPtr); + return megcoreSuccess; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/public_api/device.hpp b/dnn/src/common/megcore/public_api/device.hpp new file mode 100644 index 00000000..61fb5a5a --- /dev/null +++ b/dnn/src/common/megcore/public_api/device.hpp @@ -0,0 +1,20 @@ +/** + * \file dnn/src/common/megcore/public_api/device.hpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megcore.h" +#include "../common/device_context.hpp" +#include + +struct megcoreDeviceContext { + std::unique_ptr content; +}; + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/megcore/public_api/misc.cpp b/dnn/src/common/megcore/public_api/misc.cpp new file mode 100644 index 00000000..50da099d --- /dev/null +++ b/dnn/src/common/megcore/public_api/misc.cpp @@ -0,0 +1,30 @@ +/** + * \file dnn/src/common/megcore/public_api/misc.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megcore.h" +#include "src/common/utils.h" + +const char *megcoreGetErrorName(megcoreStatus_t status) +{ +#define CASE(x) case x: return megdnn_mangle(#x) + switch (status) { + CASE(megcoreSuccess); + CASE(megcoreErrorMemoryAllocation); + CASE(megcoreErrorInvalidArgument); + CASE(megcoreErrorInvalidDeviceHandle); + CASE(megcoreErrorInternalError); + CASE(megcoreErrorInvalidComputingHandle); + default: + return megdnn_mangle(""); + } +#undef CASE +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/mesh_indexing.cpp b/dnn/src/common/mesh_indexing.cpp new file mode 100644 index 00000000..6c1cd73e --- /dev/null +++ b/dnn/src/common/mesh_indexing.cpp @@ -0,0 +1,86 @@ +/** + * \file dnn/src/common/mesh_indexing.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/general.h" +#include "src/common/utils.h" + +namespace megdnn { + +/* ============================== MeshIndexing ============================= */ + +void MeshBase::check_exec(const TensorLayout& origin, + const TensorLayout& indexed, const IndexDesc& desc) { + megdnn_assert(origin.dtype == indexed.dtype); + megdnn_assert(origin.ndim == indexed.ndim); + for (auto&& index : desc) { + megdnn_assert(index.vec.layout.dtype == dtype::Int32()); + } +} + +void NormalMeshBase::check_exec(const TensorLayout& src, + const TensorLayout& dst, + const IndexDesc& desc) { + MeshBase::check_exec(src, dst, desc); + for (auto&& index : desc) { + size_t ndim = index.vec.layout.ndim; + megdnn_assert(ndim == 1, "index must be 1-dim vector, while dim %zu", + ndim); + megdnn_assert(dst.shape[index.axis] == index.vec.layout[0]); + } +} + +void BatchedMeshBase::check_exec(const TensorLayout& src, + const TensorLayout& dst, + const IndexDesc& desc) { + MeshBase::check_exec(src, dst, desc); + megdnn_assert(src[0] == dst[0], "batch mismatch, src %zu, dst %zu", src[0], + dst[0]); + for (auto&& index : desc) { + size_t ndim = index.vec.layout.ndim; + megdnn_assert(ndim == 2, "index must be a 2-dim matrix, while ndim %zu", + ndim); + megdnn_assert(dst[0] == index.vec.layout[0] && + dst[index.axis] == index.vec.layout[1], + "require each index shape equals (%zu, %zu), but got " + "(%zu, %zu)", + dst[0], dst[index.axis], index.vec.layout[0], + index.vec.layout[1]); + megdnn_assert(index.axis != 0, + "index axis should be 0-th dim when executing " + "BatchedMeshIndexing"); + } +} + +void MeshIndexing::deduce_layout(const TensorLayout& inp, + const IndexDescLayoutOnly& layouts, + TensorLayout& out_layout) { + out_layout = inp; + for (auto&& index : layouts) { + megdnn_assert(index.layout.ndim == 1, + "mesh indexing require index being 1-dim vector"); + out_layout[index.axis] = index.layout[0]; + } + out_layout.init_contiguous_stride(); +} + +void BatchedMeshIndexing::deduce_layout(const TensorLayout& inp, + const IndexDescLayoutOnly& layouts, + TensorLayout& out_layout) { + out_layout = inp; + for (auto&& index : layouts) { + megdnn_assert(index.layout.ndim == 2, + "batch mesh indexing require index being 2-dim matrix"); + out_layout[index.axis] = index.layout[1]; + } + out_layout.init_contiguous_stride(); +} + +} // namespace megdnn diff --git a/dnn/src/common/metahelper.h b/dnn/src/common/metahelper.h 
new file mode 100644 index 00000000..536d5046 --- /dev/null +++ b/dnn/src/common/metahelper.h @@ -0,0 +1,28 @@ +/** + * \file dnn/src/common/metahelper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +namespace megdnn { +/*! + * \brief base class for non-copyable objects + */ +class NonCopyableObj { + NonCopyableObj(const NonCopyableObj&) = delete; + NonCopyableObj& operator=(const NonCopyableObj&) = delete; + +public: + NonCopyableObj() = default; +}; + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/opr_delegate.cpp b/dnn/src/common/opr_delegate.cpp new file mode 100644 index 00000000..2cefdb46 --- /dev/null +++ b/dnn/src/common/opr_delegate.cpp @@ -0,0 +1,34 @@ +/** + * \file dnn/src/common/opr_delegate.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/opr_delegate.h" + +using namespace megdnn; + +const std::shared_ptr& megdnn::inplace_cpu_handle() { + auto make = []() { + megcoreDeviceHandle_t dev_handle; + megcoreCreateDeviceHandle(&dev_handle, megcorePlatformCPU); + megcoreComputingHandle_t comp_handle; + megcoreCreateComputingHandle(&comp_handle, dev_handle); + auto destructor = [=]() { + megcoreDestroyComputingHandle(comp_handle); + megcoreDestroyDeviceHandle(dev_handle); + }; + std::shared_ptr handle = Handle::make(comp_handle); + handle->set_destructor(destructor); + return handle; + }; + static std::shared_ptr handle = make(); + return handle; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/opr_delegate.h b/dnn/src/common/opr_delegate.h new file mode 100644 index 00000000..74491d92 --- /dev/null +++ b/dnn/src/common/opr_delegate.h @@ -0,0 +1,76 @@ +/** + * \file dnn/src/common/opr_delegate.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/handle.h" +#include "megdnn/oprs/base.h" + +#include "src/common/utils.h" + +namespace megdnn { + +/*! + * \brief get a handle that dispatches to caller cpu thread + * + * Usually used for calling other opr impls from some opr impl. You probably + * want to use CpuOprDelegationStorage instead. + */ +const std::shared_ptr& inplace_cpu_handle(); + +/*! + * \brief storage for oprs on inplace CPU handle + * + * This class takes care of thread safety and destruction order. 
Usage example: + * + * MatrixMul* get_matmul() { + * static CpuOprDelegationStorage<> storage; + * return storage.get(); + * } + */ +template +class CpuOprDelegationStorage { + std::mutex m_mtx; + std::shared_ptr m_handle; + std::unique_ptr m_oprs[nr_opr]; + +public: + ~CpuOprDelegationStorage(); + + template + Opr* get(const typename Opr::Param& param = {}); +}; + +template +CpuOprDelegationStorage::~CpuOprDelegationStorage() = default; + +template +template +Opr* CpuOprDelegationStorage::get(const typename Opr::Param& param) { + static_assert(idx < nr_opr, "invalid idx"); + if (!m_oprs[idx]) { + MEGDNN_LOCK_GUARD(m_mtx); + if (!m_oprs[idx]) { + if (!m_handle) { + m_handle = inplace_cpu_handle(); + } + auto opr = m_handle->create_operator(); + megdnn_assert(opr->is_thread_safe()); + opr->param() = param; + m_oprs[idx] = std::move(opr); + } + } + return static_cast(m_oprs[idx].get()); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/param_pack.cpp b/dnn/src/common/param_pack.cpp new file mode 100644 index 00000000..e54093b7 --- /dev/null +++ b/dnn/src/common/param_pack.cpp @@ -0,0 +1,75 @@ +/** + * \file dnn/src/common/param_pack.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/general.h" +#include "src/common/utils.h" + +using namespace megdnn; + +void ParamPackConcatSplitBase::check_exec(const TensorLayout& concated, + const TensorLayout& table, + const TensorLayout& parts) { + megdnn_assert(table.dtype == dtype::Int32{}, "bad dtype: %s", + table.dtype.name()); + megdnn_assert(concated.ndim == 1 && table.ndim == 1 && parts.ndim == 1 && + concated.stride[0] == 1 && table.stride[0] == 1 && + parts.stride[0] == 1, + "bad layout: concated=%s table=%s parts=%s", + concated.to_string().c_str(), table.to_string().c_str(), + parts.to_string().c_str()); + megdnn_assert(table.shape[0] == concated.shape[0] * 2, + "concated=%zu table=%zu", concated.shape[0], table.shape[0]); +} + +std::vector ParamPackConcatSplitBase::gen_table( + const TensorShapeArray& shapes, size_t alignment, size_t dtype_size) { + megdnn_assert(alignment && (alignment & (alignment - 1)) == 0, + "alignment must be power of 2: %zu", alignment); + if (alignment < dtype_size) + alignment = dtype_size; + + megdnn_assert(alignment % dtype_size == 0, + "alignment must be multiple of dtype size: %zu vs %zu", + alignment, dtype_size); + alignment /= dtype_size; + + auto get_aligned = [alignment](size_t v) { + auto mod = v & (alignment - 1); + return v + ((alignment - mod) & (alignment - 1)); + }; + + size_t offset = 0; + for (auto&& i : shapes) { + offset = get_aligned(offset) + i.total_nr_elems(); + } + + std::vector table(offset * 2); + auto outer_table = table.data(), inner_table = outer_table + offset; + + offset = 0; + for (size_t i = 0; i < shapes.size(); ++i) { + auto aligned = get_aligned(offset); + for (size_t j = offset; j < aligned; ++j) { + inner_table[j] = outer_table[j] = -1; + } + offset = aligned; + auto cur_size = shapes[i].total_nr_elems(); + for (size_t j = 0; j < cur_size; ++j) { + outer_table[offset + j] = i; + inner_table[offset + j] = j; + } + offset += cur_size; + } + megdnn_assert(offset * 2 == table.size()); + return table; +} + 
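gen_table above produces two parallel per-slot tables over the aligned, concatenated buffer: outer_table says which part owns a slot (-1 for alignment padding) and inner_table gives the offset inside that part. A worked instance of the layout it produces, with the values filled in by hand for two parts of 3 and 5 elements at 4-element alignment, so treat the numbers as illustrative:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // parts of 3 and 5 float32 elements, 16-byte alignment -> 4-element
    // alignment once gen_table divides by dtype_size:
    //
    //   slot       : 0  1  2  3  4  5  6  7  8
    //   outer_table: 0  0  0 -1  1  1  1  1  1   (owning part, -1 = padding)
    //   inner_table: 0  1  2 -1  0  1  2  3  4   (offset within that part)
    //
    // Slot 3 only exists to round part 1 up to the next aligned offset, so
    // both tables mark it -1 and the concat/split kernels skip it.
    std::vector<int> outer{0, 0, 0, -1, 1, 1, 1, 1, 1};
    std::vector<int> inner{0, 1, 2, -1, 0, 1, 2, 3, 4};
    for (size_t i = 0; i < outer.size(); ++i)
        std::printf("slot %zu -> part %d, offset %d\n", i, outer[i], inner[i]);
}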
+// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/pooling.cpp b/dnn/src/common/pooling.cpp new file mode 100644 index 00000000..756507cd --- /dev/null +++ b/dnn/src/common/pooling.cpp @@ -0,0 +1,153 @@ +/** + * \file dnn/src/common/pooling.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void PoolingBase::deduce_layout_fwd(const TensorLayout& src, + TensorLayout& dst) { + auto errmsg = + megdnn_layout_msg(src) + ", " + megdnn_layout_msg(dst) + ", " + + megdnn_mangle("pad_h=") + std::to_string(param().pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param().pad_w) + ", " + + megdnn_mangle("stride_h=") + std::to_string(param().stride_h) + + ", " + megdnn_mangle("stride_w=") + + std::to_string(param().stride_w) + ", " + + megdnn_mangle("window_h=") + std::to_string(param().window_h) + + ", " + megdnn_mangle("window_w=") + + std::to_string(param().window_w) + ", " + megdnn_mangle("is_max=") + + std::to_string(param().mode == Mode::MAX) + ", " + + megdnn_mangle("is_nhwc=") + + std::to_string(param().format == Param::Format::NHWC) + ", " + + megdnn_mangle("is_nhwcd4=") + + std::to_string(param().format == Param::Format::NHWCD4); + auto errmsg_c = errmsg.c_str(); + + MEGDNN_MARK_USED_VAR(errmsg_c); + megdnn_assert_contiguous(src); + size_t spatial_pos, c_pos, batch_pos = 0; + if (param().format == Param::Format::NCHW) { + megdnn_assert(src.ndim == 4_z, "%s", errmsg_c); + + spatial_pos = 2; + c_pos = 1; + } else if (param().format == Param::Format::NHWC) { + megdnn_assert(src.ndim == 4_z, "%s", errmsg_c); + + spatial_pos = 1; + c_pos = 3; + } else if (param().format == Param::Format::NCHW4 || + param().format == Param::Format::NCHW88 || + param().format == Param::Format::NCHW32) { + megdnn_assert(src.ndim == 5_z, "%s", errmsg_c); + + spatial_pos = 2; + c_pos = 1; + } else if (param().format == Param::Format::CHWN4) { + spatial_pos = 1; + c_pos = 0; + batch_pos = 3; + } else { + megdnn_assert( + param().format == Param::Format::NHWCD4 && src.ndim == 5_z, + "%s", errmsg_c); + spatial_pos = 1; + c_pos = 2; + } + size_t n = src[batch_pos]; + size_t c = src[c_pos]; + size_t ih = src[spatial_pos]; + size_t iw = src[spatial_pos + 1]; + if (param().format == Param::Format::NHWCD4) { + c *= 4; + iw = src[spatial_pos + 2]; + } + if (param().format == Param::Format::NCHW4 || + param().format == Param::Format::CHWN4) { + c *= 4; + } + if (param().format == Param::Format::NCHW88) { + c *= 8; + } + if (param().format == Param::Format::NCHW32) { + c *= 32; + } + size_t oh, ow; + size_t fh = this->param().window_h; + size_t fw = this->param().window_w; + size_t sh = this->param().stride_h; + size_t sw = this->param().stride_w; + size_t ph = this->param().pad_h; + size_t pw = this->param().pad_w; + infer_conv_shape2d(ih, iw, fh, fw, sh, sw, ph, pw, oh, ow); + if (param().format == Param::Format::NCHW) { + dst = TensorLayout(TensorShape({n, c, oh, ow}), src.dtype); + } else if (param().format == Param::Format::NHWC) { + megdnn_assert(param().format == Param::Format::NHWC, + "invalid pooling format"); + dst = TensorLayout({n, oh, ow, c}, src.dtype, src.format); + } else if (param().format == 
Param::Format::NCHW4) { + dst = TensorLayout{{n, c / 4, oh, ow, 4}, src.dtype, src.format}; + } else if (param().format == Param::Format::NCHW88) { + dst = TensorLayout{{n, c / 8, oh, ow, 8}, src.dtype, src.format}; + } else if (param().format == Param::Format::NCHW32) { + dst = TensorLayout{{n, c / 32, oh, ow, 32}, src.dtype, src.format}; + } else if (param().format == Param::Format::CHWN4) { + dst = TensorLayout{{c / 4, oh, ow, n, 4}, src.dtype, src.format}; + } else { + megdnn_assert(param().format == Param::Format::NHWCD4, + "invalid pooling format"); + dst = TensorLayout{{n, oh, c / 4, ow, 4}, src.dtype, src.format}; + } +} + +void PoolingBase::check_layout_fwd(const TensorLayout& src, + const TensorLayout& dst) { + TensorLayout dst_expected; + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + megdnn_assert(src.dtype == dst.dtype); + megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT || + src.dtype == dtype::Int8() || + src.dtype.category() == DTypeCategory::QUANTIZED); +} + +void PoolingForward::deduce_layout(const TensorLayout& src, TensorLayout& dst) { + deduce_layout_fwd(src, dst); +} + +void PoolingForward::check_exec(const TensorLayout& src, + const TensorLayout& dst, + size_t workspace_in_bytes) { + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void PoolingBackward::check_exec(const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes) { + check_layout_fwd(src, dst); + megdnn_assert_eq_layout(src, grad); + megdnn_assert_eq_layout(dst, diff); + auto required_workspace_in_bytes = + get_workspace_in_bytes(src, dst, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/pooling/do_max_pooling_3x3_s2x2_float_decl.inl b/dnn/src/common/pooling/do_max_pooling_3x3_s2x2_float_decl.inl new file mode 100644 index 00000000..cd0743ea --- /dev/null +++ b/dnn/src/common/pooling/do_max_pooling_3x3_s2x2_float_decl.inl @@ -0,0 +1,38 @@ +/** + * \file dnn/src/common/pooling/do_max_pooling_3x3_s2x2_float_decl.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// simd_macro/*_helper.h should be included before including this file. 
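+//
+// Illustrative usage sketch (added note, not part of the original header; the
+// helper path below is hypothetical and depends on the target architecture):
+//
+//   #include "src/x86/simd_macro/sse_helper.h"  // hypothetical helper that
+//                                               // defines MEGDNN_SIMD_NAME,
+//                                               // MEGDNN_SIMD_WIDTH and the
+//                                               // MEGDNN_SIMD_* wrappers
+//   #include "src/common/pooling/do_max_pooling_3x3_s2x2_float_decl.inl"
+//
+// which would declare do_max_pooling_3x3_s2x2_float_sse() (see the actual
+// declaration below).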
+// +// The following function would be declared in this file: +// +// void do_max_pooling_3x3_s2x2_float_MEGDNN_SIMD_NAME(const float *src, +// const float *filter, float *dst, +// size_t IH, size_t IW, size_t OH, size_t OW, +// size_t FH, size_t FW, size_t PH, size_t PW) +#include "src/common/macro_helper.h" +#include "src/common/utils.h" + +#include "megdnn/arch.h" + +namespace megdnn { + +#define FUNC_NAME CONCAT_STR(do_max_pooling_3x3_s2x2_float_, MEGDNN_SIMD_NAME) + +void FUNC_NAME(const float *src, float *dst, + size_t IH_, size_t IW_, size_t OH_, size_t OW_, size_t PH_, size_t PW_, + const WorkspaceBundle& ws) +MEGDNN_SIMD_ATTRIBUTE_TARGET; + +#undef FUNC_NAME + +} + +#include "src/common/macro_helper_epilogue.h" + diff --git a/dnn/src/common/pooling/do_max_pooling_3x3_s2x2_float_def.inl b/dnn/src/common/pooling/do_max_pooling_3x3_s2x2_float_def.inl new file mode 100644 index 00000000..e608e161 --- /dev/null +++ b/dnn/src/common/pooling/do_max_pooling_3x3_s2x2_float_def.inl @@ -0,0 +1,158 @@ +/** + * \file dnn/src/common/pooling/do_max_pooling_3x3_s2x2_float_def.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// simd_macro/*_helper.h should be included before including this file. +// +// The following function would be defined in this file: +// +// void do_max_pooling_3x3_s2x2_float_MEGDNN_SIMD_NAME(const float *src, +// float *dst, +// size_t IH_, size_t IW_, +// size_t OH_, size_t OW_, +// size_t PH_, size_t PW_); + +#include "src/common/utils.h" + +#include "src/common/macro_helper.h" +#include +#include + +namespace megdnn { + +#define FUNC_NAME CONCAT_STR(do_max_pooling_3x3_s2x2_float_, MEGDNN_SIMD_NAME) +MEGDNN_SIMD_ATTRIBUTE_TARGET +void FUNC_NAME(const float *src, float *dst, + size_t IH_, size_t IW_, size_t OH_, size_t OW_, size_t PH_, size_t PW_, + const WorkspaceBundle& ws) +{ + int IH = IH_, IW = IW_, OH = OH_, OW = OW_, PH = PH_, PW = PW_; + // cache[i] stores the answer of the i-th line after + // pooling along the W dimension. + float* cache[3] = {static_cast(ws.get(0)), + static_cast(ws.get(1)), + static_cast(ws.get(2))}; + float* odd = static_cast(ws.get(3)); + float* even = static_cast(ws.get(4)); + int ih_next = 0; + // "good" area means we can use SIMD to accelerate. + auto get_good_area = [](int I, int /* O */, int P, int &O_from, int &O_to) { + // x*2 - P >= 0; 2x >= P; x >= P/2 + O_from = (P+1) / 2; + // x*2 - P + 3 <= I; x*2 <= I+P-3; x <= (I+P-3)/2 + O_to = (I+P-3) / 2 + 1; + // we must have I >= 2 to ensure O_from <= O_to + }; + int OW_from, OW_to; + get_good_area(IW, OW, PW, OW_from, OW_to); + auto process_cache = [&](int ih) MEGDNN_SIMD_LAMBDA_ATTRIBUTE_TARGET { + const float * __restrict sptr = src + ih*IW; + auto tmp = cache[2]; + cache[2] = cache[1]; + cache[1] = cache[0]; + cache[0] = tmp; + // cache 0 is used to store the current answer. 
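+        // Added explanatory note: each call to process_cache() reduces one
+        // input row to its horizontal 3-window maxima, stored in cache[0];
+        // cache[1] and cache[2] keep the two previously processed rows, so
+        // the rotation above lets rows shared between consecutive output
+        // rows be reused instead of recomputed.  The row is de-interleaved
+        // into `even`/`odd` column halves; with stride 2 every output column
+        // needs one element from each of three consecutive input columns, so
+        // the unpadded middle region [OW_from, OW_to) is computed as a SIMD
+        // max over shifted views of `even` and `odd`, while the padded
+        // borders fall back to run_single() below.  The outer loop over `oh`
+        // then takes the vertical max of up to three cached rows to produce
+        // each output row.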
+ auto run_single = [&](int ow) { + int iw = ow*2 - PW; + float res = std::numeric_limits::lowest(); + if (iw+0 >= 0 && iw+0 < IW) { + res = std::max(res, sptr[iw+0]); + } + if (iw+1 >= 0 && iw+1 < IW) { + res = std::max(res, sptr[iw+1]); + } + if (iw+2 >= 0 && iw+2 < IW) { + res = std::max(res, sptr[iw+2]); + } + cache[0][ow] = res; + }; + // build odd/even + int iw = 0; + int odd_offset = 0, even_offset = 0; + + for (; iw+2*MEGDNN_SIMD_WIDTH <= IW; iw += 2*MEGDNN_SIMD_WIDTH) { + MEGDNN_SIMD_TYPE s0, s1, d0, d1; + s0 = MEGDNN_SIMD_LOADU(sptr + iw); + s1 = MEGDNN_SIMD_LOADU(sptr + iw + MEGDNN_SIMD_WIDTH); + MEGDNN_SIMD_UZP(s0, s1, d0, d1); + MEGDNN_SIMD_STOREU(even + even_offset, d0); + MEGDNN_SIMD_STOREU(odd + odd_offset, d1); + even_offset += MEGDNN_SIMD_WIDTH; + odd_offset += MEGDNN_SIMD_WIDTH; + } + for (; iw < IW; ++iw) { + if (iw & 1) + odd[odd_offset++] = sptr[iw]; + else + even[even_offset++] = sptr[iw]; + } + int ow = 0; + for (; ow < OW_from; ++ow) run_single(ow); + if (PW & 1) { + for (; ow+MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { + MEGDNN_SIMD_TYPE d, s0, s1, s2; + s0 = MEGDNN_SIMD_LOADU(odd + ow - (PW>>1) - 1); + s1 = MEGDNN_SIMD_LOADU(even + ow - (PW>>1)); + s2 = MEGDNN_SIMD_LOADU(odd + ow - (PW>>1)); + d = MEGDNN_SIMD_MAX(MEGDNN_SIMD_MAX(s0, s1), s2); + MEGDNN_SIMD_STOREU(cache[0] + ow, d); + } + } else { + for (; ow+MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { + MEGDNN_SIMD_TYPE d, s0, s1, s2; + s0 = MEGDNN_SIMD_LOADU(even + ow - (PW>>1)); + s1 = MEGDNN_SIMD_LOADU(odd + ow - (PW>>1)); + s2 = MEGDNN_SIMD_LOADU(even + ow - (PW>>1) + 1); + d = MEGDNN_SIMD_MAX(MEGDNN_SIMD_MAX(s0, s1), s2); + MEGDNN_SIMD_STOREU(cache[0] + ow, d); + } + } + for (; ow < OW; ++ow) run_single(ow); + }; + for (int oh = 0; oh < OH; ++oh) { + float * __restrict dptr = dst + oh*OW; + int ih_from = std::min(IH, std::max(0, oh*2 - PH)); + int ih_to = std::min(IH, std::max(0, oh*2 - PH + 3)); + while (ih_next < ih_to) { + process_cache(ih_next++); + } + if (ih_to - ih_from == 3) { + int ow = 0; + for (; ow+MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { + MEGDNN_SIMD_TYPE d, s0, s1, s2; + s0 = MEGDNN_SIMD_LOADU(cache[0] + ow); + s1 = MEGDNN_SIMD_LOADU(cache[1] + ow); + s2 = MEGDNN_SIMD_LOADU(cache[2] + ow); + d = MEGDNN_SIMD_MAX(MEGDNN_SIMD_MAX(s0, s1), s2); + MEGDNN_SIMD_STOREU(dptr + ow, d); + } + for (; ow < OW; ++ow) { + dptr[ow] = std::max(std::max(cache[0][ow], cache[1][ow]), + cache[2][ow]); + } + } else { + std::memcpy(dptr, cache[0], sizeof(float) * OW); + for (int i = 1; i < ih_to - ih_from; ++i) { + int ow = 0; + for (; ow+MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { + MEGDNN_SIMD_TYPE d, s; + s = MEGDNN_SIMD_LOADU(cache[i] + ow); + d = MEGDNN_SIMD_LOADU(dptr + ow); + d = MEGDNN_SIMD_MAX(d, s); + MEGDNN_SIMD_STOREU(dptr + ow, d); + } + for (; ow < OW; ++ow) { + dptr[ow] = std::max(dptr[ow], cache[i][ow]); + } + } + } + } +} + +} // namespace megdnn diff --git a/dnn/src/common/powc.cpp b/dnn/src/common/powc.cpp new file mode 100644 index 00000000..a698da2a --- /dev/null +++ b/dnn/src/common/powc.cpp @@ -0,0 +1,41 @@ +/** + * \file dnn/src/common/powc.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/oprs/general.h" + +#include +#include "src/common/utils.h" + +using namespace megdnn; + +void PowC::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) { + megdnn_assert(src.layout.dtype == dst.layout.dtype && + src.layout.dtype.category() == DTypeCategory::FLOAT && + src.layout.eq_shape(dst.layout), + "invalid layout: %s vs %s", src.layout.to_string().c_str(), + dst.layout.to_string().c_str()); + int iv, *ivp = nullptr; + float fv, *fvp = nullptr; + float p = param().exp; + int pi = static_cast(std::round(p)); + if (std::abs(static_cast(pi) - p) < + std::numeric_limits::epsilon()) { + iv = pi; + ivp = &iv; + } else { + fv = p; + fvp = &fv; + } + do_exec(src, dst, fvp, ivp); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/common/reduce.cpp b/dnn/src/common/reduce.cpp new file mode 100644 index 00000000..99ec0bab --- /dev/null +++ b/dnn/src/common/reduce.cpp @@ -0,0 +1,105 @@ +/** + * \file dnn/src/common/reduce.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include +#include "src/common/utils.h" + +namespace { +using namespace megdnn; +using megdnn::Reduce; + +DType get_out_dtype(const Reduce::DataType data_type, const DType inp_dtype) { + if (data_type == Reduce::DataType::FLOAT_O16xC32) { +#if !MEGDNN_DISABLE_FLOAT16 + return dtype::Float16(); +#else + megdnn_assert_internal(0); +#endif + } + if (data_type == Reduce::DataType::FLOAT_O32xC32) { + return dtype::Float32(); + } + if (data_type == Reduce::DataType::QUINT_I8xO32) { + megdnn_assert(inp_dtype.enumv() == DTypeEnum::Quantized8Asymm); + return dtype::QuantizedS32( + inp_dtype.param().scale); + } + if (data_type == Reduce::DataType::QINT_I8xO32) { + megdnn_assert(inp_dtype.enumv() == DTypeEnum::QuantizedS8); + return dtype::QuantizedS32( + inp_dtype.param().scale); + } + megdnn_assert(data_type == Reduce::DataType::DEFAULT); + return inp_dtype; +} +} // namespace + +namespace megdnn { + +void ReduceForward::deduce_layout(const TensorLayout& src, TensorLayout& dst) { + megdnn_assert( + param().axis >= 0 && static_cast(param().axis) < src.ndim, + "axis: %d ndim: %zu", param().axis, src.ndim); + dst = src; + dst.shape[param().axis] = 1; + + dst.dtype = get_out_dtype(param().data_type, src.dtype); + dst.format = src.format; + dst.init_contiguous_stride(); +} + +void ReduceForward::check_exec(const TensorLayout& src, const TensorLayout& dst, + size_t workspace_in_bytes) { + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(dst); + }; + megdnn_assert(param().data_type != Reduce::DataType::FLOAT_IO16xC32, + "FLOAT_IO16xC32 is deprecated"); + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(dst); + megdnn_assert(src.ndim == dst.ndim, "%s", errmsg().c_str()); + megdnn_assert(param().axis >= 0); + uint32_t axis = param().axis; + megdnn_assert(axis < src.ndim, "%s", errmsg().c_str()); + rep(i, src.ndim) { + if (i != axis) { + megdnn_assert(src.shape[i] == dst.shape[i], "%s", errmsg().c_str()); + } else { + megdnn_assert(dst.shape[i] == 1_z, "%s", errmsg().c_str()); + } + } + megdnn_assert(src.dtype.category() == dst.dtype.category(), + "the category of reduce output and input must be the 
same"); + if (param().data_type == DataType::DEFAULT) { + megdnn_assert(src.dtype == dst.dtype && + (src.dtype.category() == DTypeCategory::FLOAT || + src.dtype.category() == DTypeCategory::INT || + src.dtype.category() == DTypeCategory::QUANTIZED)); + } else if (param().data_type == DataType::QUINT_I8xO32) { + megdnn_assert(src.dtype.enumv() == DTypeEnum::Quantized8Asymm); + } else if (param().data_type == DataType::QINT_I8xO32) { + megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8); + } else { + megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); + } + + auto expected = get_out_dtype(param().data_type, src.dtype); + megdnn_assert(expected == dst.dtype); + + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/reduce_helper.cpp b/dnn/src/common/reduce_helper.cpp new file mode 100644 index 00000000..7d2618f6 --- /dev/null +++ b/dnn/src/common/reduce_helper.cpp @@ -0,0 +1,32 @@ +/** + * \file dnn/src/common/reduce_helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/common/reduce_helper.h" + +#include +#include +#include "src/common/utils.h" + +namespace megdnn { +namespace reduce { + +void get_ABC(const TensorShape& shape, size_t& A, size_t& B, size_t& C, + size_t axis) { + auto shape_arr = shape.shape; + auto ndim = shape.ndim; + A = std::accumulate(shape_arr, shape_arr + axis, 1_z, + SafeMultiplies()); + B = shape_arr[axis]; + C = std::accumulate(shape_arr + (axis + 1), shape_arr + ndim, 1_z, + SafeMultiplies()); +} + +} // namespace reduce +} // namespace megdnn diff --git a/dnn/src/common/reduce_helper.h b/dnn/src/common/reduce_helper.h new file mode 100644 index 00000000..75f398bf --- /dev/null +++ b/dnn/src/common/reduce_helper.h @@ -0,0 +1,161 @@ +/** + * \file dnn/src/common/reduce_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "megdnn/dtype.h" + +#if MEGDNN_CC_HOST +#include "megdnn/basic_types.h" +#endif + +namespace megdnn { +namespace reduce { + +template +struct SumOp { + typedef wtype_ wtype; + + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { + dst[idx] = val; + } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { + return lhs + rhs; + } + MEGDNN_HOST MEGDNN_DEVICE SumOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(0)), src(src), dst(dst), B(B) {} +}; + +template +struct MeanOp { + typedef wtype_ wtype; + + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { + dst[idx] = val / static_cast(B); + } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { + return lhs + rhs; + } + MEGDNN_HOST MEGDNN_DEVICE MeanOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(0)), src(src), dst(dst), B(B) {} +}; + +template +struct SumSqrOp { + typedef wtype_ wtype; + + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { + return static_cast(src[idx]) * static_cast(src[idx]); + } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { + dst[idx] = val; + } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { + return lhs + rhs; + } + MEGDNN_HOST MEGDNN_DEVICE SumSqrOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(0)), src(src), dst(dst), B(B) {} +}; + +template +struct ProdOp { + typedef wtype_ wtype; + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { + dst[idx] = val; + } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { + return lhs * rhs; + } + MEGDNN_HOST MEGDNN_DEVICE ProdOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(1)), src(src), dst(dst), B(B) {} +}; + +template +struct MinOp { + typedef wtype_ wtype; + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { + dst[idx] = val; + } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { +#if defined(__CUDA_ARCH__) + return lhs < rhs ? lhs : rhs; +#else + return std::min(lhs, rhs); +#endif + } + MEGDNN_HOST MEGDNN_DEVICE MinOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(DTypeTrait::max())), src(src), dst(dst), B(B) {} +}; + +template +struct MaxOp { + typedef wtype_ wtype; + const wtype INIT; + + src_ctype* src; + dst_ctype* dst; + const size_t B; + + MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; } + MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { + dst[idx] = val; + } + static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) { +#if defined(__CUDA_ARCH__) + return lhs > rhs ? 
lhs : rhs; +#else + return std::max(lhs, rhs); +#endif + } + MEGDNN_HOST MEGDNN_DEVICE MaxOp(src_ctype* src, dst_ctype* dst, size_t B) + : INIT(wtype(DTypeTrait::min())), src(src), dst(dst), B(B) {} +}; + +#if MEGDNN_CC_HOST +void get_ABC(const TensorShape& shape, size_t& A, size_t& B, size_t& C, + size_t axis); +#endif + +} // namespace reduce +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/relayout.cpp b/dnn/src/common/relayout.cpp new file mode 100644 index 00000000..d29857a9 --- /dev/null +++ b/dnn/src/common/relayout.cpp @@ -0,0 +1,116 @@ +/** + * \file dnn/src/common/relayout.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs.h" +#include "src/common/relayout_helper.h" +#include "src/common/utils.h" + +#include + +using namespace megdnn; +using namespace megdnn::relayout; + +namespace { + +//! whether current shape is [b][n][m][c] and is a transpose of contig +//! [b][m][n][c] +bool is_transpose_single(const TensorLayout& layout, TransposeParam& p) { + /* + * assuming contig layout is: + * shape: b, m, n, c + * stride: mnc, nc, c, 1 + * + * then given layout should be: + * shape: b, n, m, c + * stride: mnc, c, nc, 1 + * + * if c == 1: + * shape: b, n, m + * stride: mn, 1, n + * if b == 1: + * shape: n, m, c + * stride: c, nc, 1 + * + * if b == 1 && c == 1: + * shape: n, m + * stride: 1, n + */ + auto strd = [&](size_t idx, ptrdiff_t v) { + return layout.stride[idx] == v; + }; + if (layout.ndim == 4) { + p.batch = layout[0]; + p.n = layout[1]; + p.m = layout[2]; + p.c = layout[3]; + if (strd(3, 1) && strd(1, p.c)) { + auto t = p.c * p.n; + return strd(2, t) && strd(0, t * p.m); + } + return false; + } + if (layout.ndim == 3) { + if (strd(1, 1)) { + // c == 1 + p.batch = layout[0]; + p.n = layout[1]; + p.m = layout[2]; + p.c = 1; + return strd(2, p.n) && strd(0, p.m * p.n); + } + if (strd(2, 1)) { + // b == 1 + p.batch = 1; + p.n = layout[0]; + p.m = layout[1]; + p.c = layout[2]; + return strd(0, p.c) && strd(1, p.n * p.c); + } + return false; + } + if (layout.ndim == 2) { + p.batch = 1; + p.n = layout.shape[0]; + p.m = layout.shape[1]; + p.c = 1; + return strd(0, 1) && strd(1, p.n); + } + return false; +} + +} // anonymous namespace + +void RelayoutForward::check_layout_and_canonize(TensorLayout& src, + TensorLayout& dst) { + megdnn_assert(dst.is_non_overlapping_strong()); + src = src.collapse_contiguous(); + dst = dst.collapse_contiguous(); + megdnn_assert(src.dtype == dst.dtype && + src.total_nr_elems() == dst.total_nr_elems()); +} + +bool relayout::is_transpose(const TensorLayout& src, const TensorLayout& dst, + TransposeParam& p) { + if (is_contig(dst) && is_transpose_single(src, p)) { + // if the original intention is to transpose (m, n) to (n, m), + // then we should use (n, m) as the contig dst and use a corrsponding + // non-contig src with the same (n, m) shape (remember relayout is + // defined on element correspondence on the logical view) + return true; + } + if (is_contig(src) && is_transpose_single(dst, p)) { + std::swap(p.m, p.n); + return true; + } + return false; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/relayout_format.cpp b/dnn/src/common/relayout_format.cpp new file mode 
100644 index 00000000..878f16f2 --- /dev/null +++ b/dnn/src/common/relayout_format.cpp @@ -0,0 +1,477 @@ +/** + * \file dnn/src/common/relayout_format.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs.h" +#include "megdnn/tensor_format.h" +#include "src/common/utils.h" + +using namespace megdnn; + +void RelayoutFormat::deduce_layout_fwd(const TensorLayout& src, + TensorLayout& dst) { + using Param = param::RelayoutFormat; + switch (param().mode) { + case Param::Mode::NCHW_NHWCD4: + case Param::Mode::NCHW_NHWCD4I: + dst.ndim = 5; + dst[0] = src[0]; + dst[1] = src[2]; + dst[2] = (src[1] + 3) / 4; + dst[3] = src[3]; + dst[4] = 4; + break; + case Param::Mode::NCHW_NCHW88: + dst.ndim = 5; + dst[0] = src[0]; + dst[1] = div_ceil(src[1], 8_z); + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = 8; + break; + case Param::Mode::NCHW88_NCHW: + dst.ndim = 4; + dst[0] = src[0]; + dst[1] = src[1] * 8; + dst[2] = src[2]; + dst[3] = src[3]; + break; + case Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT: + megdnn_assert(src.ndim == 4, "src must be oihw, ndim == 4"); + dst.ndim = 6; + megdnn_assert(src[0] % 8 == 0, + "NCHW_NCHW88_CONV_DENSE_WEIGHT out channel must " + "align to 8"); + dst[0] = src[0] / 8; + dst[1] = div_ceil(src[1], 8_z); + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = 8; + dst[5] = 8; + break; + case Param::Mode::NCHW_NCHW88_CONV_CHAN_WEIGHT: + megdnn_assert(src.ndim == 5, "src must be goihw, ndim == 5"); + dst.ndim = 6; + dst[0] = div_ceil(src[0], 8_z); + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = 8; + break; + case Param::Mode::NCHW_NCHW88_CONV_GROUP_WEIGHT: + megdnn_assert(src.ndim == 5, "src must be goihw, ndim == 5"); + dst.ndim = 7; + dst[0] = src[0]; + megdnn_assert(src[1] % 8 == 0, + "NCHW_NCHW88_CONV_GROUP_WEIGHT out channel must " + "align to 8"); + dst[1] = src[1] / 8; + dst[2] = div_ceil(src[2], 8_z); + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = 8; + dst[6] = 8; + break; + case Param::Mode::NHWC_NHWCD4: + case Param::Mode::NHWC_NHWCD4I: + megdnn_assert(src.ndim == 4); + //! 
channel mod 4 should == 4 + megdnn_assert(src[3] % 4 == 0); + dst.ndim = 5; + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[3] / 4; + dst[3] = src[2]; + dst[4] = 4; + break; + case Param::Mode::NHWCD4_NHWC: + megdnn_assert(src.ndim == 5); + dst.ndim = 4; + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[3]; + dst[3] = src[2] * 4; + break; + case Param::Mode::NHWCD4_NCHW: + case Param::Mode::NHWCD4I_NCHW: + megdnn_assert(src.ndim == 5); + dst.ndim = 4; + dst[0] = src[0]; + dst[1] = src[2] * 4; + dst[2] = src[1]; + dst[3] = src[3]; + break; + case Param::Mode::INTER_WEIGHT_DENSE: + case Param::Mode::INTER_WEIGHT_DENSEI: + megdnn_assert(src.ndim == 4); + megdnn_assert(src[0] % 4 == 0); + dst.ndim = 5; + dst[0] = src[0] / 4; + dst[1] = src[2]; + dst[2] = src[3]; + dst[3] = round_up(src[1], 4); + dst[4] = 4; + break; + case Param::Mode::INTER_WEIGHT_GROUP: + case Param::Mode::INTER_WEIGHT_GROUPI: + // group conv filter + megdnn_assert(src.ndim == 5); + megdnn_assert(src[1] % 4 == 0 && src[2] % 4 == 0); + dst.ndim = 6; + dst[0] = src[0]; + dst[1] = src[1] / 4; + dst[2] = src[3]; + dst[3] = src[4]; + dst[4] = src[2]; + dst[5] = 4; + break; + case Param::Mode::INTER_WEIGHT_CHAN: + case Param::Mode::INTER_WEIGHT_CHANI: + megdnn_assert(src.ndim == 5 && src[1] == 1 && src[2] == 1); + // chanwise conv filter + dst.ndim = 5; + dst[0] = src[0] / 4; + dst[1] = 1; + dst[2] = src[3]; + dst[3] = src[4]; + dst[4] = 4; + break; + case Param::Mode::INTER_WEIGHT_DENSEI_DOT: + megdnn_assert(src.ndim == 4); + megdnn_assert(src[0] % 4 == 0); + dst.ndim = 6; + dst[0] = src[0] / 4; + dst[1] = src[2]; + dst[2] = src[3]; + dst[3] = div_ceil(src[1], 4); + dst[4] = 4; + dst[5] = 4; + break; + case Param::Mode::INTER_WEIGHT_GROUPI_DOT: + megdnn_assert(src.ndim == 5); + megdnn_assert(src[1] % 4 == 0 && src[2] % 4 == 0); + dst.ndim = 7; + dst[0] = src[0]; + dst[1] = src[1] / 4; + dst[2] = src[3]; + dst[3] = src[4]; + dst[4] = src[2] / 4; + dst[5] = 4; + dst[6] = 4; + break; + case Param::Mode::NCHW4_CHWN4: + megdnn_assert(src.ndim == 5); + megdnn_assert(src[4] == 4); + dst.ndim = 5; + dst[0] = src[1]; + dst[1] = src[2]; + dst[2] = src[3]; + dst[3] = src[0]; + dst[4] = src[4]; + break; + case Param::Mode::CHWN4_NCHW4: + megdnn_assert(src.ndim == 5); + megdnn_assert(src[4] == 4); + dst.ndim = 5; + dst[0] = src[3]; + dst[1] = src[0]; + dst[2] = src[1]; + dst[3] = src[2]; + dst[4] = src[4]; + break; + default: + megdnn_assert(0, "Invalid RelayoutFormat Mode"); + break; + } + TensorFormat dst_fmt; + deduce_format(src.format, dst_fmt); + dst.format = dst_fmt; + dst.dtype = src.dtype; + dst.init_contiguous_stride(); +} + +void RelayoutFormat::deduce_layout(const TensorLayout& src, TensorLayout& dst) { + deduce_layout_fwd(src, dst); +} + +void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) { + size_t align = handle()->image2d_pitch_alignment(); + using Param = param::RelayoutFormat; +#define CHECK_SRC(_expect) \ + megdnn_assert(src == _expect, "invalid src format: expect=%s got=%s", \ + _expect.to_string().c_str(), src.to_string().c_str()) + switch (param().mode) { + case Param::Mode::NHWC_NHWCD4: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + case Param::Mode::NHWCD4_NHWC: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + case Param::Mode::NHWC_NHWCD4I: + CHECK_SRC(DefaultTensorFormat::make()); + dst = Image2DPack4TensorFormat::make_raw(2, align); + break; + case Param::Mode::NCHW_NHWCD4: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + case 
Param::Mode::NCHW_NHWCD4I: + CHECK_SRC(DefaultTensorFormat::make()); + dst = Image2DPack4TensorFormat::make_raw(2, align); + break; + case Param::Mode::NHWCD4I_NCHW: + CHECK_SRC(Image2DPack4TensorFormat::make_raw(2, align)); + dst = DefaultTensorFormat::make(); + break; + case Param::Mode::NHWCD4_NCHW: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + case Param::Mode::INTER_WEIGHT_DENSE: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + case Param::Mode::INTER_WEIGHT_DENSEI: + case Param::Mode::INTER_WEIGHT_DENSEI_DOT: + CHECK_SRC(DefaultTensorFormat::make()); + dst = Image2DPack4TensorFormat::make_raw(3, align); + break; + case Param::Mode::INTER_WEIGHT_GROUP: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + case Param::Mode::INTER_WEIGHT_GROUPI: + case Param::Mode::INTER_WEIGHT_GROUPI_DOT: + CHECK_SRC(DefaultTensorFormat::make()); + dst = Image2DPack4TensorFormat::make_raw(4, align); + break; + case Param::Mode::INTER_WEIGHT_CHAN: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + case Param::Mode::INTER_WEIGHT_CHANI: + CHECK_SRC(DefaultTensorFormat::make()); + dst = Image2DPack4TensorFormat::make_raw(1, align); + break; + case Param::Mode::NCHW4_CHWN4: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + case Param::Mode::CHWN4_NCHW4: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + case Param::Mode::NCHW_NCHW88: + case Param::Mode::NCHW88_NCHW: + case Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT: + case Param::Mode::NCHW_NCHW88_CONV_CHAN_WEIGHT: + case Param::Mode::NCHW_NCHW88_CONV_GROUP_WEIGHT: + CHECK_SRC(DefaultTensorFormat::make()); + dst = src; + break; + + default: + megdnn_throw("Invalid relayout format mode"); + break; + } +#undef CHECK_SRC +} + +void RelayoutFormat::check_layout_fwd(const TensorLayout& src, + const TensorLayout& dst) { + TensorLayout dst_expected; + deduce_layout_fwd(src, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); +} + +void RelayoutFormat::check_exec(const TensorLayout& src, + const TensorLayout& dst, + size_t workspace_in_bytes) { + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void RelayoutFormat::deduce_exec_layout(const TensorLayout& src, + const TensorLayout& dst, + TensorLayout& exec_src, + TensorLayout& exec_dst) { + check_layout_fwd(src, dst); + using Param = param::RelayoutFormat; + switch (param().mode) { + case Param::Mode::NCHW_NCHW88: + // nchw to nchw8c + { + TensorLayout work_space_layout( + {src[0], round_up(src[1], 8_z), src[2], src[3]}, + src.dtype, src.format); + exec_src = work_space_layout + .reshape({src[0], div_ceil(src[1], 8_z), 8, + src[2], src[3]}) + .dimshuffle({0, 1, 3, 4, 2}); + exec_dst = dst; + } + break; + case Param::Mode::NCHW88_NCHW: + // nchw8c to nchw + exec_src = src; + exec_dst = dst.reshape({dst[0], dst[1] / 8, 8, dst[2], dst[3]}) + .dimshuffle({0, 1, 3, 4, 2}); + break; + case Param::Mode::NCHW_NCHW88_CONV_DENSE_WEIGHT: + // oihw to oihw8i8o + { + megdnn_assert(src.ndim == 4); + megdnn_assert(src[0] % 8 == 0); + TensorLayout work_space_layout( + {src[0], round_up(src[1], 8_z), src[2], src[3]}, + src.dtype, src.format); + exec_src = + work_space_layout + .reshape({src[0] / 8, 8, div_ceil(src[1], 8_z), + 8, src[2], src[3]}) + .dimshuffle({0, 2, 4, 5, 3, 1}); + exec_dst = dst; + } + break; + case Param::Mode::NCHW_NCHW88_CONV_CHAN_WEIGHT: + // goihw to goihw8g + { + megdnn_assert(src.ndim 
== 5); + TensorLayout work_space_layout( + {round_up(src[0], 8_z), src[1], src[2], src[3], src[4]}, + src.dtype, src.format); + exec_src = work_space_layout + .reshape({div_ceil(src[0], 8_z), 8, src[1], + src[2], src[3], src[4]}) + .dimshuffle({0, 2, 3, 4, 5, 1}); + exec_dst = dst; + } + break; + case Param::Mode::NCHW_NCHW88_CONV_GROUP_WEIGHT: + // goihw to goihw8i8o + { + megdnn_assert(src.ndim == 5); + megdnn_assert(src[1] % 8 == 0); + TensorLayout work_space_layout( + {src[0], src[1], round_up(src[2], 8_z), src[3], src[4]}, + src.dtype, src.format); + exec_src = work_space_layout + .reshape({src[0], src[1] / 8, 8, + div_ceil(src[2], 8_z), 8, src[3], + src[4]}) + .dimshuffle({0, 1, 3, 5, 6, 4, 2}); + exec_dst = dst; + } + break; + case Param::Mode::NCHW_NHWCD4: + case Param::Mode::NCHW_NHWCD4I: + // src is {N, C, H, W} + // dst is {N, H, CB, W, 4} + exec_src = src; + exec_src[1] = (exec_src[1] + 3) / 4 * 4; + exec_src.stride[0] = exec_src[1] * exec_src.stride[1]; + exec_src = exec_src.dimshuffle({0, 2, 3, 1}); + exec_src = exec_src.reshape({exec_src[0], exec_src[1], exec_src[2], + exec_src[3] / 4, 4}) + .dimshuffle({0, 1, 3, 2, 4}); + exec_dst = dst; + break; + case Param::Mode::NHWC_NHWCD4: + case Param::Mode::NHWC_NHWCD4I: + // src is {N, H, W, C}, + // dst is {N, H, CB, W, 4} + exec_src = src.reshape({src[0], src[1], src[2], src[3] / 4, 4}) + .dimshuffle({0, 1, 3, 2, 4}); + exec_dst = dst; + break; + case Param::Mode::NHWCD4_NHWC: + // src is {N, H, CB, W, 4} + // dst is {N, H, W, C}, + exec_src = src; + exec_dst = dst.reshape({dst[0], dst[1], dst[2], dst[3] / 4, 4}) + .dimshuffle({0, 1, 3, 2, 4}); + break; + case Param::Mode::NHWCD4_NCHW: + case Param::Mode::NHWCD4I_NCHW: + exec_src = src; + exec_dst = dst.reshape({dst[0], dst[1] / 4, 4, dst[2], dst[3]}) + .dimshuffle({0, 3, 1, 4, 2}); + break; + case Param::Mode::INTER_WEIGHT_DENSE: + case Param::Mode::INTER_WEIGHT_DENSEI: + // src is {OC, IC, FH, FW} + // dst is {OCB, FH, FW, IC, 4} + exec_src = src.reshape({src[0] / 4, 4, src[1], src[2], src[3]}) + .dimshuffle({0, 3, 4, 2, 1}); + exec_dst = dst; + // dst[3] may be round_uped, set to the real ic + exec_dst.shape[3] = src[1]; + break; + case Param::Mode::INTER_WEIGHT_GROUP: + case Param::Mode::INTER_WEIGHT_GROUPI: + // group conv filter + // src is {G, ocpg, icpg, fh, fw} + // dst is {G, ocpgb, fh, fw, icpg, 4} + exec_src = + src.reshape({src[0], src[1] / 4, 4, src[2], src[3], src[4]}) + .dimshuffle({0, 1, 4, 5, 3, 2}); + exec_dst = dst; + break; + case Param::Mode::INTER_WEIGHT_CHAN: + case Param::Mode::INTER_WEIGHT_CHANI: + megdnn_assert(src.ndim == 5); + megdnn_assert(src[1] == 1 && src[2] == 1); + // chanwise conv filter + megdnn_assert(src[0] % 4 == 0); + exec_src = src.reshape({src[0] / 4, 4, 1, src[3], src[4]}) + .dimshuffle({0, 2, 3, 4, 1}); + exec_dst = dst; + break; + case Param::Mode::INTER_WEIGHT_DENSEI_DOT: + // src is {oc, ic, fh , fw} + // dst is {oc/4, fh, fw, ic/4, 4, 4} + exec_src = src; + exec_src[1] = round_up(src[1], 4); + exec_src.stride[0] = exec_src.stride[1] * exec_src[1]; + exec_src = exec_src.reshape({exec_src[0] / 4, 4, exec_src[1] / 4, 4, + exec_src[2], exec_src[3]}) + .dimshuffle({0, 4, 5, 2, 1, 3}); + exec_dst = dst; + break; + case Param::Mode::INTER_WEIGHT_GROUPI_DOT: + // src is {G, ocpg, icpg, fh, fw} + // dst is {G, ocpg/4, fh, fw, icpg/4, 4, 4} + exec_src = src.reshape({src[0], src[1] / 4, 4, src[2] / 4, 4, + src[3], src[4]}) + .dimshuffle({0, 1, 5, 6, 3, 2, 4}); + exec_dst = dst; + break; + case Param::Mode::NCHW4_CHWN4: + // src is {N, C/4, H, 
W, 4} + // dst is {C/4, H, W, N, 4} + exec_src = src.dimshuffle({1, 2, 3, 0, 4}); + exec_dst = dst; + break; + case Param::Mode::CHWN4_NCHW4: + // src is {C/4, H, W, N, 4} + // dst is {N, C/4, H, W, 4} + exec_src = src.dimshuffle({3, 0, 1, 2, 4}); + exec_dst = dst; + break; + default: + megdnn_assert(0, "Invalid RelayoutFormat Mode"); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/relayout_helper.h b/dnn/src/common/relayout_helper.h new file mode 100644 index 00000000..5c1e0d02 --- /dev/null +++ b/dnn/src/common/relayout_helper.h @@ -0,0 +1,143 @@ +/** + * \file dnn/src/common/relayout_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +namespace megdnn { +namespace relayout { + +static inline bool is_contig(const TensorLayout& layout) { + return layout.ndim == 1 && layout.stride[0] == 1; +} + +//! [b][m][n][c] to [b][n][m][c] +struct TransposeParam { + size_t batch, m, n, c; +}; + +/** + * \brief whether the relayout can be formulated as TransposeParam + * + * Note that \p src and \p dst should have been processed by + * RelayoutForward::check_layout_and_canonize + */ +bool is_transpose(const TensorLayout& src, const TensorLayout& dst, + TransposeParam& p); + +namespace transpose_fallback { + +#if MEGDNN_X86 +constexpr size_t BLOCK_LINE_SIZE_BYTES = 64; +#else +#error "unknown megdnn arch" +#endif + +/** + * \brief transpose traits + * \tparam T element type + */ +template +struct transpose_traits { + static constexpr size_t block_size = BLOCK_LINE_SIZE_BYTES / sizeof(T); +}; + +template +void transpose_block_fallback(const T* src, T* dst, const size_t src_stride, + const size_t dst_stride, size_t block_h, + size_t block_w) { + constexpr size_t block_size = transpose_traits::block_size; + T block[block_size][block_size]; + + for (size_t i = 0; i < block_h; ++i) { + auto src_ptr = src + i * src_stride; + for (size_t j = 0; j < block_w; ++j) { + block[j][i] = src_ptr[j]; + } + } + for (size_t i = 0; i < block_w; ++i) { + auto dst_ptr = dst + i * dst_stride; + for (size_t j = 0; j < block_h; ++j) { + dst_ptr[j] = block[i][j]; + } + } +} + +template +void transpose_block(const T* src, T* dst, const size_t src_stride, + const size_t dst_stride, size_t block_h, size_t block_w) { + transpose_block_fallback(src, dst, src_stride, dst_stride, block_h, + block_w); +} + +/*! + * \brief transpose a single block whose size is transpose_traits::block_size + * + * This function and transpose_traits can be specialized to implement optimized + * block transpose + */ +template +void transpose_block(const T* src, T* dst, const size_t src_stride, + const size_t dst_stride) { + constexpr size_t block_size = transpose_traits::block_size; + transpose_block_fallback(src, dst, src_stride, dst_stride, block_size, + block_size); +} + +/*! 
+ * \brief transpose contiguous (batch, m, n) to (batch, n, m) + */ +template +void transpose(size_t batch, size_t m, size_t n, T* src, T* dst) { + auto batch_src = src; + auto batch_dst = dst; + constexpr size_t B = transpose_traits::block_size; + + auto work_block = [m, n, &batch_src, &batch_dst]( + const size_t i, const size_t j, const size_t h, + const size_t w) { + + auto src = batch_src + i * n + j, dst = batch_dst + j * m + i; + if (h == B && w == B) { + transpose_block(src, dst, n, m); + } else { + transpose_block(src, dst, n, m, h, w); + } + }; + auto work_row = [&work_block, n](size_t i, size_t h) { + size_t j = 0; + for (; j + B <= n; j += B) { + work_block(i, j, h, B); + } + if (j < n) { + work_block(i, j, h, n - j); + } + }; + + for (size_t b = 0; b < batch; ++b) { + size_t i = 0; + for (; i + B <= m; i += B) { + work_row(i, B); + } + if (i < m) { + work_row(i, m - i); + } + batch_src += m * n; + batch_dst += m * n; + } +} +} // namespace transpose_fallback + +} // namespace relayout +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/resize.cpp b/dnn/src/common/resize.cpp new file mode 100644 index 00000000..41d28896 --- /dev/null +++ b/dnn/src/common/resize.cpp @@ -0,0 +1,84 @@ +/** + * \file dnn/src/common/resize.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void ResizeBase::check_layout_fwd(const TensorLayout& src, + const TensorLayout& dst) { + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + ", " + megdnn_layout_msg(dst); + }; + MEGDNN_MARK_USED_VAR(errmsg); + + megdnn_assert(dst.dtype == src.dtype && dst.shape[0] == src.shape[0], "%s", + errmsg().c_str()); + if (param().format == Param::Format::NCHW) { + megdnn_assert(dst.shape[1] == src.shape[1], "%s", errmsg().c_str()); + megdnn_assert(param().imode == + param::Resize::InterpolationMode::INTER_LINEAR); + } else if (param().format == Param::Format::NHWC) { + megdnn_assert(dst.shape[3] == src.shape[3], "%s", errmsg().c_str()); + } else if (param().format == Param::Format::NCHW4) { + megdnn_assert(src.ndim == 5); + megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8); + megdnn_assert(src.shape[4] == 4); + megdnn_assert(dst.shape[4] == 4); + } else { + megdnn_assert(param().format == Param::Format::NHWCD4, + "invalid resize tensor format"); + megdnn_assert(param().imode == + param::Resize::InterpolationMode::INTER_LINEAR); + megdnn_assert(dst.shape[2] == src.shape[2], "%s", errmsg().c_str()); + } +} + +void Resize::check_exec(const TensorLayout& src, const TensorLayout& dst, + size_t workspace_in_bytes) { + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void ResizeBackward::check_exec(const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_in_bytes) { + check_layout_fwd(grad, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + megdnn_assert(param().format == Param::Format::NCHW && + grad.dtype == dtype::Float32(), + "Backward 
resize only supports Float32 and NCHW."); +} + +std::pair ResizeBase::get_origin_coord(float scale, int size, + int idx) { + //! copy from resize_cv.cpp + float alpha = (idx + 0.5f) / scale - 0.5f; + int origin_idx = static_cast(floor(alpha)); + alpha -= origin_idx; + if (origin_idx < 0) { + origin_idx = 0; + alpha = 0; + } else if (origin_idx + 1 >= size) { + origin_idx = size - 2; + alpha = 1; + } + + return {alpha, origin_idx}; +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/rng.cpp b/dnn/src/common/rng.cpp new file mode 100644 index 00000000..83a0b2b1 --- /dev/null +++ b/dnn/src/common/rng.cpp @@ -0,0 +1,28 @@ +/** + * \file dnn/src/common/rng.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void RNGBase::check_exec( + const TensorLayout &dst, size_t workspace_in_bytes) { + megdnn_assert(dst.dtype.category() == DTypeCategory::FLOAT && + dst.is_contiguous()); + megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(dst)); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/common/roi_align.cpp b/dnn/src/common/roi_align.cpp new file mode 100644 index 00000000..ce62f2b0 --- /dev/null +++ b/dnn/src/common/roi_align.cpp @@ -0,0 +1,89 @@ +/** + * \file dnn/src/common/roi_align.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void ROIAlignBase::deduce_layout_fwd(const TensorLayout& src, + const TensorLayout& rois, + TensorLayout& dst, TensorLayout& index) { + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(rois); + megdnn_assert_contiguous(dst); + megdnn_assert_contiguous(index); + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(rois) + ", " + + megdnn_layout_msg(dst) + ", " + megdnn_layout_msg(index); + }; + MEGDNN_MARK_USED_VAR(errmsg); + using Format = ROIAlignBase::Param::Format; + megdnn_assert(param().format == Format::NCHW); + auto src_dtype = src.dtype, rois_dtype = rois.dtype; + megdnn_assert(src_dtype == rois_dtype && + src_dtype.category() == DTypeCategory::FLOAT); + megdnn_assert(src.ndim == 4_z, "%s", errmsg().c_str()); + size_t channels = src.shape[1]; + megdnn_assert(rois.ndim == 2_z, "%s", errmsg().c_str()); + // rois shape: bid, x0, y0, x1, y1 + megdnn_assert(rois[1] == 5_z, "%s", errmsg().c_str()); + size_t M = rois[0]; + size_t pooled_height = param().pooled_height; + size_t pooled_width = param().pooled_width; + dst = TensorLayout{{M, channels, pooled_height, pooled_width}, src.dtype}; + index = dst; + index.dtype = dtype::Int32(); +} + +void ROIAlignBase::check_layout_fwd(const TensorLayout& src, + const TensorLayout& rois, + const TensorLayout& dst, + const TensorLayout& index) { + TensorLayout dst_expected, index_expected; + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, rois, dst_expected, index_expected); + megdnn_assert_eq_shape(dst_expected, dst); + megdnn_assert_eq_shape(index_expected, index); + megdnn_assert(index.dtype == dtype::Int32()); +} + +void ROIAlignForward::deduce_layout(const TensorLayout& src, + const TensorLayout& rois, TensorLayout& dst, + TensorLayout& index) { + deduce_layout_fwd(src, rois, dst, index); +} + +void ROIAlignForward::check_exec(const TensorLayout& src, + const TensorLayout& rois, + const TensorLayout& dst, + const TensorLayout& index, + size_t workspace_in_bytes) { + check_layout_fwd(src, rois, dst, index); + auto required_workspace_in_bytes = + get_workspace_in_bytes(src, rois, dst, index); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void ROIAlignBackward::check_exec(const TensorLayout& diff, + const TensorLayout& rois, + const TensorLayout& index, + const TensorLayout& grad, + size_t workspace_in_bytes) { + check_layout_fwd(grad, rois, diff, index); + auto required_workspace_in_bytes = + get_workspace_in_bytes(diff, rois, index, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/roi_align_helper.h b/dnn/src/common/roi_align_helper.h new file mode 100644 index 00000000..00b60697 --- /dev/null +++ b/dnn/src/common/roi_align_helper.h @@ -0,0 +1,215 @@ +/** + * \file dnn/src/common/roi_align_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "megdnn/dtype.h" + +#if MEGDNN_CC_CUDA +#include "src/cuda/utils.cuh" +#endif + +namespace megdnn { +namespace roi_align { + +template +MEGDNN_HOST MEGDNN_DEVICE T bilinear_interp(const T* data, const float h, + const float w, const int height, + const int width) { + int h0 = floorf(h), w0 = floorf(w), h1 = h0 + 1, w1 = w0 + 1; + T top_left = (h0 >= 0 && h0 < height && w0 >= 0 && w0 < width) + ? data[h0 * width + w0] + : T(0.f); + T top_right = (h0 >= 0 && h0 < height && w1 >= 0 && w1 < width) + ? data[h0 * width + w1] + : T(0.f); + T bottom_left = (h1 >= 0 && h1 < height && w0 >= 0 && w0 < width) + ? data[h1 * width + w0] + : T(0.f); + T bottom_right = (h1 >= 0 && h1 < height && w1 >= 0 && w1 < width) + ? data[h1 * width + w1] + : T(0.f); + T top = top_left + (top_right - top_left) * static_cast(w - w0); + T bottom = + bottom_left + (bottom_right - bottom_left) * static_cast(w - w0); + T res = top + (bottom - top) * static_cast(h - h0); + return res; +} + +template +MEGDNN_HOST MEGDNN_DEVICE void distribute_diff(T* diff, const T top_diff, + const float h, const float w, + const int height, + const int width) { +#if MEGDNN_CC_CUDA + using namespace ::megdnn::cuda; +#endif + int h0 = floorf(h), w0 = floorf(w), h1 = h0 + 1, w1 = w0 + 1; + if (h0 >= 0 && h0 < height) { + if (w0 >= 0 && w0 < width) { + T val = top_diff * static_cast((h1 - h) * (w1 - w)); +#if MEGDNN_CC_CUDA + atomic_add(&diff[h0 * width + w0], val); +#else + diff[h0 * width + w0] += val; +#endif + } + if (w1 >= 0 && w1 < width) { + T val = top_diff * static_cast((h1 - h) * (w - w0)); +#if MEGDNN_CC_CUDA + atomic_add(&diff[h0 * width + w1], val); +#else + diff[h0 * width + w1] += val; +#endif + } + } + if (h1 >= 0 && h1 < height) { + if (w0 >= 0 && w0 < width) { + T val = top_diff * static_cast((h - h0) * (w1 - w)); +#if MEGDNN_CC_CUDA + atomic_add(&diff[h1 * width + w0], val); +#else + diff[h1 * width + w0] += val; +#endif + } + if (w1 >= 0 && w1 < width) { + T val = top_diff * static_cast((h - h0) * (w - w0)); +#if MEGDNN_CC_CUDA + atomic_add(&diff[h1 * width + w1], val); +#else + diff[h1 * width + w1] += val; +#endif + } + } +} + +template +struct MaxPooler { + T maxval; + int maxidx; + size_t cnt; + MEGDNN_HOST MEGDNN_DEVICE MaxPooler() + : maxval(DTypeTrait::min()), maxidx(-1), cnt(0) {} + MEGDNN_HOST MEGDNN_DEVICE void feed(T val, int idx) { + ++cnt; + if (val > maxval) { + maxval = val; + maxidx = idx; + } + } + MEGDNN_HOST MEGDNN_DEVICE void writeback_val(T& val) { + val = cnt > 0 ? maxval : 0; + } + MEGDNN_HOST MEGDNN_DEVICE void writeback_idx(int& idx) { idx = maxidx; } +}; + +template +struct AveragePooler { + T sum; + size_t cnt; + MEGDNN_HOST MEGDNN_DEVICE AveragePooler() : sum(T(0)), cnt(0) {} + MEGDNN_HOST MEGDNN_DEVICE void feed(T val, int) { + sum += val; + ++cnt; + } + MEGDNN_HOST MEGDNN_DEVICE void writeback_val(T& val) { + val = cnt > 0 ? 
sum / T(cnt) : 0; + } + MEGDNN_HOST MEGDNN_DEVICE void writeback_idx(int&) {} +}; + +template +struct BwdPooler { + int ph, pw; + int sample_height, sample_width; + int height, width; + float roi_start_h, roi_start_w, bin_size_h, bin_size_w; + float sample_h_rate, sample_w_rate; + MEGDNN_HOST MEGDNN_DEVICE BwdPooler(int ph, int pw, int sample_height, + int sample_width, int height, int width, + float roi_start_h, float roi_start_w, + float bin_size_h, float bin_size_w) + : ph{ph}, + pw{pw}, + sample_height{sample_height}, + sample_width{sample_width}, + height{height}, + width{width}, + roi_start_h{roi_start_h}, + roi_start_w{roi_start_w}, + bin_size_h{bin_size_h}, + bin_size_w{bin_size_w} { + sample_h_rate = 1.0f / ((float)(sample_height)); + sample_w_rate = 1.0f / ((float)(sample_width)); + } +}; + +template +struct BwdMaxPooler : public BwdPooler { + using Super = BwdPooler; + MEGDNN_HOST MEGDNN_DEVICE BwdMaxPooler(int ph, int pw, int sample_height, + int sample_width, int height, + int width, float roi_start_h, + float roi_start_w, float bin_size_h, + float bin_size_w) + : BwdPooler{ph, pw, sample_height, sample_width, + height, width, roi_start_h, roi_start_w, + bin_size_h, bin_size_w} {} + MEGDNN_HOST MEGDNN_DEVICE void update(int index, const T* diff, + const int* argmax, T* grad) { + int h_iter = argmax[index] / Super::sample_width; + int w_iter = argmax[index] - Super::sample_width * h_iter; + float hcenter = + Super::roi_start_h + + Super::bin_size_h * + (Super::ph + Super::sample_h_rate * (h_iter + 0.5f)); + float wcenter = + Super::roi_start_w + + Super::bin_size_w * + (Super::pw + Super::sample_w_rate * (w_iter + 0.5f)); + distribute_diff(grad, diff[index], hcenter, wcenter, Super::height, + Super::width); + } +}; + +template +struct BwdAveragePooler : public BwdPooler { + using Super = BwdPooler; + MEGDNN_HOST MEGDNN_DEVICE + BwdAveragePooler(int ph, int pw, int sample_height, int sample_width, + int height, int width, float roi_start_h, + float roi_start_w, float bin_size_h, float bin_size_w) + : BwdPooler{ph, pw, sample_height, sample_width, + height, width, roi_start_h, roi_start_w, + bin_size_h, bin_size_w} {} + MEGDNN_HOST MEGDNN_DEVICE void update(int index, const T* diff, + const int* /* argmax */, T* grad) { + int cnt = Super::sample_height * Super::sample_width; + for (int h_iter = 0; h_iter < Super::sample_height; ++h_iter) { + for (int w_iter = 0; w_iter < Super::sample_width; ++w_iter) { + float hcenter = Super::roi_start_h + + Super::bin_size_h * + (Super::ph + Super::sample_h_rate * + (h_iter + 0.5f)); + float wcenter = Super::roi_start_w + + Super::bin_size_w * + (Super::pw + Super::sample_w_rate * + (w_iter + 0.5f)); + T val = diff[index] / static_cast(cnt); + distribute_diff(grad, val, hcenter, wcenter, Super::height, + Super::width); + } + } + } +}; + +} // namespace roi_align +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/roi_copy.cpp b/dnn/src/common/roi_copy.cpp new file mode 100644 index 00000000..9f5eb25e --- /dev/null +++ b/dnn/src/common/roi_copy.cpp @@ -0,0 +1,57 @@ +/** + * \file dnn/src/common/roi_copy.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void ROICopyBase::deduce_layout_fwd(const TensorLayout &src, TensorLayout &dst) +{ + size_t in = src.shape[0]; + size_t ih = src.shape[1]; + size_t iw = src.shape[2]; + size_t ic = src.shape[3]; + + megdnn_assert(param().row_to <= ih && param().row_to > param().row_from); + megdnn_assert(param().col_to <= iw && param().col_to > param().col_from); + megdnn_assert(ic == 1_z || ic == 3_z); + size_t oh = param().row_to - param().row_from; + size_t ow = param().col_to - param().col_from; + + dst = TensorLayout(TensorShape({in, oh, ow, ic}), src.dtype); +} + +void ROICopyBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &dst) +{ + TensorLayout dst_expected; + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, dst_expected); + megdnn_assert_eq_shape(dst_expected, dst); +} + +void ROICopy::deduce_layout(const TensorLayout &src, TensorLayout &dst) +{ + deduce_layout_fwd(src, dst); +} + +void ROICopy::check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/roi_pooling.cpp b/dnn/src/common/roi_pooling.cpp new file mode 100644 index 00000000..876d5349 --- /dev/null +++ b/dnn/src/common/roi_pooling.cpp @@ -0,0 +1,81 @@ +/** + * \file dnn/src/common/roi_pooling.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void ROIPoolingBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &rois, + const TensorLayout &dst, + const TensorLayout &index) +{ + // all should be contiguous + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(rois); + megdnn_assert_contiguous(dst); + megdnn_assert_contiguous(index); + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + + megdnn_layout_msg(rois) + ", " + + megdnn_layout_msg(dst) + ", " + + megdnn_layout_msg(index); + }; + MEGDNN_MARK_USED_VAR(errmsg); + // src + megdnn_assert(src.ndim == 4_z, "%s", errmsg().c_str()); + auto C = src.shape[1]; + // rois + megdnn_assert(rois.ndim == 2_z, "%s", errmsg().c_str()); + auto M = rois.shape[0]; + megdnn_assert(rois[1] == 5_z, "%s", errmsg().c_str()); + // dst + megdnn_assert(dst[0] == M, "%s", errmsg().c_str()); + megdnn_assert(dst[1] == C, "%s", errmsg().c_str()); + // index + megdnn_assert_eq_shape(index, dst); + + megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); + megdnn_assert(rois.dtype.category() == DTypeCategory::FLOAT); + megdnn_assert(dst.dtype.category() == DTypeCategory::FLOAT); + megdnn_assert(index.dtype == dtype::Int32()); +} + +void ROIPoolingForward::check_exec(const TensorLayout &src, + const TensorLayout &rois, + const TensorLayout &dst, + const TensorLayout &index, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, rois, dst, index); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, + rois, dst, index); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void ROIPoolingBackward::check_exec(const TensorLayout &diff, + const TensorLayout &src, + const TensorLayout &rois, + const TensorLayout &index, + const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, rois, diff, index); + megdnn_assert_eq_layout(src, grad); + auto required_workspace_in_bytes = get_workspace_in_bytes(diff, + src, rois, index, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/roi_pooling_helper.h b/dnn/src/common/roi_pooling_helper.h new file mode 100644 index 00000000..6fda0592 --- /dev/null +++ b/dnn/src/common/roi_pooling_helper.h @@ -0,0 +1,120 @@ +/** + * \file dnn/src/common/roi_pooling_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/dtype.h" + +namespace megdnn { +namespace roi_pooling { + +template struct MaxPooler { + T maxval; + int maxidx; + size_t cnt; + MEGDNN_HOST MEGDNN_DEVICE MaxPooler(): + maxval(DTypeTrait::min()), + maxidx(-1), + cnt(0) + {} + MEGDNN_HOST MEGDNN_DEVICE void feed(T val, int idx) + { + ++cnt; + if (val > maxval) { + maxval = val; + maxidx = idx; + } + } + MEGDNN_HOST MEGDNN_DEVICE void writeback_val(T &val) + { + val = cnt > 0 ? 
maxval : 0; + } + MEGDNN_HOST MEGDNN_DEVICE void writeback_idx(int &idx) + { + idx = maxidx; + } +}; +template struct AveragePooler { + T sum; + size_t cnt; + MEGDNN_HOST MEGDNN_DEVICE AveragePooler(): + sum(T(0)), cnt(0) + {} + MEGDNN_HOST MEGDNN_DEVICE void feed(T val, int) + { + sum += val; + ++cnt; + } + MEGDNN_HOST MEGDNN_DEVICE void writeback_val(T &val) + { + val = cnt > 0 ? sum / T(cnt) : 0; + } + MEGDNN_HOST MEGDNN_DEVICE void writeback_idx(int &) + { + } +}; + +template struct BwdMaxPooler { + MEGDNN_HOST MEGDNN_DEVICE void update( + int ph, int pw, int h, int w, + float /* bin_size_h */, float /* bin_size_w */, + int /* roi_start_h */, int /* roi_start_w */, + size_t /* pooled_height */, size_t pooled_width, + size_t /* height */, size_t width, + const T *offset_src_diff, + const int *offset_fp_idx, + T &gradient) + { + if (offset_fp_idx[ph * pooled_width + pw] == + (int)(h * width + w)) { + gradient += offset_src_diff[ph * pooled_width + pw]; + } + } +}; + +template struct BwdAveragePooler +{ + MEGDNN_HOST MEGDNN_DEVICE void update( + int ph, int pw, int h, int w, float bin_size_h, float bin_size_w, + int roi_start_h, int roi_start_w, + size_t /* pooled_height */, size_t pooled_width, + size_t height, size_t width, + const T *offset_src_diff, + const int * /* offset_fp_idx */, + T &gradient) + { +#if MEGDNN_CC_HOST + using std::min; + using std::max; +#endif + int hstart = static_cast(floor(static_cast(ph) + * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) + * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) + * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) + * bin_size_w)); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart + roi_start_h, 0), (int)height); + hend = min(max(hend + roi_start_h, 0), (int)height); + wstart = min(max(wstart + roi_start_w, 0), (int)width); + wend = min(max(wend + roi_start_w, 0), (int)width); + int size = (hend - hstart) * (wend - wstart); + float inv_size = 1.0f / size; + if (h >= hstart && h < hend && w >= wstart && w < wend) { + gradient += offset_src_diff[ph * pooled_width + pw] * inv_size; + } + } +}; + +} // namespace roi_pooling +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/rotate.cpp b/dnn/src/common/rotate.cpp new file mode 100644 index 00000000..69db12e5 --- /dev/null +++ b/dnn/src/common/rotate.cpp @@ -0,0 +1,58 @@ +/** + * \file dnn/src/common/rotate.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void RotateBase::deduce_layout_fwd(const TensorLayout &src, TensorLayout &dst) +{ + auto errmsg = [&]() { return megdnn_layout_msg(src); }; + MEGDNN_MARK_USED_VAR(errmsg); + + megdnn_assert(src.ndim == 4_z && (src.shape[3] == 1_z || + src.shape[3] == 3_z), "%s", errmsg().c_str()); + + size_t in = src.shape[0]; + size_t ih = src.shape[1]; + size_t iw = src.shape[2]; + size_t ic = src.shape[3]; + + dst = TensorLayout(TensorShape({in, iw, ih, ic}), src.dtype); +} + +void RotateBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &dst) +{ + TensorLayout dst_expected; + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, dst_expected); + megdnn_assert_eq_shape(dst_expected, dst); +} + +void Rotate::deduce_layout(const TensorLayout &src, TensorLayout &dst) +{ + deduce_layout_fwd(src, dst); +} + +void Rotate::check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/rounding_converter.cuh b/dnn/src/common/rounding_converter.cuh new file mode 100644 index 00000000..5a1c6327 --- /dev/null +++ b/dnn/src/common/rounding_converter.cuh @@ -0,0 +1,74 @@ +/** + * \file dnn/src/common/rounding_converter.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once +#include "megdnn/dtype.h" + +#if MEGDNN_CC_HOST && !defined(__host__) +#define MEGDNN_HOST_DEVICE_SELF_DEFINE +#define __host__ +#define __device__ +#if __GNUC__ || __has_attribute(always_inline) +#define __forceinline__ inline __attribute__((always_inline)) +#else +#define __forceinline__ inline +#endif +#endif + +namespace megdnn { +namespace rounding { + +template +struct RoundingConverter; + +template <> +struct RoundingConverter { + __host__ __device__ __forceinline__ float operator()(float x) const { + return x; + } +}; + +#ifndef MEGDNN_DISABLE_FLOAT16 + +template <> +struct RoundingConverter { + __host__ __device__ __forceinline__ half_float::half operator()( + float x) const { + return static_cast(x); + } +}; + +#endif // #ifdef MEGDNN_DISABLE_FLOAT16 + +template <> +struct RoundingConverter { + __host__ __device__ __forceinline__ int8_t operator()(float x) const { +#if MEGDNN_CC_HOST + using std::round; +#endif + return static_cast(round(x)); + } +}; + +template <> +struct RoundingConverter { + __host__ __device__ __forceinline__ uint8_t operator()(float x) const { +#if MEGDNN_CC_HOST + using std::round; +#endif + return static_cast(round(x)); + } +}; + +} // namespace rounding +} // namespace megdnn + +/* vim: set ft=cpp: */ diff --git a/dnn/src/common/separableConv.cpp b/dnn/src/common/separableConv.cpp new file mode 100644 index 00000000..a82a5828 --- /dev/null +++ b/dnn/src/common/separableConv.cpp @@ -0,0 +1,93 @@ +/** + * \file dnn/src/common/separableConv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void SeparableConvBase::deduce_layout_fwd(const TensorLayout &src, + const TensorLayout &filter_x, + const TensorLayout &filter_y, + TensorLayout &dst) +{ + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + + megdnn_layout_msg(filter_x) + ", " + + megdnn_layout_msg(dst) + ", " + + megdnn_mangle("is_xcorr=") + + megdnn_mangle("borderMode=") + + std::to_string((param().mode == Mode::CROSS_CORRELATION)) + ", " + + std::to_string((int)(param().borderMode)) + ", " + + megdnn_mangle("pad_h=") + std::to_string(param().pad_h) + ", " + + megdnn_mangle("pad_w=") + std::to_string(param().pad_w) + ", " + + megdnn_mangle("stride_h=") + std::to_string(param().stride_h) + ", " + + megdnn_mangle("stride_w=") + std::to_string(param().stride_w); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter_x); + megdnn_assert(src.ndim == 4_z, "%s", errmsg().c_str()); + megdnn_assert(filter_x.ndim == 4_z, "%s", errmsg().c_str()); + size_t n = src[0]; + size_t ic = src[1]; + size_t ih = src[2]; + size_t iw = src[3]; + size_t oc = filter_x[0]; + megdnn_assert_eq_layout(filter_x, filter_y); + megdnn_assert(filter_x[1] == ic, "%s", errmsg().c_str()); + size_t fw = filter_x[3]; + size_t fh = fw; + size_t sh = this->param().stride_h; + size_t sw = this->param().stride_w; + size_t ph = this->param().pad_h; + size_t pw = this->param().pad_w; + size_t oh, ow; + infer_conv_shape2d(ih, iw, fh, fw, sh, sw, ph, pw, oh, ow); + dst = TensorLayout(TensorShape({n, oc, oh, ow}), src.dtype); +} + +void SeparableConvBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &filter_x, + const TensorLayout &filter_y, + const TensorLayout &dst) +{ + TensorLayout dst_expected; + megdnn_assert_eq_dtype(src, filter_x); + megdnn_assert_eq_dtype(src, filter_y); + megdnn_assert_eq_layout(filter_x, filter_y); + megdnn_assert_eq_dtype(src, dst); + deduce_layout_fwd(src, filter_x, filter_y, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); +} + +void SeparableConvForward::deduce_layout(const TensorLayout &src, + const TensorLayout &filter_x, + const TensorLayout &filter_y, + TensorLayout &dst) +{ + deduce_layout_fwd(src, filter_x, filter_y, dst); +} + +void SeparableConvForward::check_exec(const TensorLayout &src, + const TensorLayout &filter_x, + const TensorLayout &filter_y, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, filter_x, filter_y, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter_x, filter_y, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/separableFilter.cpp b/dnn/src/common/separableFilter.cpp new file mode 100644 index 00000000..d0c29461 --- /dev/null +++ b/dnn/src/common/separableFilter.cpp @@ -0,0 +1,79 @@ +/** + * \file dnn/src/common/separableFilter.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void SeparableFilterBase::deduce_layout_fwd(const TensorLayout& src, + const TensorLayout& filter_x, + const TensorLayout& filter_y, + TensorLayout& dst) { + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(filter_x) + + ", " + megdnn_layout_msg(dst) + ", " + + megdnn_mangle("borderMode=") + + std::to_string((int)(param().borderMode)) + ", " + + megdnn_mangle("ksize_h=") + std::to_string(param().ksize_h) + + ", " + megdnn_mangle("ksize_w=") + + std::to_string(param().ksize_w) + ", " + + megdnn_mangle("anchor_h=") + std::to_string(param().anchor_h) + + ", " + megdnn_mangle("anchor_w=") + + std::to_string(param().anchor_w); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(filter_x); + megdnn_assert_contiguous(filter_y); + megdnn_assert(src.ndim == 4_z, "%s", errmsg().c_str()); + megdnn_assert(param().format == Param::Format::NHWC, + "Only NHWC was supported by now"); + size_t n = src[0]; + size_t ih = src[1]; + size_t iw = src[2]; + size_t ic = src[3]; + dst = TensorLayout(TensorShape({n, ih, iw, ic}), src.dtype); +} + +void SeparableFilterBase::check_layout_fwd(const TensorLayout& src, + const TensorLayout& filter_x, + const TensorLayout& filter_y, + const TensorLayout& dst) { + TensorLayout dst_expected; + megdnn_assert_eq_layout(src, dst); + deduce_layout_fwd(src, filter_x, filter_y, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); +} + +void SeparableFilterForward::deduce_layout(const TensorLayout& src, + const TensorLayout& filter_x, + const TensorLayout& filter_y, + TensorLayout& dst) { + deduce_layout_fwd(src, filter_x, filter_y, dst); +} + +void SeparableFilterForward::check_exec(const TensorLayout& src, + const TensorLayout& filter_x, + const TensorLayout& filter_y, + const TensorLayout& dst, + size_t workspace_in_bytes) { + megdnn_assert(param().ksize_h > 0 && (param().ksize_h & 1)); + megdnn_assert(param().ksize_w > 0 && (param().ksize_w & 1)); + check_layout_fwd(src, filter_x, filter_y, dst); + auto required_workspace_in_bytes = + get_workspace_in_bytes(src, filter_x, filter_y, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/simd_macro/epilogue.h b/dnn/src/common/simd_macro/epilogue.h new file mode 100644 index 00000000..f6559b5c --- /dev/null +++ b/dnn/src/common/simd_macro/epilogue.h @@ -0,0 +1,60 @@ +/** + * \file dnn/src/common/simd_macro/epilogue.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#undef MEGDNN_SIMD_NAME +#undef MEGDNN_SIMD_TARGET +#undef MEGDNN_SIMD_ATTRIBUTE_TARGET +#undef MEGDNN_SIMD_WIDTH +#undef MEGDNN_SIMD_TYPE +#undef MEGDNN_SIMD_LOADU +#undef MEGDNN_SIMD_STOREU +#undef MEGDNN_SIMD_SETZERO +#undef MEGDNN_SIMD_SET1 +#undef MEGDNN_SIMD_FMADD +#undef MEGDNN_SIMD_MAX + +#ifdef MEGDNN_SIMD_UZP +#undef MEGDNN_SIMD_UZP +#endif + +#ifdef _INSERTPS_NDX +#undef _INSERTPS_NDX +#endif + +#ifdef _M64 +#undef _M64 +#endif + +#ifdef _M64f +#undef _M64f +#endif + +#ifdef _pM128i +#undef _pM128i +#endif + +#ifdef _pM128 +#undef _pM128 +#endif + +#ifdef _M128i +#undef _M128i +#endif + +#ifdef _M128 +#undef _M128 +#endif + +#undef MEGDNN_SIMD_LOAD2 +#undef MEGDNN_SIMD_EXT +#undef MEGDNN_SIMD_MUL +#undef MEGDNN_SIMD_FMA_LANE +#undef MEGDNN_SIMD_VEC +#undef MEGDNN_SIMD_SET_LANE diff --git a/dnn/src/common/small_vector.cpp b/dnn/src/common/small_vector.cpp new file mode 100644 index 00000000..67f564fc --- /dev/null +++ b/dnn/src/common/small_vector.cpp @@ -0,0 +1,43 @@ +/** + * \file dnn/src/common/small_vector.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/thin/small_vector.h" +#include "src/common/utils.h" + +using namespace megdnn; + +void SmallVectorBase::on_invalid_at(size_t idx, size_t size) { + megdnn_throw(ssprintf("invalid vector at(): idx=%zu size=%zu", idx, size)); + MEGDNN_MARK_USED_VAR(idx); + MEGDNN_MARK_USED_VAR(size); +} + +void SmallVectorBase::grow_pod(void* first_elm_ptr, size_t min_sz_in_bytes, + size_t type_size) { + size_t cur_sz_in_bytes = size_in_bytes(); + size_t new_capacity_in_bytes = 2 * capacity_in_bytes() + type_size; + if (new_capacity_in_bytes < min_sz_in_bytes) { + new_capacity_in_bytes = min_sz_in_bytes; + } + void* new_begin; + if (first_elm_ptr == m_begin_ptr) { + new_begin = malloc(new_capacity_in_bytes); + memcpy(new_begin, m_begin_ptr, cur_sz_in_bytes); + } else { + new_begin = realloc(this->m_begin_ptr, new_capacity_in_bytes); + } + this->m_begin_ptr = new_begin; + this->m_end_ptr = static_cast(this->m_begin_ptr) + cur_sz_in_bytes; + this->m_capacity_ptr = + static_cast(this->m_begin_ptr) + new_capacity_in_bytes; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/svd.cpp b/dnn/src/common/svd.cpp new file mode 100644 index 00000000..367374ec --- /dev/null +++ b/dnn/src/common/svd.cpp @@ -0,0 +1,95 @@ +/** + * \file dnn/src/common/svd.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs/linalg.h" + +#include "src/common/utils.h" + +using namespace megdnn; + +void SVD::deduce_layout(const TensorLayout& src, TensorLayout& u, + TensorLayout& s, TensorLayout& vt) { + Param p = param(); + size_t m, n; + canonize_params(src, nullptr, &m, &n); + SmallVector shape_prefix; + for (size_t i = 0; i < src.ndim - 2; i++) { + shape_prefix.push_back(src[i]); + } + SmallVector shape_s(shape_prefix), shape_u, shape_vt; + shape_s.push_back(std::min(m, n)); + if (p.compute_uv) { + shape_u = shape_prefix; + shape_vt = shape_prefix; + + size_t ucols = m; + size_t vrows = n; + if (!p.full_matrices) { + ucols = vrows = std::min(m, n); + } + // let P = min(M, N) + // M x M or M x P + shape_u.push_back(m); + shape_u.push_back(ucols); + + // N x N or P x N + shape_vt.push_back(vrows); + shape_vt.push_back(n); + } else { + shape_u = {0}; + shape_vt = {0}; + } + s = {shape_s, src.dtype}; + u = {shape_u, src.dtype}; + vt = {shape_vt, src.dtype}; +} + +size_t SVD::get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& u, const TensorLayout& s, + const TensorLayout& vt) { + MEGDNN_MARK_USED_VAR(u); + MEGDNN_MARK_USED_VAR(s); + MEGDNN_MARK_USED_VAR(vt); + + size_t block_cnt, m, n; + canonize_params(src, &block_cnt, &m, &n); + return get_workspace_in_bytes(block_cnt, m, n, src.dtype.size()); +} + +void SVD::canonize_params(const TensorLayout& layout, size_t* block_cnt, + size_t* m, size_t* n) { + megdnn_assert(layout.is_contiguous() && layout.ndim >= 2, + "invalid SVD layout: %s", layout.to_string().c_str()); + megdnn_assert(layout.dtype == dtype::Float32(), "SVD only supports f32"); + if (block_cnt) { + *block_cnt = 1; + for (size_t i = 0; i < layout.ndim - 2; ++i) { + *block_cnt *= layout[i]; + } + } + if (n) { + *n = layout[layout.ndim - 1]; + } + if (m) { + *m = layout[layout.ndim - 2]; + } +} + +void SVD::check_exec(const TensorLayout& src, const TensorLayout& u, + const TensorLayout& s, const TensorLayout& vt, + size_t workspace_in_bytes) { + size_t m, n; + canonize_params(src, nullptr, &m, &n); + // get_workspace_in_bytes runs the canonize_params, thus runs the check + auto required_workspace_in_bytes = get_workspace_in_bytes(src, u, s, vt); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/tensor_format.cpp b/dnn/src/common/tensor_format.cpp new file mode 100644 index 00000000..71c89838 --- /dev/null +++ b/dnn/src/common/tensor_format.cpp @@ -0,0 +1,435 @@ +/** + * \file dnn/src/common/tensor_format.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/tensor_format.h" +#include "megdnn/basic_types.h" +#include "src/common/utils.h" + +#include + +using namespace megdnn; +using namespace megdnn::detail; + +namespace { +DefaultTensorFormat* default_tensor_format_obj; +} + +/* ===================== TensorFormat ===================== */ + +TensorFormat TensorFormat::deserialize(const std::string& bin, + const Handle* handle) { + using Type = TensorFormat::Type; + auto type = reinterpret_cast(bin.data()); + switch (*type) { + case Type::DEFAULT: + return DefaultTensorFormat::deserialize(handle, type + 1, + bin.size() - sizeof(Type)); + case Type::IMAGE2D_PACK4: + return Image2DPack4TensorFormat::deserialize( + handle, type + 1, bin.size() - sizeof(Type)); + default: + megdnn_throw("invalid tensor format type in deserialize"); + } +} + +TensorFormat::Format() : m_impl{DefaultTensorFormat::make().m_impl} {} + +std::string TensorFormat::to_string() const { + return m_impl->to_string(); +} + +std::string TensorFormat::serialize() const { + std::string ret; + ret.reserve(32); + ret.assign(sizeof(Type), '\0'); + *reinterpret_cast(&ret[0]) = type(); + m_impl->serialize_append(ret); + return ret; +} + +void TensorFormat::on_bad_cvt(Type dst_type) const { + MEGDNN_MARK_USED_VAR(dst_type); + megdnn_throw(ssprintf("can not convert tensor format %s to %d", + impl()->to_string().c_str(), + static_cast(dst_type))); +} + +bool TensorFormat::is_default() const { + return m_impl == default_tensor_format_obj; +} + +/* ===================== DefaultFormat ===================== */ +size_t DefaultTensorFormat::init_contiguous_stride(TensorLayout& layout) const { + if (!layout.ndim) + return 0; + megdnn_assert(layout.ndim <= TensorLayout::MAX_NDIM); + size_t accum = 1; + SafeMultiplies mul; + for (size_t i = layout.ndim; i; --i) { + layout.stride[i - 1] = accum; + accum = mul(accum, layout.shape[i - 1]); + } + return accum; +} + +bool DefaultTensorFormat::is_contiguous_spec(const TensorLayout& layout) const { + return layout.is_physical_contiguous(); +} + +TensorLayout DefaultTensorFormat::collapse_contiguous_spec( + const TensorLayout& layout) const { + megdnn_assert(layout.ndim); + TensorLayout res{layout}; + + // remove all dims with shape 1 + for (int i = static_cast(res.ndim) - 1; i >= 0 && res.ndim >= 2; --i) { + if (!res.shape[i]) { + // empty tensor + res.ndim = 1; + res.shape[0] = 0; + res.stride[0] = 1; + return res; + } + if (res.shape[i] == 1) + res.remove_axis_inplace(i); + } + + if (res.ndim == 1) { + if (res.shape[0] <= 1) { + // make it the "most canonical" contiguous layout for scalars or + // empty tensors + res.stride[0] = 1; + } + return res; + } + + megdnn_assert(res.ndim && res.shape[res.ndim - 1]); + for (int i = static_cast(res.ndim) - 2; i >= 0; --i) { + megdnn_assert(res.shape[i]); + if (res.stride[i] == + res.stride[i + 1] * static_cast(res.shape[i + 1])) { + res.shape[i] *= res.shape[i + 1]; + res.stride[i] = res.stride[i + 1]; + res.remove_axis_inplace(i + 1); + } + } + return res; +} + +TensorLayout::Span DefaultTensorFormat::span_spec( + const TensorLayout& layout) const { + if (layout.ndim == 0) + return {0, 0, 0, 0}; + + ptrdiff_t low_elem = 0; + size_t high_elem = 0; + for (size_t i = 0; i < layout.ndim; ++i) { + auto shape_val = layout.shape[i]; + if (!shape_val) { + return {0, 0, 0, 0}; + } + auto stride_val = layout.stride[i]; + if (stride_val > 0) { + high_elem += (shape_val - 1) * stride_val; + } else { + low_elem += (shape_val - 1) * stride_val; + } + } + ++high_elem; + ptrdiff_t low_byte; + if 
(low_elem < 0) { + megdnn_assert(!layout.dtype.is_low_bit(), + "tensors with low-bit dytes shouldn't have negative " + "strides"); + low_byte = low_elem * layout.dtype.size(); + } else { + low_byte = 0; + } + size_t high_byte = layout.dtype.size(high_elem); + return TensorLayout::Span(low_elem, low_byte, high_elem, high_byte); +} + +std::string DefaultTensorFormat::to_string() const { + return "default{}"; +} + +void DefaultTensorFormat::serialize_append(std::string&) const {} + +TensorFormat DefaultTensorFormat::deserialize(const Handle* handle, + const void* buf, size_t size) { + MEGDNN_MARK_USED_VAR(handle); + MEGDNN_MARK_USED_VAR(buf); + megdnn_assert(!size); + return make(); +} + +TensorFormat DefaultTensorFormat::make() { + // use static storage so the object is accessible in global destructing + // phase + static std::aligned_storage_t + storage; + static DefaultTensorFormat* obj = default_tensor_format_obj = + new (&storage) DefaultTensorFormat{}; + return impl_to_tensor_format(obj); +} + +/* ===================== Image2DTensorFormatBase ===================== */ + +Image2DTensorFormatBase::Image2DTensorFormatBase(Type type, size_t align_axis, + size_t align_size_in_byte) + : ImplBase(type) { + megdnn_assert(align_size_in_byte && align_axis); + m_align_axis = align_axis; + m_align_size_in_byte_log2 = __builtin_ctz(align_size_in_byte); + megdnn_assert((1u << m_align_size_in_byte_log2) == align_size_in_byte, + "align size not power of 2: %zu", align_size_in_byte); +} + +size_t Image2DTensorFormatBase::init_contiguous_stride( + TensorLayout& layout) const { + if (!layout.ndim) + return 0; + megdnn_assert(layout.dtype.valid() && layout.ndim > m_align_axis, + "dtype=%s ndim=%zu align=%zu", layout.dtype.name(), + layout.ndim, m_align_axis); + size_t align_size = align_size_in_byte(layout.dtype.size_log()); + size_t accum = 1; + SafeMultiplies mul; + for (size_t i = layout.ndim; i; --i) { + if (i == m_align_axis) { + accum = get_aligned_power2(accum, align_size); + } + + layout.stride[i - 1] = accum; + accum = mul(accum, layout.shape[i - 1]); + } + assert_valid(layout); + return accum; +}; + +bool Image2DTensorFormatBase::is_contiguous_spec( + const TensorLayout& layout) const { + megdnn_assert(layout.dtype.valid()); + size_t align_size = align_size_in_byte(layout.dtype.size_log()); + ptrdiff_t expected = 1; + int height_axis = static_cast(m_align_axis - 1); + for (int i = layout.ndim - 1; i >= 0; --i) { + if (i == height_axis) { + expected = megdnn::get_aligned_power2(expected, align_size); + } + if (layout.shape[i] != 1 && layout.stride[i] != expected) { + if (i == height_axis) { + // allow row pitch to be larger than minimal required + auto s = layout.stride[i]; + if (!s) { + // broadcast is not contiguous + return false; + } + + size_t mask = align_size_in_byte(layout.dtype.size_log()) - 1; + megdnn_assert(s > expected && !(s & mask), + "invalid row pitch: %d; layout: %s", + static_cast(s), layout.to_string().c_str()); + expected = s; + } else { + return false; + } + } + expected *= layout.shape[i]; + } + // empty tensors are not contiguous + return expected != 0; +} + +TensorLayout Image2DTensorFormatBase::collapse_contiguous_spec( + const TensorLayout& layout) const { + assert_valid(layout); + TensorLayout res{layout}; + int new_axis = m_align_axis; + // remove all dims with shape 1 + for (int i = static_cast(res.ndim) - 1; i >= 0 && res.ndim >= 3; --i) { + if (i == new_axis && static_cast(res.ndim) == new_axis + 1) { + // i is the only width dim + continue; + } + if (i == new_axis - 
1 && !i) { + // new_xis == 1 && i == 0, i is the only height dim + continue; + } + if (res.shape[i] == 1) { + res.remove_axis_inplace(i); + if (i < new_axis) + new_axis -= 1; + } + } + megdnn_assert(res.ndim >= 2); + + auto contig_with_next = [&](size_t i) { + return res.stride[i] == + res.stride[i + 1] * static_cast(res.shape[i + 1]); + }; + + for (int i = static_cast(res.ndim) - 2; i >= new_axis; --i) { + megdnn_assert(res.shape[i]); + if (contig_with_next(i)) { + // remove next axis + res.shape[i] *= res.shape[i + 1]; + res.stride[i] = res.stride[i + 1]; + res.remove_axis_inplace(i + 1); + } + } + + for (int i = new_axis - 2; i >= 0; --i) { + megdnn_assert(res.shape[i]); + if (contig_with_next(i)) { + res.shape[i] *= res.shape[i + 1]; + res.stride[i] = res.stride[i + 1]; + res.remove_axis_inplace(i + 1); + if (i <= new_axis - 2) + new_axis -= 1; + } + } + res.format = change_axis(new_axis); + return res; +} + +TensorLayout::Span Image2DTensorFormatBase::span_spec( + const TensorLayout& layout) const { + assert_valid(layout); + size_t size = image_height(layout) * image_row_pitch(layout); + auto mask = (1 << layout.dtype.size_log()) - 1; + megdnn_assert(!(size & mask), "unaligned size: %zu", size); + return {0, 0, size >> layout.dtype.size_log(), size}; +} + +void Image2DTensorFormatBase::serialize_append(std::string& result) const { + SerializePack pack; + pack.align_axis = m_align_axis; + megdnn_assert(pack.align_axis == m_align_axis); // detect overflow + result.append(reinterpret_cast(&pack), sizeof(pack)); +} + +size_t Image2DTensorFormatBase::image_height(const TensorLayout& layout) const { + size_t accum = 1; + for (int i = m_align_axis - 1; i >= 0; --i) { + if (layout.stride[i] == 0) { + // this dimension is broadcasted + } else { + accum *= layout.shape[i]; + } + } + return accum; +} + +size_t Image2DTensorFormatBase::image_row_pitch( + const TensorLayout& layout) const { + for (int i = m_align_axis - 1; i >= 0; --i) { + // find a non-broadcast axis + if (auto s = layout.stride[i]) { + return layout.dtype.size(s); + } + } + // use width for all broadcasted case + return get_aligned_power2( + layout.dtype.size(image_width_elems(layout)), + 1 << m_align_size_in_byte_log2); +} + +void Image2DTensorFormatBase::assert_valid(const TensorLayout& layout) const { + megdnn_assert(layout.dtype.valid() && layout.ndim > m_align_axis); + ptrdiff_t first_non_zero_stride = 0; + for (int i = layout.ndim - 1; i >= 0; --i) { + megdnn_assert(layout.shape[i] && layout.stride[i] >= 0); + if (i < static_cast(m_align_axis) && !first_non_zero_stride) { + first_non_zero_stride = layout.stride[i]; + } + } + size_t mask = align_size_in_byte(layout.dtype.size_log()) - 1; + megdnn_assert(!(first_non_zero_stride & mask), + "first stride is %d, but alignment is %zu", + static_cast(first_non_zero_stride), mask + 1); +} + +size_t Image2DTensorFormatBase::image_width_elems( + const TensorLayout& layout) const { + size_t high_elem = 0; + for (size_t i = m_align_axis; i < layout.ndim; ++i) { + high_elem += (layout.shape[i] - 1) * layout.stride[i]; + } + return high_elem + 1; +} + +std::string Image2DTensorFormatBase::to_string() const { + return ssprintf("I2D{%zu,%d}", m_align_axis, + 1 << m_align_size_in_byte_log2); +} + +/* ===================== Image2DPackedTensorFormatBase ===================== */ + +template +size_t Image2DPackedTensorFormatBase::image_width( + const TensorLayout& layout) const { + auto ret = image_width_elems(layout); + megdnn_assert(ret % PIXEL_SIZE == 0); + return ret / PIXEL_SIZE; +} + 
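A minimal standalone sketch of the stride rule used by the image-2D formats above, for orientation only: dimensions at or after align_axis form one image row, and the running stride is rounded up at that boundary so every row starts at an aligned offset. This is not megdnn code; the helper names and the element-based alignment parameter are assumptions made for illustration, whereas init_contiguous_stride() above works with a byte alignment converted through the dtype size.

static size_t align_up_pow2(size_t v, size_t align) {
    // `align` is assumed to be a power of two, as the constructor above asserts.
    return (v + align - 1) & ~(align - 1);
}

// Contiguous strides for an image-2D layout: dims at or after `align_axis`
// form one row, and the row pitch is padded up to `align_in_elems`.
static void image2d_strides(const size_t* shape, size_t ndim, size_t align_axis,
                            size_t align_in_elems, ptrdiff_t* stride) {
    size_t accum = 1;
    for (size_t i = ndim; i; --i) {
        if (i == align_axis)
            accum = align_up_pow2(accum, align_in_elems);
        stride[i - 1] = static_cast<ptrdiff_t>(accum);
        accum *= shape[i - 1];
    }
}

// Example: shape {8, 3, 3, 4} with align_axis = 2 and a 16-element alignment
// yields strides {48, 16, 4, 1}; each (3, 4) row holds 12 elements but is
// padded to a 16-element pitch, mirroring init_contiguous_stride() above.
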
+template +void Image2DPackedTensorFormatBase::assert_valid( + const TensorLayout& layout) const { + Image2DTensorFormatBase::assert_valid(layout); + megdnn_assert(!(layout.shape[layout.ndim - 1] % PIXEL_SIZE), + "bad shape: %zu", layout.shape[layout.ndim - 1]); +} + +namespace megdnn { +namespace detail { +template class Image2DPackedTensorFormatBase<4>; +} // namespace detail +} // namespace megdnn + +/* ===================== Image2DPack4TensorFormat ===================== */ +TensorFormat Image2DPack4TensorFormat::make_raw(size_t align_axis, + size_t align_size_in_byte) { + static std::mutex mtx; + static std::unordered_map> + cache; + megdnn_assert(std::max(align_axis, align_size_in_byte) <= + std::numeric_limits::max()); + MEGDNN_LOCK_GUARD(mtx); + auto&& ptr = cache[(static_cast(align_axis) << 32) | + align_size_in_byte]; + if (!ptr) { + ptr.reset(new Image2DPack4TensorFormat{align_axis, align_size_in_byte}); + } + return impl_to_tensor_format(ptr.get()); +} + +TensorFormat Image2DPack4TensorFormat::make(size_t align_axis, + const Handle* handle) { + return make_raw(align_axis, handle->image2d_pitch_alignment()); +} + +TensorFormat Image2DPack4TensorFormat::deserialize(const Handle* handle, + const void* buf, + size_t size) { + megdnn_assert(size == sizeof(SerializePack)); + auto pack = *static_cast(buf); + return make(pack.align_axis, handle); +} + +TensorFormat Image2DPack4TensorFormat::change_axis(size_t axis) const { + return make_raw(axis, align_size_in_byte()); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/tensor_iter.cpp b/dnn/src/common/tensor_iter.cpp new file mode 100644 index 00000000..46ccb57c --- /dev/null +++ b/dnn/src/common/tensor_iter.cpp @@ -0,0 +1,93 @@ +/** + * \file dnn/src/common/tensor_iter.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/tensor_iter.h" +#include "src/common/utils.h" + +using namespace megdnn; + +////////////////////////// TypeRef //////////////////// +TypeRef::TypeRef(dt_quint4* _ptr, size_t _offset) { + ptr = reinterpret_cast(_ptr); + offset = _offset; + uint8_t cur = ptr[offset >> 1]; + val = convert(cur, dt_quint4(cur), offset & 0x1) + .as_uint8(); + +} + +void TypeRef::operator=(const uint8_t _) { + uint8_t cur = ptr[offset >> 1]; + ptr[offset >> 1] = + convert(dt_quint4(_), cur, offset & 0x1); +} + +TypeRef::TypeRef(dt_qint4* _ptr, size_t _offset) { + ptr = reinterpret_cast(_ptr); + offset = _offset; + int8_t cur = ptr[offset >> 1]; + val = convert(cur, dt_qint4(cur), offset & 0x1).as_int8(); +} + +void TypeRef::operator=(const int8_t _) { + int8_t cur = ptr[offset >> 1]; + ptr[offset >> 1] = + convert(dt_qint4(_), cur, offset & 0x1); +} + +////////////////////// TensorIter ///////////////////// + +template +typename TensorIter::Iter +TensorIter::Iter::make( + ctype *ptr, const TensorLayout &layout, size_t offset) { + megdnn_assert(layout.ndim); + Iter rst; + rst.m_ptr = ptr; + if (valonly) + rst.m_layout = layout.collapse_contiguous(); + else + rst.m_layout = layout; + rst.m_logical_offset = offset; + rst.m_tot_nr_elems = rst.m_layout.total_nr_elems(); + rst.m_offset = 0; + megdnn_assert(offset <= rst.m_tot_nr_elems); + for (int i = rst.m_layout.ndim - 1; i >= 0; i --) { + auto shp = rst.m_layout.shape[i]; + auto stride = rst.m_layout.stride[i]; + if (!shp) { + // empty iter for empty layout + return {}; + } + rst.m_axis_reset_stride[i] = stride * (shp - 1); + rst.m_axis_offset[i] = offset % shp; + rst.m_offset += rst.m_axis_offset[i] * stride; + offset /= shp; + } + return rst; +} + +template +void TensorIter::Iter::on_access_idx_valonly_true() const { + megdnn_throw("can not access idx of TensorIter if valonly is true"); +} + +namespace megdnn { +#define cb(_dt) \ + template class TensorIter::ctype, false>; \ + template class TensorIter::ctype, true>; + + MEGDNN_FOREACH_DTYPE_NAME(cb) + MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) +#undef cb +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/common/tensor_remap.cpp b/dnn/src/common/tensor_remap.cpp new file mode 100644 index 00000000..903bbd79 --- /dev/null +++ b/dnn/src/common/tensor_remap.cpp @@ -0,0 +1,71 @@ +/** + * \file dnn/src/common/tensor_remap.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void IndexingRemapBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &map, + const TensorLayout &dst) +{ + megdnn_assert_non_overlapping_strong(src); + megdnn_assert_contiguous(map); + megdnn_assert_non_overlapping_strong(dst); + auto errmsg = megdnn_layout_msg(src) + ", " + + megdnn_layout_msg(map) + ", " + + megdnn_layout_msg(dst); + auto errmsg_c = errmsg.c_str(); + MEGDNN_MARK_USED_VAR(errmsg_c); + megdnn_assert(map.ndim == dst.ndim + 1, "%s", errmsg_c); + for (size_t i = 0_z; i < dst.ndim; ++i) { + megdnn_assert(map.shape[i] == dst.shape[i], "%s", errmsg_c); + } + megdnn_assert(map.shape[dst.ndim] == src.ndim, "%s", errmsg_c); + + megdnn_assert(src.dtype == dtype::Float32()); + megdnn_assert(map.dtype == dtype::Int32()); + megdnn_assert(dst.dtype == dtype::Float32()); +} + +void IndexingRemapForward::deduce_layout(const TensorLayout &src, + const TensorLayout &map, + TensorLayout &dst) +{ + dst = map; + dst.dtype = src.dtype; + --dst.ndim; + dst.init_contiguous_stride(); +} + +void IndexingRemapForward::check_exec(const TensorLayout &src, + const TensorLayout &map, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, map, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, map, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void IndexingRemapBackward::check_exec(const TensorLayout &diff, + const TensorLayout &map, + const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_layout_fwd(grad, map, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(diff, map, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/tile_repeat.cpp b/dnn/src/common/tile_repeat.cpp new file mode 100644 index 00000000..7eeafa26 --- /dev/null +++ b/dnn/src/common/tile_repeat.cpp @@ -0,0 +1,191 @@ +/** + * \file dnn/src/common/tile_repeat.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +#include + +namespace megdnn { + +void TileRepeatBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &dst) +{ + auto errmsg = megdnn_layout_msg(src) + ", " + megdnn_layout_msg(dst) + + ", " + "times=" + param().times.to_string(); + auto errmsg_c = errmsg.c_str(); + MEGDNN_MARK_USED_VAR(errmsg_c); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(dst); + auto expected_ndim = param().times.ndim; + megdnn_assert(expected_ndim == src.ndim, "%s", errmsg_c); + megdnn_assert(expected_ndim == dst.ndim, "%s", errmsg_c); + rep(i, expected_ndim) { + megdnn_assert(dst.shape[i] == param().times[i] * src.shape[i], + "%s", errmsg_c); + } + + megdnn_assert(src.dtype == dst.dtype); +} + +void TileRepeatBase::deduce_layout_fwd(const TensorLayout &src, + TensorLayout &dst) +{ + dst.ndim = src.ndim; + rep(i, src.ndim) { + dst.shape[i] = src.shape[i] * param().times[i]; + } + dst.dtype = src.dtype; + dst.init_contiguous_stride(); + check_layout_fwd(src, dst); +} + +size_t TileRepeatBase::get_workspace_in_bytes_fwd(const TensorShape & /* src */, + const TensorShape &dst, + const TensorShape ×, + DType dtype) +{ + size_t nr_workspace = 0; + auto nr_reduces = count_not_ones_in_shape(times); + if (nr_reduces == 0) { + // case 1: no tile/repeat is needed, let alone workspace. + nr_workspace = 0; + } else if (nr_reduces == 1) { + // case 2: only one tile/repeat is needed, so we don't need workspace. + nr_workspace = 0; + } else if (nr_reduces == 2) { + // case 3: two tile/repeats are needed, so we need a single workspace. + nr_workspace = 1; + } else { + // case 4: multiple tile/repeats are needed, so we need two workspace in + // an alternate fashion. + nr_workspace = 2; + } + if (nr_workspace == 0) { + return 0; + } else { + WorkspaceBundle workspaces{ + nullptr, {nr_workspace, dst.total_nr_elems() * dtype.size()}}; + return workspaces.total_size_in_bytes(); + } +} + +void TileBase::simplify_shape(const TensorShape &src, + const TensorShape &dst, + const TensorShape ×, + TensorShape &src2, + TensorShape &dst2, + TensorShape ×2) +{ + size_t n = 0; + for (size_t i = 0; i < src.ndim; ++i) { + if (times.shape[i] == 1 && n > 0) { + src2.shape[n-1] *= src.shape[i]; + dst2.shape[n-1] *= dst.shape[i]; + } else { + src2.shape[n] = src.shape[i]; + dst2.shape[n] = dst.shape[i]; + times2.shape[n] = times.shape[i]; + ++n; + } + } + src2.ndim = dst2.ndim = times2.ndim = n; +} + +size_t TileBase::get_workspace_in_bytes_fwd(const TensorLayout &src_, + const TensorLayout &dst_) +{ + TensorShape src, dst, times; + simplify_shape(src_, dst_, param().times, src, dst, times); + return TileRepeatBase::get_workspace_in_bytes_fwd(src, dst, times, + src_.dtype); +} + +void TileForward::deduce_layout(const TensorLayout &src, + TensorLayout &dst) +{ + deduce_layout_fwd(src, dst); +} + +void TileForward::check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void TileBackward::check_exec(const TensorLayout &diff, const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_layout_fwd(grad, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void RepeatBase::simplify_shape(const TensorShape &src, + const TensorShape & /* dst */, + const 
TensorShape ×, + TensorShape &src2, + TensorShape &dst2, + TensorShape ×2) +{ + auto n = 0u; + size_t i = 0; + while (i < times.ndim) { + size_t j = i; + while (j < times.ndim && times.shape[j] == 1) ++j; + // Here: j is times.ndim, or times.shape[j] != 1 + if (j < times.ndim) ++j; + src2.shape[n] = std::accumulate(src.shape + i, src.shape + j, + 1_z, SafeMultiplies()); + times2.shape[n] = times.shape[j-1]; + dst2.shape[n] = src2.shape[n] * times2.shape[n]; + ++n; + i = j; + } + src2.ndim = dst2.ndim = times2.ndim = n; +} + +size_t RepeatBase::get_workspace_in_bytes_fwd(const TensorLayout &src_, + const TensorLayout &dst_) +{ + TensorShape src, dst, times; + simplify_shape(src_, dst_, param().times, src, dst, times); + return TileRepeatBase::get_workspace_in_bytes_fwd(src, dst, times, + src_.dtype); +} + +void RepeatForward::deduce_layout(const TensorLayout &src, + TensorLayout &dst) +{ + deduce_layout_fwd(src, dst); +} + +void RepeatForward::check_exec(const TensorLayout &src, const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void RepeatBackward::check_exec(const TensorLayout &diff, + const TensorLayout &grad, size_t workspace_in_bytes) +{ + check_layout_fwd(grad, diff); + auto required_workspace_in_bytes = get_workspace_in_bytes(diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/tile_repeat_helper.cpp b/dnn/src/common/tile_repeat_helper.cpp new file mode 100644 index 00000000..62c7d366 --- /dev/null +++ b/dnn/src/common/tile_repeat_helper.cpp @@ -0,0 +1,101 @@ +/** + * \file dnn/src/common/tile_repeat_helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/common/tile_repeat_helper.h" + +#include "src/common/utils.h" +#include + +namespace megdnn { + +// Tile (m, n) to (m, n*times) or Repeat (m, n) to (m*times, n) +template +void tile_or_repeat_single_axis(const T * __restrict src, + T * __restrict dst, + const size_t m, const size_t n, const size_t times) +{ + rep(i, m) { + // copy Ts of length n to dst + std::memcpy(dst, src, sizeof(T) * n); + size_t k = 1u; + while (k*2 <= times) { + std::memcpy(dst + k*n, dst, sizeof(T) * (k*n)); + k *= 2; + } + if (k < times) { + std::memcpy(dst + k*n, dst, sizeof(T) * (times-k) * n); + } + src += n; + dst += n*times; + } +} + +template +void init_tile_repeat_state(const T *src, T *dst, + T *workspace0, T * /* workspace1 */, + T *¤t, T *&next, size_t &state, + size_t nr_reduces) +{ + current = const_cast(src); + if (nr_reduces == 1) { + next = dst; + } else { + next = workspace0; + } + state = 0; +} + +template +void update_tile_repeat_state(const T * /* src */, T *dst, + T *workspace0, T *workspace1, + T *¤t, T *&next, size_t &state, + size_t nr_reduces) +{ + current = next; + if (nr_reduces == 1) { + next = nullptr; + } else if (nr_reduces == 2) { + if (state == 0) { + next = dst; + } else { + next = nullptr; + } + } else { + if (state == 0) { + next = workspace1; + } else if (state + 1 == nr_reduces) { + next = nullptr; + } else if (state + 2 == nr_reduces) { + next = dst; + } else { + megdnn_assert(current == workspace0 || current == workspace1, + "Impossible happened; internal bug."); + next = (current == workspace0 ? workspace1 : workspace0); + } + } + ++state; +} + +#define INST(T) \ +template void tile_or_repeat_single_axis(const T *, T *, \ + const size_t, const size_t, const size_t); \ +template void init_tile_repeat_state(const T *, T *, T *, T *, T *&, T *&, \ + size_t &, size_t); \ +template void update_tile_repeat_state(const T *, T *, T *, T *, T *&, T *&, \ + size_t &, size_t); + +#define INST_DT(d) INST(DTypeTrait::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE(INST_DT) + +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/common/tile_repeat_helper.h b/dnn/src/common/tile_repeat_helper.h new file mode 100644 index 00000000..834c8617 --- /dev/null +++ b/dnn/src/common/tile_repeat_helper.h @@ -0,0 +1,36 @@ +/** + * \file dnn/src/common/tile_repeat_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include + +namespace megdnn { + +// Tile (m, n) to (m, n*times) or Repeat (m, n) to (m*times, n) +template +void tile_or_repeat_single_axis(const T * __restrict src, + T * __restrict dst, + const size_t m, const size_t n, const size_t times); +// forward and backward can share the same init/update functions. 
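The init/update helpers mentioned in the comment above (and declared just below) form a small state machine: data ping-pongs from src through at most two workspaces into dst, one expanded axis per pass, so two scratch buffers suffice no matter how many axes are tiled or repeated. A minimal driver sketch under that reading follows; it is illustrative only, not megdnn code, and `expand_one_axis` is a hypothetical stand-in for a call to tile_or_repeat_single_axis with the proper (m, n, times) for that pass.

template <typename T, typename ExpandFn>
void run_tile_repeat_passes(const T* src, T* dst, T* workspace0, T* workspace1,
                            size_t nr_reduces, ExpandFn expand_one_axis) {
    T *current, *next;
    size_t state;
    init_tile_repeat_state(src, dst, workspace0, workspace1, current, next,
                           state, nr_reduces);
    while (next) {
        // Each pass reads from `current` and writes to `next`; `state` is the
        // number of passes already completed, i.e. the index of this pass.
        expand_one_axis(state, current, next);
        update_tile_repeat_state(src, dst, workspace0, workspace1, current,
                                 next, state, nr_reduces);
    }
    // After the final pass `next` becomes nullptr and the result sits in dst.
}
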
+template +void init_tile_repeat_state(const T *src, T *dst, + T *workspace0, T *workspace1, + T *¤t, T *&next, size_t &state, + size_t nr_reduces); +template +void update_tile_repeat_state(const T *src, T *dst, + T *workspace0, T *workspace1, + T *¤t, T *&next, size_t &state, + size_t nr_reduces); + +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/common/topk.cpp b/dnn/src/common/topk.cpp new file mode 100644 index 00000000..40363822 --- /dev/null +++ b/dnn/src/common/topk.cpp @@ -0,0 +1,68 @@ +/** + * \file dnn/src/common/topk.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/general.h" + +#include "src/common/utils.h" + +#include + +using namespace megdnn; + +void TopK::deduce_layout(int k, const TensorLayout& data, TensorLayout& values, + TensorLayout& indices) { + megdnn_assert(k && data.ndim == 2 && data.stride[1] == 1, + "invalid k=%d data=%s", k, data.to_string().c_str()); + values.dtype = data.dtype; + indices.dtype = dtype::Int32{}; + switch (param().mode) { + case Param::Mode::KTH_ONLY: + values.init_contiguous_stride({data[0]}); + indices.ndim = 0; + break; + case Param::Mode::VALUE_IDX_NOSORT: + case Param::Mode::VALUE_IDX_SORTED: + values.init_contiguous_stride( + {data[0], std::min(std::abs(k), data.shape[1])}); + indices.init_contiguous_stride(values); + break; + default: + megdnn_throw("invalid TopK mode"); + } +} + +void TopK::exec(int k, _megdnn_tensor_in data, _megdnn_tensor_out values, + _megdnn_tensor_out indices, _megdnn_workspace workspace) { + TensorLayout oval, oidx; + deduce_layout(k, data.layout, oval, oidx); + megdnn_assert_eq_layout(oval, values.layout); + int32_t* iptr = nullptr; + if (param().mode == Param::Mode::KTH_ONLY) { + megdnn_assert_eq_shape(indices.layout, TensorShape{}); + } else { + iptr = indices.ptr(); + megdnn_assert_eq_layout(oidx, indices.layout); + } + megdnn_assert(workspace.size >= get_workspace_in_bytes(k, data.layout, + values.layout, + indices.layout)); + if (static_cast(std::abs(k)) > data.layout[1]) { + if (k > 0) { + k = data.layout[1]; + } else { + k = -static_cast(data.layout[1]); + } + } + do_exec(k, data, values, iptr, workspace); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/common/transpose.cpp b/dnn/src/common/transpose.cpp new file mode 100644 index 00000000..dd25782b --- /dev/null +++ b/dnn/src/common/transpose.cpp @@ -0,0 +1,51 @@ +/** + * \file dnn/src/common/transpose.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void TransposeForward::deduce_layout(const TensorLayout &src, TensorLayout &dst) +{ + dst = src; + dst.dtype = src.dtype; + std::swap(dst.shape[0], dst.shape[1]); + dst.init_contiguous_stride(); +} + +void TransposeForward::check_exec(const TensorLayout &src, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + // dtype must collide + megdnn_assert(src.dtype == dst.dtype); + // ndim must be 2 + megdnn_assert(src.ndim == 2); + megdnn_assert(dst.ndim == 2); + // shapes are swapped + megdnn_assert(src.shape[0] == dst.shape[1]); + megdnn_assert(src.shape[1] == dst.shape[0]); + // last dimension stride must be 1 + megdnn_assert(src.stride[1] == 1); + megdnn_assert(dst.stride[1] == 1); + // leading dimension stride must be geq last dimension shape + megdnn_assert(src.stride[0] > 0); + megdnn_assert(dst.stride[0] > 0); + megdnn_assert(static_cast(src.stride[0]) >= src.shape[1]); + megdnn_assert(static_cast(dst.stride[0]) >= dst.shape[1]); + + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/type_cvt.cpp b/dnn/src/common/type_cvt.cpp new file mode 100644 index 00000000..885a81b5 --- /dev/null +++ b/dnn/src/common/type_cvt.cpp @@ -0,0 +1,30 @@ +/** + * \file dnn/src/common/type_cvt.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void TypeCvt::check_exec(const TensorLayout &src, const TensorLayout &dst) { + megdnn_assert_contiguous(dst); + megdnn_assert_eq_shape(src, dst); + auto cat = src.dtype.category(); + megdnn_assert(cat == DTypeCategory::FLOAT || cat == DTypeCategory::INT || + cat == DTypeCategory::QUANTIZED); + cat = dst.dtype.category(); + megdnn_assert(cat == DTypeCategory::FLOAT || cat == DTypeCategory::INT || + cat == DTypeCategory::QUANTIZED); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/unroll_macro.h b/dnn/src/common/unroll_macro.h new file mode 100644 index 00000000..936286fe --- /dev/null +++ b/dnn/src/common/unroll_macro.h @@ -0,0 +1,124 @@ +/** + * \file dnn/src/common/unroll_macro.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#define UNROLL_RAW1(cb, v0, a...) cb(0, ##a) +#define UNROLL_RAW2(cb, v0, a...) cb(0, ##a) cb(1, ##a) +#define UNROLL_RAW3(cb, v0, a...) UNROLL_RAW2(cb, v0, ##a) cb(2, ##a) +#define UNROLL_RAW4(cb, v0, a...) \ + UNROLL_RAW2(cb, v0, ##a) \ + cb(2, ##a) cb(3, ##a) +#define UNROLL_RAW5(cb, v0, a...) \ + UNROLL_RAW4(cb, v0, ##a) \ + cb(4, ##a) +#define UNROLL_RAW6(cb, v0, a...) \ + UNROLL_RAW4(cb, v0, ##a) \ + cb(4, ##a) cb(5, ##a) +#define UNROLL_RAW7(cb, v0, a...) 
\ + UNROLL_RAW4(cb, v0, ##a) \ + cb(4, ##a) cb(5, ##a) cb(6, ##a) +#define UNROLL_RAW8(cb, v0, a...) \ + UNROLL_RAW4(cb, v0, ##a) \ + cb(4, ##a) cb(5, ##a) cb(6, ##a) cb(7, ##a) +#define UNROLL_RAW9(cb, v0, a...) \ + UNROLL_RAW8(cb, v0, ##a) \ + cb(8, ##a) +#define UNROLL_RAW16(cb, v0, a...) \ + UNROLL_RAW8(cb, v0, ##a) \ + cb(8, ##a) cb(9, ##a) cb(10, ##a) cb(11, ##a) cb(12, ##a) cb(13, ##a) \ + cb(14, ##a) cb(15, ##a) +#define UNROLL_RAW24(cb, v0, a...) \ + UNROLL_RAW16(cb, v0, ##a) \ + cb(16, ##a) cb(17, ##a) cb(18, ##a) cb(19, ##a) cb(20, ##a) cb(21, ##a) \ + cb(22, ##a) cb(23, ##a) + +#define UNROLL_CALL0(step, cb, v...) UNROLL_RAW##step(cb, 0, ##v) +#define UNROLL_CALL1(step, cb, v...) UNROLL_CALL0(step, cb, ##v) +#define UNROLL_CALL(step, cb, v...) \ + do { \ + UNROLL_CALL1(step, cb, ##v); \ + } while (0) + +#define UNROLL_CALL_RAW(step, cb, v...) UNROLL_CALL1(step, cb, ##v); +#define UNROLL_CALL_NOWRAPPER(step, cb) UNROLL_CALL_RAW(step, cb) + +#define UNROLL_CALL0(step, cb, v...) UNROLL_RAW##step(cb, 0, ##v) +#define UNROLL_CALL1(step, cb, v...) UNROLL_CALL0(step, cb, ##v) +#define UNROLL_CALL(step, cb, v...) \ + do { \ + UNROLL_CALL1(step, cb, ##v); \ + } while (0) + + +///////////////////// unroll with 2 dimension ////////////////////// +#define UNROLL_RAW_1x1(cb, v0, a...) cb(0, 0, ##a) +#define UNROLL_RAW_2x2(cb, v0, a...) \ + cb(0, 0, ##a) cb(0, 1, ##a) cb(1, 0, ##a) cb(1, 1, ##a) + +#define UNROLL_RAW_3x3(cb, v0, a...) \ + cb(0, 0, ##a) cb(0, 1, ##a) cb(0, 2, ##a) \ + cb(1, 0, ##a) cb(1, 1, ##a) cb(1, 2, ##a) \ + cb(2, 0, ##a) cb(2, 1, ##a) cb(2, 2, ##a) \ + +#define UNROLL_RAW_4x4(cb, v0, a...) \ + cb(0, 0, ##a) cb(0, 1, ##a) cb(0, 2, ##a) cb(0, 3, ##a) \ + cb(1, 0, ##a) cb(1, 1, ##a) cb(1, 2, ##a) cb(1, 3, ##a) \ + cb(2, 0, ##a) cb(2, 1, ##a) cb(2, 2, ##a) cb(2, 3, ##a) \ + cb(3, 0, ##a) cb(3, 1, ##a) cb(3, 2, ##a) cb(3, 3, ##a) + +#define UNROLL_RAW_6x6(cb, v0, a...) \ + cb(0, 0, ##a) cb(0, 1, ##a) cb(0, 2, ##a) cb(0, 3, ##a) \ + cb(0, 4, ##a) cb(0, 5, ##a) \ + cb(1, 0, ##a) cb(1, 1, ##a) cb(1, 2, ##a) cb(1, 3, ##a) \ + cb(1, 4, ##a) cb(1, 5, ##a) \ + cb(2, 0, ##a) cb(2, 1, ##a) cb(2, 2, ##a) cb(2, 3, ##a) \ + cb(2, 4, ##a) cb(2, 5, ##a) \ + cb(3, 0, ##a) cb(3, 1, ##a) cb(3, 2, ##a) cb(3, 3, ##a) \ + cb(3, 4, ##a) cb(3, 5, ##a) \ + cb(4, 0, ##a) cb(4, 1, ##a) cb(4, 2, ##a) cb(4, 3, ##a) \ + cb(4, 4, ##a) cb(4, 5, ##a) \ + cb(5, 0, ##a) cb(5, 1, ##a) cb(5, 2, ##a) cb(5, 3, ##a) \ + cb(5, 4, ##a) cb(5, 5, ##a) \ + +#define UNROLL_RAW_8x8(cb, v0, a...) \ + cb(0, 0, ##a) cb(0, 1, ##a) cb(0, 2, ##a) cb(0, 3, ##a) \ + cb(0, 4, ##a) cb(0, 5, ##a) cb(0, 6, ##a) cb(0, 7, ##a) \ + cb(1, 0, ##a) cb(1, 1, ##a) cb(1, 2, ##a) cb(1, 3, ##a) \ + cb(1, 4, ##a) cb(1, 5, ##a) cb(1, 6, ##a) cb(1, 7, ##a) \ + cb(2, 0, ##a) cb(2, 1, ##a) cb(2, 2, ##a) cb(2, 3, ##a) \ + cb(2, 4, ##a) cb(2, 5, ##a) cb(2, 6, ##a) cb(2, 7, ##a) \ + cb(3, 0, ##a) cb(3, 1, ##a) cb(3, 2, ##a) cb(3, 3, ##a) \ + cb(3, 4, ##a) cb(3, 5, ##a) cb(3, 6, ##a) cb(3, 7, ##a) \ + cb(4, 0, ##a) cb(4, 1, ##a) cb(4, 2, ##a) cb(4, 3, ##a) \ + cb(4, 4, ##a) cb(4, 5, ##a) cb(4, 6, ##a) cb(4, 7, ##a) \ + cb(5, 0, ##a) cb(5, 1, ##a) cb(5, 2, ##a) cb(5, 3, ##a) \ + cb(5, 4, ##a) cb(5, 5, ##a) cb(5, 6, ##a) cb(5, 7, ##a) \ + cb(6, 0, ##a) cb(6, 1, ##a) cb(6, 2, ##a) cb(6, 3, ##a) \ + cb(6, 4, ##a) cb(6, 5, ##a) cb(6, 6, ##a) cb(6, 7, ##a) \ + cb(7, 0, ##a) cb(7, 1, ##a) cb(7, 2, ##a) cb(7, 3, ##a) \ + cb(7, 4, ##a) cb(7, 5, ##a) cb(7, 6, ##a) cb(7, 7, ##a) + +#define UNROLL_CALL0_D2(step, step2, cb, v...) 
\ + UNROLL_RAW_##step##x##step2(cb, 0, ##v) +#define UNROLL_CALL1_D2(step, step2, cb, v...) \ + UNROLL_CALL0_D2(step, step2, cb, ##v) +#define UNROLL_CALL_D2(step, step2, cb, v...) \ + do { \ + UNROLL_CALL1_D2(step, step2, cb, ##v); \ + } while (0) + +#define UNROLL_CALL_RAW_D2(step, step2, cb, v...) \ + UNROLL_CALL1_D2(step, step2, cb, ##v); +#define UNROLL_CALL_NOWRAPPER_D2(step, step2, cb) \ + UNROLL_CALL_RAW_D2(step, step2, cb) + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/utils.cpp b/dnn/src/common/utils.cpp new file mode 100644 index 00000000..371afbb5 --- /dev/null +++ b/dnn/src/common/utils.cpp @@ -0,0 +1,316 @@ +/** + * \file dnn/src/common/utils.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/utils.h" +#include "megdnn/handle.h" + +#include +#include +#include +#include + +using namespace megdnn; + +namespace { +std::string svsprintf(const char* fmt, va_list ap_orig) { + int size = 100; /* Guess we need no more than 100 bytes */ + char* p; + + if ((p = (char*)malloc(size)) == nullptr) + return "svsprintf: malloc failed"; + + for (;;) { + va_list ap; + va_copy(ap, ap_orig); + int n = vsnprintf(p, size, fmt, ap); + va_end(ap); + + if (n < 0) + return "svsprintf: vsnprintf failed"; + + if (n < size) { + std::string rst(p); + free(p); + return rst; + } + + size = n + 1; + + char* np = (char*)realloc(p, size); + if (!np) { + free(p); + return "svsprintf: realloc failed"; + } else + p = np; + } +} +} // anonymous namespace + +std::string megdnn::ssprintf(const char* fmt, ...) { + va_list ap; + va_start(ap, fmt); + auto rst = svsprintf(fmt, ap); + va_end(ap); + return rst; +} + +void megdnn::__assert_fail__(const char* file, int line, const char* func, + const char* expr, const char* msg_fmt, ...) 
{ + std::string msg; + if (msg_fmt) { + va_list ap; + va_start(ap, msg_fmt); + msg = "\nextra message: "; + msg.append(svsprintf(msg_fmt, ap)); + va_end(ap); + } + msg = ssprintf("assertion `%s' failed at %s:%d: %s%s", expr, file, line, + func, msg.c_str()); + megdnn_throw(msg.c_str()); +} + +bool megdnn::get_next_addr(size_t* idx, const size_t* shp, size_t n, + size_t stride) { + auto errmsg = [&]() { + std::string res; + res.append(megdnn_mangle("idx={")); + for (size_t i = 0; i < n; ++i) { + res.append(std::to_string(idx[i])); + if (i + 1 < n) + res.append(megdnn_mangle(",")); + } + res.append(megdnn_mangle("}, shp={")); + for (size_t i = 0; i < n; ++i) { + res.append(std::to_string(shp[i])); + if (i + 1 < n) + res.append(megdnn_mangle(",")); + } + res.append(megdnn_mangle("}, n=")); + res.append(std::to_string(n)); + res.append(megdnn_mangle(", stride=")); + res.append(std::to_string(stride)); + return res; + }; + MEGDNN_MARK_USED_VAR(errmsg); + for (size_t i = 0; i < n; ++i) { + megdnn_assert(idx[i] < shp[i], "%s", errmsg().c_str()); + } + idx[n - 1] += stride; + megdnn_assert(idx[n - 1] <= shp[n - 1], "%s", errmsg().c_str()); + size_t i; + for (i = n; i > 1; --i) + if (idx[i - 1] == shp[i - 1]) { + idx[i - 1] = 0; + ++idx[i - 2]; + } else { + break; + } + if (i == 1 && idx[0] == shp[0]) { + idx[0] = 0; + return false; + } + return true; +} + +int megdnn::get_linear_addr_noncont(size_t* index, const TensorLayout& layout) { + int ans = 0; + rep(i, layout.ndim) { ans += index[i] * layout.stride[i]; } + return ans; +} + +size_t megdnn::get_linear_addr(size_t* index, const size_t* shape, size_t n) { + size_t base = 1; + size_t ans = 0; + for (size_t i = n; i > 0; --i) { + ans += index[i - 1] * base; + base *= shape[i - 1]; + } + return ans; +} + +size_t megdnn::infer_conv_shape(size_t inp, size_t flt, size_t stride, + size_t pad, bool is_floor) { + megdnn_assert(inp + 2 * pad >= flt, "input=%zu padding=%zu filter=%zu", inp, + pad, flt); + if (is_floor) { + return (inp + 2 * pad - flt) / stride + 1; + } + return (inp + 2 * pad - flt + stride - 1) / stride + 1; +} + +void megdnn::infer_conv_shape2d(size_t ih, size_t iw, size_t fh, size_t fw, + size_t sh, size_t sw, size_t ph, size_t pw, + size_t& oh, size_t& ow, bool is_floor) { + oh = infer_conv_shape(ih, fh, sh, ph, is_floor); + ow = infer_conv_shape(iw, fw, sw, pw, is_floor); +} + +WorkspaceBundle::WorkspaceBundle(void* ptr, SmallVector sizes_in_bytes, + size_t align_in_bytes) + : m_ptr(ptr), + m_sizes(std::move(sizes_in_bytes)), + m_align_in_bytes(align_in_bytes) { + m_aligned_sizes.reserve(m_sizes.size()); + for (auto size : m_sizes) { + auto aligned_size = size; + if (size % m_align_in_bytes != 0) { + aligned_size += m_align_in_bytes - size % m_align_in_bytes; + } + m_aligned_sizes.push_back(aligned_size); + } +} + +void* WorkspaceBundle::ptr() const { + return m_ptr; +} + +void* WorkspaceBundle::get(size_t i) const { + auto addr = reinterpret_cast(m_ptr); + if (addr % m_align_in_bytes != 0) + addr += m_align_in_bytes - addr % m_align_in_bytes; + for (size_t j = 0; j < i; ++j) { + addr += m_aligned_sizes[j]; + } + return reinterpret_cast(addr); +} + +size_t WorkspaceBundle::nr_workspace() const { + return m_sizes.size(); +} + +size_t WorkspaceBundle::get_size(size_t i) const { + return m_sizes[i]; +} + +void WorkspaceBundle::set(void* ptr) { + m_ptr = ptr; +} + +size_t WorkspaceBundle::total_size_in_bytes() const { + //! 
return 0 if the WorkspaceBundle is empty + size_t size = + std::accumulate(m_aligned_sizes.begin(), m_aligned_sizes.end(), + static_cast(0)); + return size ? size + m_align_in_bytes : size; +} + +size_t megdnn::count_not_ones_in_shape(const TensorShape& shape) { + size_t res = 0u; + for (size_t i = 0; i < shape.ndim; ++i) + res += (shape[i] != 1u); + return res; +} + +bool megdnn::is_nhwc_contig_wc(const TensorLayout& layout) { + return layout.ndim == 4 && + (layout.stride[3] == 1 || layout.shape[3] == 1) && + (layout.stride[2] == static_cast(layout.shape[3]) || + layout.shape[2] == 1); +} + +megcoreDeviceHandle_t megdnn::get_device_handle(Handle* handle) { + megcoreStatus_t status; + megcoreDeviceHandle_t dev_handle; + megcoreComputingHandle_t comp_handle = handle->megcore_computing_handle(); + status = megcoreGetDeviceHandle(comp_handle, &dev_handle); + megdnn_assert(status == megcoreSuccess); + return dev_handle; +} + +// clang-format off +float megdnn::mul_scale(DType lhs, DType rhs) { +#define cb_binary(dt1, dt2) \ + if ((lhs.enumv() == DTypeTrait::enumv) && \ + (rhs.enumv() == DTypeTrait::enumv)) \ + return lhs.param().scale * rhs.param().scale; + cb_binary(::megdnn::dtype::QuantizedS8, ::megdnn::dtype::QuantizedS16) +#undef cb_binary + + megdnn_assert(lhs.enumv() == rhs.enumv()); +#define cb(dt) \ + if (lhs.enumv() == DTypeTrait
<dt>::enumv) \ + return lhs.param<dt>
().scale * rhs.param<dt>
().scale; + MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) + MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb) +#undef cb + megdnn_assert_internal(0); +} +// clang-format on + +template <> +uint8_t megdnn::convert(dt_quint4 src, uint8_t dst, + size_t offset) { + uint8_t _src = + std::min(src.as_uint8(), DTypeTrait::max()); + if (offset == 0) { + _src &= 0xF; + dst &= 0xF0; + dst |= _src; + } else { + _src <<= 4; + dst &= 0xF; + dst |= _src; + } + return dst; +} + +template <> +dt_quint4 megdnn::convert(uint8_t src, dt_quint4 dst, + size_t offset) { + src >>= (offset << 2); + src &= 0xF; + dst = dt_quint4(src); + return dst; +} + +template <> +int8_t megdnn::convert(dt_qint4 src, int8_t dst, + size_t offset) { + int8_t _src = std::max( + std::min(src.as_int8(), DTypeTrait::max()), + DTypeTrait::min()); + if (offset == 0) { + _src &= 0xF; + dst &= 0xF0; + dst |= _src; + } else { + _src <<= 4; + dst &= 0xF; + dst |= _src; + } + return dst; +} + +template <> +dt_qint4 megdnn::convert(int8_t src, dt_qint4 dst, + size_t offset) { + src <<= (4 - (offset << 2)); + src >>= 4; + dst = dt_qint4(src); + return dst; +} + +/* ======================== CpuNDRange ======================== */ +std::string CpuNDRange::to_string() const { + std::string ret; + for (size_t i = 0; i < m_dimension; i++) { + ret += megdnn::ssprintf(" %zu", m_dim[i]); + } + return ret; +} + +size_t& CpuNDRange::operator[](size_t idx) { + megdnn_assert(idx < m_dimension, "invalid index: %zu expected < %zu", idx, + m_dimension); + return m_dim[idx]; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/utils.cuh b/dnn/src/common/utils.cuh new file mode 100644 index 00000000..d4cee62a --- /dev/null +++ b/dnn/src/common/utils.cuh @@ -0,0 +1,86 @@ +/** + * \file dnn/src/common/utils.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/arch.h" + +//! a comma to be used in macro for template params +#define MEGDNN_COMMA , +#define MEGDNN_MARK_USED_VAR(v) static_cast(v) + +#if MEGDNN_ENABLE_MANGLING +#define megdnn_mangle(x) ("") +#else +#define megdnn_mangle(x) (x) +#endif // MEGDNN_ENABLE_MANGLING + +#define megdnn_throw(msg) ::megdnn::ErrorHandler::on_megdnn_error( \ + megdnn_mangle(msg)) +#define megdnn_throw_if(cond, err_type, msg) do { \ + if (megdnn_unlikely(cond)) { \ + ::megdnn::ErrorHandler::on_##err_type(megdnn_mangle(msg)); \ + } \ +} while(0) + +//! megdnn_assert +#if MEGDNN_ENABLE_MANGLING +#define megdnn_assert(expr, ...) \ + do { \ + if (megdnn_unlikely(!(expr))) { \ + ::megdnn::__assert_fail__(NULL, 0, NULL, NULL, NULL); \ + } \ + } while (0) +#else +#define megdnn_assert(expr, ...) \ + do { \ + if (megdnn_unlikely(!(expr))) { \ + ::megdnn::__assert_fail__(__FILE__, __LINE__, \ + __PRETTY_FUNCTION__, # expr, ## __VA_ARGS__); \ + } \ + } while (0) +#endif // MEGDNN_ENABLE_MANGLING + +#define megdnn_assert_internal(expr) \ + do { \ + megdnn_assert(expr, "Impossible: internal error."); \ + } while (0) + +#define megdnn_ignore(x) (void)(x) + +namespace megdnn { + +void __assert_fail__(const char *file, int line, const char *func, + const char *expr, const char *msg_fmt = nullptr, ...) 
+#if defined(__GNUC__) || defined(__clang__) + __attribute__((format(printf, 5, 6), noreturn)) +#endif + ; + +void __dummy_printf__(const char *msg_fmt, ...) +#ifdef __GNUC__ + __attribute__((format(printf, 1, 2))) +#endif +; + +//! typetrait, just the same as std::is_same in c++11 +template +struct is_same { + static const bool value = false; +}; + +template +struct is_same { + static const bool value = true; +}; + +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/common/utils.h b/dnn/src/common/utils.h new file mode 100644 index 00000000..500fc98a --- /dev/null +++ b/dnn/src/common/utils.h @@ -0,0 +1,533 @@ +/** + * \file dnn/src/common/utils.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/arch.h" +#include "megdnn/basic_types.h" +#include "megdnn/dtype.h" +#include "megdnn/handle.h" +#include "megdnn/thin/small_vector.h" + +#include "src/common/utils.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) +#include +#endif + +#if __cplusplus >= 201703L || __clang_major__ >= 4 + #define MEGDNN_FALLTHRU [[fallthrough]]; +#elif __GNUC__ >= 7 + #define MEGDNN_FALLTHRU __attribute__ ((fallthrough)); +#else + #define MEGDNN_FALLTHRU +#endif + +#define rep(i, n) for (auto i = decltype(n){0}; i < (n); ++i) + +#define megdnn_assert_contiguous(layout) \ + do { \ + megdnn_assert((layout).is_contiguous(), "%s is %s.", #layout, \ + (layout).to_string().c_str()); \ + } while (0) + +#define megdnn_assert_non_overlapping_strong(layout) \ + do { \ + megdnn_assert((layout).is_non_overlapping_strong(), "%s is %s.", \ + #layout, (layout).to_string().c_str()); \ + } while (0) + +#define megdnn_assert_eq_size_t(lhs_, rhs_) \ + do { \ + size_t lhs = lhs_, rhs = rhs_; \ + megdnn_assert(lhs == rhs, "%s is %zu, %s is %zu.", #lhs_, lhs, #rhs_, \ + rhs); \ + } while (0) + +#define megdnn_assert_eq_layout(lhs, rhs) \ + do { \ + megdnn_assert(lhs.eq_layout(rhs), "%s is %s, %s is %s.", #lhs, \ + lhs.to_string().c_str(), #rhs, rhs.to_string().c_str()); \ + } while (0) + +#define megdnn_assert_eq_shape(lhs, rhs) \ + do { \ + megdnn_assert(lhs.eq_shape(rhs), "%s is %s, %s is %s.", #lhs, \ + lhs.to_string().c_str(), #rhs, rhs.to_string().c_str()); \ + } while (0) + +#define megdnn_assert_eq_dtype(lhs, rhs) \ + do { \ + megdnn_assert(lhs.dtype == rhs.dtype, "%s is %s, %s is %s.", #lhs, \ + lhs.dtype.name(), #rhs, rhs.dtype.name()); \ + } while (0) + +#define megdnn_layout_msg(layout) \ + std::string(megdnn_mangle(#layout "=" + (layout).to_string())) + +#define MEGDNN_LOCK_GUARD(var) \ + std::lock_guard> _lock_guard_##var { var } + +namespace megdnn { + +/* ================ logging ================ */ +#define megdnn_log_debug(fmt...) \ + _megdnn_do_log(::megdnn::LogLevel::DEBUG, __FILE__, __func__, __LINE__, fmt) +#define megdnn_log(fmt...) \ + _megdnn_do_log(::megdnn::LogLevel::INFO, __FILE__, __func__, __LINE__, fmt) +#define megdnn_log_warn(fmt...) \ + _megdnn_do_log(::megdnn::LogLevel::WARN, __FILE__, __func__, __LINE__, fmt) +#define megdnn_log_error(fmt...) 
\ + _megdnn_do_log(::megdnn::LogLevel::ERROR, __FILE__, __func__, __LINE__, fmt) + +#if MEGDNN_ENABLE_LOGGING +void __log__(LogLevel level, const char* file, const char* func, int line, + const char* fmt, ...) __attribute__((format(printf, 5, 6))); + +#define _megdnn_do_log ::megdnn::__log__ +#else +#define _megdnn_do_log(...) \ + do { \ + } while (0) +#endif // megdnn_ENABLE_LOGGING + +/* helper functions */ +/** + * \brief Get the next `stride' index lexicographically. + * + * stride must be divisible by the last dimension shape. + * \return true if index is updated successfully, false otherwise (index is + * already the last one, next index does not exist) + */ +bool get_next_addr(size_t* index, const size_t* shape, size_t n, + size_t stride = 1); +size_t get_linear_addr(size_t* index, const size_t* shape, size_t n); +int get_linear_addr_noncont(size_t* index, const TensorLayout& layout); +size_t infer_conv_shape(size_t inp, size_t flt, size_t stride, size_t pad, + bool is_floor = true); +void infer_conv_shape2d(size_t ih, size_t iw, size_t fh, size_t fw, size_t sh, + size_t sw, size_t ph, size_t pw, size_t& oh, size_t& ow, + bool is_floor = true); +template +SmallVector apply_vector(Func&& func, const SmallVector& vec); +std::string ssprintf(const char* fmt, ...) + __attribute__((format(printf, 1, 2))); + +/*! + * \brief transpose (m*n) matrix to (n*m) matrix + * + * -1 in \p lds and \p ldd means default leading dimensions (= nr. columns) + * + * Note that transpose and transpose_knc2nsck are implemented in x86/utils.cpp + * and arm_common/util.cpp, subject to the target platform. + * + */ +template +void transpose(const dtype* src, dtype* dst, size_t m, size_t n, + ptrdiff_t lds = -1, ptrdiff_t ldd = -1); + +/*! + * transpose src with contiguous layout (k, n, c) into dst with shape + * (n, c, k), with given stride (\p n_stride) on first dimension + */ +template +void transpose_knc2nsck(const dtype* src, dtype* dst, size_t k, size_t n, + size_t c, size_t n_stride); + +/*! + * \brief divide get result ceiled to int; both dividend and divisor shoud be + * non-negative + */ +template +int_t div_ceil(int_t dividend, int_t divisor); + +/*! + * \brief divide get result floored to int; both dividend and divisor shoud be + * non-negative + */ +template +int_t div_floor(int_t dividend, int_t divisor); + +/*! + * \brief get geometric mean of a and b + */ +inline dt_float32 geometric_mean(dt_float32 a, dt_float32 b) { + return std::sqrt(a * b); +} + +/*! + * \brief calculate x*x + */ +template +num_t sqr(num_t x) { + return x * x; +} + +template +std::unique_ptr make_unique(Args&&... args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + +/** + * \brief Aligned workspace bundle. + * + * Each individual workspace is aligned to align_in_bytes. + */ +class WorkspaceBundle { +public: + WorkspaceBundle(void* ptr, SmallVector sizes_in_bytes, + size_t align_in_bytes = 512); + /** + * \returns raw workspace ptr. + * + * Note that ptr() is different than get(0), in that + * the result of ptr() is possibly not aligned. + */ + void* ptr() const; + /** + * \returns the i-th workspace ptr (aligned) + */ + void* get(size_t i) const; + /** + * \returns total size taking into account paddings to solve alignment + * issue. 
+ */ + size_t total_size_in_bytes() const; + size_t get_size(size_t i) const; + size_t nr_workspace() const; + void set(void* ptr); + + Workspace get_workspace(size_t i) const { + return {static_cast(get(i)), get_size(i)}; + } + +private: + void* m_ptr; + SmallVector m_sizes; + SmallVector m_aligned_sizes; + size_t m_align_in_bytes; +}; + +MEGDNN_CONSTEXPR std::size_t operator"" _z(unsigned long long n) { + return n; +} + +template +std::string vec2str(Vec&& vec) { + std::string res; + res.append("{"); + for (size_t i = 0; i < vec.size(); ++i) { + res.append(std::to_string(vec[i])); + if (i + 1 < vec.size()) + res.append(","); + } + res.append("}"); + return res; +} + +// facilitate tile and repeat +size_t count_not_ones_in_shape(const TensorShape& shape); + +/*! + * \brief whether a TensorLayout is of NHWC format and contiguous on the W and + * C dimensions. + * + * if true, it implies that a TensorND with given layout is convertible to + * a Mat for the use of cv algorithms. + */ +bool is_nhwc_contig_wc(const TensorLayout& layout); + +static inline void copy_plane_in_bytes(void* dst, const void* src, + size_t height, size_t width, + size_t stride_dst, size_t stride_src) { + for (size_t h = 0; h < height; ++h) { + std::memcpy(static_cast(dst) + h * stride_dst, + static_cast(src) + h * stride_src, + width); + } +} + +megcoreDeviceHandle_t get_device_handle(Handle* handle); + +static inline void incr_voidp(void*& ptr, ptrdiff_t delta) { + ptr = reinterpret_cast(reinterpret_cast(ptr) + delta); +} + +/*! + * \brief align *val* to be multiples of *align* + * \param align required alignment, which must be power of 2 + */ +template +static inline T get_aligned_power2(T val, T align) { + auto d = val & (align - 1); + val += (align - d) & (align - 1); + return val; +} + +template +inline T saturate(S x, S lower, S upper) { + //! in(nan) -> out(lower) : + //! match the meaning with fmax(in dtype.h) when dealing with nan + S val = x > upper ? upper : (x >= lower ? x : lower); + return static_cast(val); +} + +/*! + * \brief divide get result ceiled to int; both dividend and divisor shoud be + * non-negative + */ +template +int_t div_ceil(int_t dividend, int_t divisor) { + static_assert(std::is_integral::value, "must be integers"); + megdnn_assert_internal(dividend >= 0); + megdnn_assert_internal(divisor > 0); + return (dividend + divisor - 1) / divisor; +} + +/*! + * \brief divide get result floored to int; both dividend and divisor shoud be + * non-negative + */ +template +int_t div_floor(int_t dividend, int_t divisor) { + static_assert(std::is_integral::value, "must be integers"); + megdnn_assert_internal(dividend >= 0); + megdnn_assert_internal(divisor > 0); + return dividend / divisor; +} + +/*! 
+ * \brief round result to multiply of divisor; both dividend and divisor shoud + * be non-negative + */ +template +int_t round_up(int_t dividend, int_t divisor) { + static_assert(std::is_integral::value, "must be integers"); + megdnn_assert_internal(dividend >= 0); + megdnn_assert_internal(divisor > 0); + return ((dividend + divisor - 1) / divisor) * divisor; +} + +template +SmallVector apply_vector(Func&& func, const SmallVector& vec) { + SmallVector res(vec.size()); + std::transform(vec.begin(), vec.end(), res.begin(), func); + return res; +} + +template +struct SafeMultiplies; + +template +struct _SafeMultipliesImplUnsigned : public std::binary_function { + static MEGDNN_CONSTEXPR size_t nbits = sizeof(T) * 8; + + static size_t clz(unsigned x) { + size_t n; +#if defined(_MSC_VER) + DWORD leading_zero; + _BitScanReverse(&leading_zero, x); + n = 31 - leading_zero; +#else + n = __builtin_clz(x); +#endif + return x ? n : nbits; + } + + static size_t clz(unsigned long x) { + size_t n; +#if defined(_MSC_VER) + DWORD leading_zero; + _BitScanReverse(&leading_zero, x); + n = 31 - leading_zero; +#else + n = __builtin_clzl(x); +#endif + return x ? n : nbits; + } + + static size_t clz(unsigned long long x) { + size_t n; +#if defined(_MSC_VER) + DWORD leading_zero; + _BitScanReverse64(&leading_zero, x); + n = 63 - leading_zero; +#else + n = __builtin_clzll(x); +#endif + return x ? n : nbits; + } + + T operator()(const T& x, const T& y) const { + int overflow = clz(x) + clz(y) + 2 <= nbits; + T t = x * (y >> 1); // clz(x)+clz(y/2) >= nbits, t must not overflow + overflow |= t >> (nbits - 1); + t <<= 1; + auto yodd = y & 1; + t += yodd ? x : 0; + overflow |= yodd & (t < x); + + megdnn_assert(!overflow, "multiply overflow: %s %s", + std::to_string(x).c_str(), std::to_string(y).c_str()); + return t; + } + + template + U operator()(const U&, const V&) const { + static_assert( + // can not be true + std::is_same::value && std::is_same::value, + "implicit conversion disallowed in SafeMultiplies"); + megdnn_trap(); + } +}; + +template <> +struct SafeMultiplies : public _SafeMultipliesImplUnsigned {}; + +template +bool vec_contains(const std::vector& vec, const T& elem) { + return std::find(vec.begin(), vec.end(), elem) != vec.end(); +} + +template +bool vec_contains(const SmallVector& vec, const T& elem) { + return std::find(vec.begin(), vec.end(), elem) != vec.end(); +} + +float mul_scale(DType lhs, DType rhs); + +template +dtype convert(stype src, dtype dst, size_t offset); + +template <> +uint8_t convert(dt_quint4 src, uint8_t dst, size_t offset); + +template <> +dt_quint4 convert(uint8_t src, dt_quint4 dst, size_t offset); + +template <> +int8_t convert(dt_qint4 src, int8_t dst, size_t offset); + +template <> +dt_qint4 convert(int8_t src, dt_qint4 dst, size_t offset); + +/** + * \brief N-dimensional index space + */ +class CpuNDRange { + static MEGDNN_CONSTEXPR size_t MAX_NDIM = MEGDNN_MAX_NDIM; + +private: + size_t m_dim[MAX_NDIM]; + size_t m_dimension; + +public: + //! \brief Constructs seven-dimensional range. + CpuNDRange(size_t size0, size_t size1, size_t size2, size_t size3, + size_t size4, size_t size5, size_t size6) + : m_dimension(7) { + m_dim[0] = size0; + m_dim[1] = size1; + m_dim[2] = size2; + m_dim[3] = size3; + m_dim[4] = size4; + m_dim[5] = size5; + m_dim[6] = size6; + } + //! \brief Constructs range has zero dimensions. + CpuNDRange() : CpuNDRange(1, 1, 1, 1, 1, 1, 1) { m_dimension = 0; } + + //! \brief Constructs one-dimensional range. 
+ CpuNDRange(size_t size0) : CpuNDRange(size0, 1, 1, 1, 1, 1, 1) { + m_dimension = 1; + } + + //! \brief Constructs two-dimensional range. + CpuNDRange(size_t size0, size_t size1) + : CpuNDRange(size0, size1, 1, 1, 1, 1, 1) { + m_dimension = 2; + } + + //! \brief Constructs three-dimensional range. + CpuNDRange(size_t size0, size_t size1, size_t size2) + : CpuNDRange(size0, size1, size2, 1, 1, 1, 1) { + m_dimension = 3; + } + + //! \brief Constructs four-dimensional range. + CpuNDRange(size_t size0, size_t size1, size_t size2, size_t size3) + : CpuNDRange(size0, size1, size2, size3, 1, 1, 1) { + m_dimension = 4; + } + + //! \brief Constructs five-dimensional range. + CpuNDRange(size_t size0, size_t size1, size_t size2, size_t size3, + size_t size4) + : CpuNDRange(size0, size1, size2, size3, size4, 1, 1) { + m_dimension = 5; + } + + //! \brief Constructs six-dimensional range. + CpuNDRange(size_t size0, size_t size1, size_t size2, size_t size3, + size_t size4, size_t size5) + : CpuNDRange(size0, size1, size2, size3, size4, size5, 1) { + m_dimension = 6; + } + + //! \brief Constructs every dim from global + CpuNDRange(const CpuNDRange& dims, size_t global) { + m_dimension = dims.dimension(); + for (int i = m_dimension - 1; i >= 0; i--) { + m_dim[i] = global % dims[i]; + global /= dims[i]; + } + } + + //! \brief Queries the number of dimensions in the range. + size_t dimension() const { return m_dimension; } + + //! \brief Returns the size of the object in bytes based on the + // runtime number of dimensions + size_t size() const { return m_dimension * sizeof(size_t); } + + size_t* get() { return m_dimension ? m_dim : nullptr; } + + size_t& operator[](size_t idx); + size_t& operator[](size_t idx) const { + return const_cast(this)->operator[](idx); + }; + + const size_t* get() const { return const_cast(this)->get(); } + + size_t total_size() const { + size_t ret = 1; + for (size_t i = 0; i < m_dimension; i++) { + ret *= m_dim[i]; + } + return ret; + } + + //! \brief get the dims string + std::string to_string() const; +}; + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/version.cpp b/dnn/src/common/version.cpp new file mode 100644 index 00000000..5baea187 --- /dev/null +++ b/dnn/src/common/version.cpp @@ -0,0 +1,23 @@ +/** + * \file dnn/src/common/version.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/version.h" +#include "src/common/version_symbol.h" + +using namespace megdnn; + +Version megdnn::get_version() { + return {MEGDNN_MAJOR, MEGDNN_MINOR, MEGDNN_PATCH}; +} + +MEGDNN_VERSION_SYMBOL3(MEGDNN, MEGDNN_MAJOR, MEGDNN_MINOR, MEGDNN_PATCH); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/version_symbol.h b/dnn/src/common/version_symbol.h new file mode 100644 index 00000000..2d6577b8 --- /dev/null +++ b/dnn/src/common/version_symbol.h @@ -0,0 +1,31 @@ +/** + * \file dnn/src/common/version_symbol.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#define MEGDNN_VERSION_SYMBOL_(name, ver) \ + int MEGDNN_VSYM_##name##_##ver __attribute__((visibility("default"))) + +/*! + * This macro should be placed in a .cpp file. A symbol would be inserted in the + * produced binary with the name MEGDNN_VERSION_`name`_`ver` + */ +#define MEGDNN_VERSION_SYMBOL(name, ver) MEGDNN_VERSION_SYMBOL_(name, ver) + +//! helper macro +#define MEGDNN_VERSION_SYMBOL3_(name, ver0, ver1, ver2) \ + MEGDNN_VERSION_SYMBOL_(name, ver0##_##ver1##_##ver2) + +//! concat three symbols (usually used for version major, minor and patch) +#define MEGDNN_VERSION_SYMBOL3(name, ver0, ver1, ver2) \ + MEGDNN_VERSION_SYMBOL3_(name, ver0, ver1, ver2) + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/warp_affine.cpp b/dnn/src/common/warp_affine.cpp new file mode 100644 index 00000000..475bd874 --- /dev/null +++ b/dnn/src/common/warp_affine.cpp @@ -0,0 +1,179 @@ +/** + * \file dnn/src/common/warp_affine.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void WarpAffineBase::check_layout_fwd(const TensorLayout& src, + const TensorLayout& mat, + const TensorLayout& dst) { + megdnn_assert_contiguous(mat); + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(mat) + ", " + + megdnn_layout_msg(dst); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert(mat.ndim == 3_z, "%s", errmsg().c_str()); + megdnn_assert(src.shape[0] == mat.shape[0], "%s", errmsg().c_str()); + megdnn_assert(src.shape[0] == dst.shape[0], "%s", errmsg().c_str()); + megdnn_assert(mat.shape[1] == 2_z, "%s", errmsg().c_str()); + megdnn_assert(mat.shape[2] == 3_z, "%s", errmsg().c_str()); + megdnn_assert(dst.dtype == src.dtype); + + if (param().format == Param::Format::NCHW) { + megdnn_assert(src.ndim == 4_z, "%s", errmsg().c_str()); + megdnn_assert(dst.ndim == 4_z, "%s", errmsg().c_str()); + megdnn_assert(src.dtype.enumv() == DTypeEnum::Float32 || + MEGDNN_FLOAT16_SELECT( + src.dtype.enumv() == DTypeEnum::Float16, + false) || + src.dtype.enumv() == DTypeEnum::Int8 || + src.dtype.enumv() == DTypeEnum::Uint8 || + (src.dtype.enumv() == DTypeEnum::QuantizedS8 || + src.dtype.enumv() == DTypeEnum::Quantized8Asymm), + "WarpAffine NCHW input dtype should be " + "Float32/Int8/Uint8/QInt8/QUint8" MEGDNN_FLOAT16_SELECT( + "/Float16", "") "."); + megdnn_assert( + (src.dtype.category() == DTypeCategory::FLOAT && + (src.dtype == mat.dtype || + mat.dtype.enumv() == DTypeEnum::Float32)) || + ((src.dtype.category() == DTypeCategory::INT || + src.dtype.category() == DTypeCategory::QUANTIZED) && + mat.dtype.enumv() == DTypeEnum::Float32), + "The input to WarpAffine is in NCHW format, in this " + "case, if the input dtype is floating point, the " + "transformation matrix should have same dtype as the " + "input, otherwise, it should be in Float32, %s given.", + mat.dtype.name()); + + megdnn_assert(src.shape[1] == dst.shape[1], "%s", errmsg().c_str()); + 
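        // Editorial sketch, not part of the original commit: the (batch, 2, 3)
        // `mat` layout asserted above carries one 2x3 affine matrix per batch
        // image; for every destination pixel (dx, dy) the operator samples the
        // source at the mapped coordinate. The helper name `apply_affine`
        // below is illustrative only.
        struct AffineCoord {
            float x, y;
        };
        auto apply_affine = [](const float m[2][3], float dx, float dy) {
            // row 0 yields the source x coordinate, row 1 the source y
            return AffineCoord{m[0][0] * dx + m[0][1] * dy + m[0][2],
                               m[1][0] * dx + m[1][1] * dy + m[1][2]};
        };
        MEGDNN_MARK_USED_VAR(apply_affine);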
megdnn_assert(param().imode == + param::WarpPerspective::InterpolationMode::LINEAR); + megdnn_assert(param().border_mode != + param::WarpPerspective::BorderMode::TRANSPARENT); + megdnn_assert(param().border_mode != + param::WarpPerspective::BorderMode::ISOLATED); + + } else if (param().format == Param::Format::NHWC) { + megdnn_assert(src.ndim == 4_z, "%s", errmsg().c_str()); + megdnn_assert(dst.ndim == 4_z, "%s", errmsg().c_str()); + megdnn_assert(src.shape[3] == dst.shape[3], "%s", errmsg().c_str()); + megdnn_assert(param().imode != + param::WarpPerspective::InterpolationMode::AREA); + } else { + megdnn_assert(src.shape[2] == dst.shape[2], "%s", errmsg().c_str()); + megdnn_assert(src.ndim == 5_z, "%s", errmsg().c_str()); + megdnn_assert(dst.ndim == 5_z, "%s", errmsg().c_str()); + megdnn_assert(param().format == Param::Format::NHWCD4); + megdnn_assert(param().imode == + param::WarpPerspective::InterpolationMode::LINEAR); + megdnn_assert(param().border_mode != + param::WarpPerspective::BorderMode::TRANSPARENT); + megdnn_assert(param().border_mode != + param::WarpPerspective::BorderMode::ISOLATED); + } +} + +void WarpAffine::check_exec(const TensorLayout& src, const TensorLayout& mat, + const TensorLayout& dst, + size_t workspace_in_bytes) { + check_layout_fwd(src, mat, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, mat, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +std::string WarpAffineBase::param_msg() const { + std::string res; + res.append(megdnn_mangle("imode=")); + switch (param().imode) { + case InterpolationMode::NEAREST: + res.append(megdnn_mangle("NEAREST")); + break; + case InterpolationMode::LINEAR: + res.append(megdnn_mangle("LINEAR")); + break; + case InterpolationMode::AREA: + res.append(megdnn_mangle("AREA")); + break; + case InterpolationMode::CUBIC: + res.append(megdnn_mangle("CUBIC")); + break; + case InterpolationMode::LANCZOS4: + res.append(megdnn_mangle("LANCZOS4")); + break; + } + res.append(megdnn_mangle("bmode=")); + switch (param().border_mode) { + case BorderMode::WRAP: + res.append(megdnn_mangle("WRAP")); + break; + case BorderMode::CONSTANT: + res.append(megdnn_mangle("CONSTANT")); + break; + case BorderMode::REFLECT: + res.append(megdnn_mangle("REFLECT")); + break; + case BorderMode::REFLECT_101: + res.append(megdnn_mangle("REFLECT_101")); + break; + case BorderMode::REPLICATE: + res.append(megdnn_mangle("REPLICATE")); + break; + case BorderMode::TRANSPARENT: + res.append(megdnn_mangle("TRANSPARENT")); + break; + case BorderMode::ISOLATED: + res.append(megdnn_mangle("ISOLATED")); + break; + } + if (param().border_mode == BorderMode::CONSTANT) { + res.append(", " + std::to_string(param().border_val)); + } + return res; +} + +int WarpAffineBase::get_real_coord(int p, int len) { + auto bmode = param().border_mode; + if ((unsigned)p < (unsigned)len) + ; + else if (bmode == BorderMode::REPLICATE) + p = p < 0 ? 
0 : len - 1; + else if (bmode == BorderMode::REFLECT || bmode == BorderMode::REFLECT_101) { + int delta = (bmode == BorderMode::REFLECT_101); + if (len == 1) + return 0; + do { + if (p < 0) + p = -p - 1 + delta; + else + p = len - 1 - (p - len) - delta; + } while ((unsigned)p >= (unsigned)len); + } else if (bmode == BorderMode::WRAP) { + if (p < 0) + p -= ((p - len + 1) / len) * len; + /* + if( p >= len ) + p %= len; + */ + while (p >= len) { + p -= len; + } + } else if (bmode == BorderMode::CONSTANT) + p = -1; + return p; +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/warp_common.cpp b/dnn/src/common/warp_common.cpp new file mode 100644 index 00000000..93ad1110 --- /dev/null +++ b/dnn/src/common/warp_common.cpp @@ -0,0 +1,37 @@ +/** + * \file dnn/src/common/warp_common.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/warp_common.h" + +using namespace megdnn; + +bool warp::is_cv_available(const TensorLayout& src, const TensorLayout& /*mat*/, + const TensorLayout& /*dst*/, + param::WarpAffine::InterpolationMode imode, + param::WarpAffine::Format format) { + return format == param::WarpAffine::Format::NHWC && + (src[3] == 1 || src[3] == 2 || src[3] == 3) && + (src.dtype == dtype::Float32() || src.dtype == dtype::Uint8()) && + (imode == param::WarpAffine::InterpolationMode::NEAREST || + imode == param::WarpAffine::InterpolationMode::LINEAR || + imode == param::WarpAffine::InterpolationMode::CUBIC || + imode == param::WarpAffine::InterpolationMode::LANCZOS4); +} + +bool warp::is_dnn_available(const TensorLayout& /*src*/, + const TensorLayout& /*mat*/, + const TensorLayout& /*dst*/, + param::WarpAffine::InterpolationMode imode, + param::WarpAffine::Format /*format*/) { + return imode == param::WarpAffine::InterpolationMode::LINEAR; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/warp_common.h b/dnn/src/common/warp_common.h new file mode 100644 index 00000000..efa719f6 --- /dev/null +++ b/dnn/src/common/warp_common.h @@ -0,0 +1,958 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + * + * --------------------------------------------------------------------------- + * \file dnn/src/common/warp_common.h + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * + * --------------------------------------------------------------------------- + */ +#pragma once +#include "megdnn/dtype.h" +#include "src/common/cv/common.h" +#include "src/common/cv/helper.h" +#include "src/common/cv/interp_helper.h" +#include "src/common/rounding_converter.cuh" +#include "src/common/utils.h" + +#include "include/megdnn/oprs.h" +#include "midout.h" + +#if MEGDNN_X86 +#include +#endif + +MIDOUT_DECL(megdnn_warp) +MIDOUT_DECL(remapBilinear_bmode) +MIDOUT_DECL(remapBilinear_ch) + +namespace megdnn { +namespace warp { + +bool is_cv_available(const TensorLayout& src, const TensorLayout& mat, + const TensorLayout& dst, + param::WarpAffine::InterpolationMode imode, + param::WarpAffine::Format format); + +bool is_dnn_available(const TensorLayout&, const TensorLayout&, + const TensorLayout&, + param::WarpAffine::InterpolationMode imode, + param::WarpAffine::Format format); + +using namespace megcv; +using IMode = InterpolationMode; +using BMode = BorderMode; +using InterpTable = InterpolationTable<>; +constexpr int INTER_REMAP_COEF_BITS = InterpTable::INTER_REMAP_COEF_BITS; +constexpr int INTER_BITS = InterpTable::INTER_BITS; +constexpr int INTER_TAB_SIZE = InterpTable::INTER_TAB_SIZE; +constexpr int INTER_TAB_SIZE2 = InterpTable::INTER_TAB_SIZE2; +constexpr int INTER_REMAP_COEF_SCALE = InterpTable::INTER_REMAP_COEF_SCALE; + +template +struct RemapVec { + int operator()(const Mat&, void*, const short*, const ushort*, + const void*, int) const { + return 0; + } +}; + +#if MEGDNN_X86 + +template +struct RemapVec { + int operator()(const Mat8u& _src, void* _dst, const short* XY, + const ushort* FXY, const void* _wtab, int width) const { + int x = 0, sstep = (int)_src.step(); + + if ((CH != 1 && CH != 3) || sstep > 0x8000) + return 0; + + const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1); + const short* wtab = CH == 1 ? 
(const short*)_wtab + : InterpTable::get_linear_ic4_table(); + uchar* D = (uchar*)_dst; + __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE / 2); + __m128i xy2ofs = _mm_set1_epi32(CH + (sstep << 16)); + __m128i z = _mm_setzero_si128(); + alignas(16) int iofs0[4]; + alignas(16) int iofs1[4]; + + if (CH == 1) { + for (; x <= width - 8; x += 8) { + __m128i xy0 = _mm_loadu_si128((const __m128i*)(XY + x * 2)); + __m128i xy1 = _mm_loadu_si128((const __m128i*)(XY + x * 2 + 8)); + __m128i v0, v1, v2, v3, a0, a1, b0, b1; + unsigned i0, i1; + + xy0 = _mm_madd_epi16(xy0, xy2ofs); + xy1 = _mm_madd_epi16(xy1, xy2ofs); + _mm_store_si128((__m128i*)iofs0, xy0); + _mm_store_si128((__m128i*)iofs1, xy1); + + i0 = *(ushort*)(S0 + iofs0[0]) + + (*(ushort*)(S0 + iofs0[1]) << 16); + i1 = *(ushort*)(S0 + iofs0[2]) + + (*(ushort*)(S0 + iofs0[3]) << 16); + v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), + _mm_cvtsi32_si128(i1)); + i0 = *(ushort*)(S1 + iofs0[0]) + + (*(ushort*)(S1 + iofs0[1]) << 16); + i1 = *(ushort*)(S1 + iofs0[2]) + + (*(ushort*)(S1 + iofs0[3]) << 16); + v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), + _mm_cvtsi32_si128(i1)); + v0 = _mm_unpacklo_epi8(v0, z); + v1 = _mm_unpacklo_epi8(v1, z); + + a0 = _mm_unpacklo_epi32( + _mm_loadl_epi64((__m128i*)(wtab + FXY[x] * 4)), + _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 1] * 4))); + a1 = _mm_unpacklo_epi32( + _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 2] * 4)), + _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 3] * 4))); + b0 = _mm_unpacklo_epi64(a0, a1); + b1 = _mm_unpackhi_epi64(a0, a1); + v0 = _mm_madd_epi16(v0, b0); + v1 = _mm_madd_epi16(v1, b1); + v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta); + + i0 = *(ushort*)(S0 + iofs1[0]) + + (*(ushort*)(S0 + iofs1[1]) << 16); + i1 = *(ushort*)(S0 + iofs1[2]) + + (*(ushort*)(S0 + iofs1[3]) << 16); + v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), + _mm_cvtsi32_si128(i1)); + i0 = *(ushort*)(S1 + iofs1[0]) + + (*(ushort*)(S1 + iofs1[1]) << 16); + i1 = *(ushort*)(S1 + iofs1[2]) + + (*(ushort*)(S1 + iofs1[3]) << 16); + v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), + _mm_cvtsi32_si128(i1)); + v2 = _mm_unpacklo_epi8(v2, z); + v3 = _mm_unpacklo_epi8(v3, z); + + a0 = _mm_unpacklo_epi32( + _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 4] * 4)), + _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 5] * 4))); + a1 = _mm_unpacklo_epi32( + _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 6] * 4)), + _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 7] * 4))); + b0 = _mm_unpacklo_epi64(a0, a1); + b1 = _mm_unpackhi_epi64(a0, a1); + v2 = _mm_madd_epi16(v2, b0); + v3 = _mm_madd_epi16(v3, b1); + v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta); + + v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS); + v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS); + v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z); + _mm_storel_epi64((__m128i*)(D + x), v0); + } + } else if (CH == 3) { + for (; x <= width - 5; x += 4, D += 12) { + __m128i xy0 = _mm_loadu_si128((const __m128i*)(XY + x * 2)); + __m128i u0, v0, u1, v1; + + xy0 = _mm_madd_epi16(xy0, xy2ofs); + _mm_store_si128((__m128i*)iofs0, xy0); + const __m128i *w0, *w1; + w0 = (const __m128i*)(wtab + FXY[x] * 16); + w1 = (const __m128i*)(wtab + FXY[x + 1] * 16); + + u0 = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), + _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3))); + v0 = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), + _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3))); + u1 = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), + _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3))); + v1 = 
_mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), + _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3))); + u0 = _mm_unpacklo_epi8(u0, z); + v0 = _mm_unpacklo_epi8(v0, z); + u1 = _mm_unpacklo_epi8(u1, z); + v1 = _mm_unpacklo_epi8(v1, z); + u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), + _mm_madd_epi16(v0, w0[1])); + u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), + _mm_madd_epi16(v1, w1[1])); + u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), + INTER_REMAP_COEF_BITS); + u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), + INTER_REMAP_COEF_BITS); + u0 = _mm_slli_si128(u0, 4); + u0 = _mm_packs_epi32(u0, u1); + u0 = _mm_packus_epi16(u0, u0); + _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0, 1)); + + w0 = (const __m128i*)(wtab + FXY[x + 2] * 16); + w1 = (const __m128i*)(wtab + FXY[x + 3] * 16); + + u0 = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), + _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3))); + v0 = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), + _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3))); + u1 = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), + _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3))); + v1 = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), + _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3))); + u0 = _mm_unpacklo_epi8(u0, z); + v0 = _mm_unpacklo_epi8(v0, z); + u1 = _mm_unpacklo_epi8(u1, z); + v1 = _mm_unpacklo_epi8(v1, z); + u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), + _mm_madd_epi16(v0, w0[1])); + u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), + _mm_madd_epi16(v1, w1[1])); + u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), + INTER_REMAP_COEF_BITS); + u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), + INTER_REMAP_COEF_BITS); + u0 = _mm_slli_si128(u0, 4); + u0 = _mm_packs_epi32(u0, u1); + u0 = _mm_packus_epi16(u0, u0); + _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0, 1)); + } + } + + return x; + } +}; +#endif + +template +using RemapNNFunc = void (*)(const Mat& _src, Mat& _dst, + const Mat& _xy, const T* bvalue); +template +using RemapFunc = void (*)(const Mat& _src, Mat& _dst, + const Mat& _xy, const Mat& _fxy, + const void* _wtab, const T* bvalue); + +template +static void remapNearest(const Mat& _src, Mat& _dst, + const Mat& _xy, const T* bvalue) { + const T* S0 = _src.ptr(); + size_t sstep = _src.step(); + int dx, dy; + int width1 = _src.width(), height1 = _src.height(); + int swidth = _src.width(), sheight = _src.height(); + int dwidth = _dst.width(), dheight = _dst.height(); + if (_dst.is_continuous() && _xy.is_continuous()) { + dwidth *= dheight; + dheight = 1; + } + for (dy = 0; dy < dheight; dy++) { + T* D = _dst.ptr(dy); + const short* XY = _xy.ptr(dy); + if (CH == 1) { + for (dx = 0; dx < dwidth; dx++) { + int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; + if ((unsigned)sx < (unsigned)width1 && + (unsigned)sy < (unsigned)height1) { + D[dx] = S0[sy * sstep + sx]; + } else { + if (bmode == BMode::BORDER_REPLICATE) { + sx = saturate(sx, 0, swidth); + sy = saturate(sy, 0, sheight); + D[dx] = S0[sy * sstep + sx]; + } else if (bmode == BMode::BORDER_CONSTANT) + D[dx] = bvalue[0]; + else if (bmode != BMode::BORDER_TRANSPARENT) { + sx = border_interpolate(sx, swidth); + sy = border_interpolate(sy, sheight); + D[dx] = S0[sy * sstep + sx]; + } + } + } + } else { + for (dx = 0; dx < dwidth; dx++, D += CH) { + int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; + const T* S; + if ((unsigned)sx < (unsigned)width1 && + (unsigned)sy < (unsigned)height1) { + S = S0 + sy * sstep + sx * CH; + for (size_t i = 0; i < CH; i++) { + D[i] = 
S[i]; + } + } else if (bmode != BMode::BORDER_TRANSPARENT) { + if (bmode == BMode::BORDER_REPLICATE) { + sx = saturate(sx, 0, swidth); + sy = saturate(sy, 0, sheight); + S = S0 + sy * sstep + sx * CH; + } else if (bmode == BMode::BORDER_CONSTANT) + S = bvalue; + else { + sx = border_interpolate(sx, swidth); + sy = border_interpolate(sy, sheight); + S = S0 + sy * sstep + sx * CH; + } + for (size_t i = 0; i < CH; i++) { + D[i] = S[i]; + } + } + } + } + } +} + +template +static void remapBicubic(const Mat& _src, Mat& _dst, + const Mat& _xy, const Mat& _fxy, + const void* _wtab, const T* bvalue) { + typedef typename CastOp::type1 WT; + const AT* wtab = (const AT*)_wtab; + const T* S0 = _src.ptr(); + size_t sstep = _src.step(); + int dx, dy; + CastOp castOp; + int swidth = _src.width(), sheight = _src.height(); + int dwidth = _dst.width(), dheight = _dst.height(); + unsigned width1 = std::max(swidth - 3, 0), + height1 = std::max(sheight - 3, 0); + if (_dst.is_continuous() && _xy.is_continuous() && _fxy.is_continuous()) { + dwidth *= dheight; + dheight = 1; + } + for (dy = 0; dy < dheight; dy++) { + T* D = _dst.ptr(dy); + const short* XY = _xy.ptr(dy); + const ushort* FXY = _fxy.ptr(dy); + for (dx = 0; dx < dwidth; dx++, D += CH) { + int sx = XY[dx * 2] - 1, sy = XY[dx * 2 + 1] - 1; + const AT* w = wtab + FXY[dx] * 16; + size_t i, k; + if ((unsigned)sx < width1 && (unsigned)sy < height1) { + const T* S = S0 + sy * sstep + sx * CH; + for (k = 0; k < CH; k++) { + WT sum = S[0] * w[0] + S[CH] * w[1] + S[CH * 2] * w[2] + + S[CH * 3] * w[3]; + S += sstep; + sum += S[0] * w[4] + S[CH] * w[5] + S[CH * 2] * w[6] + + S[CH * 3] * w[7]; + S += sstep; + sum += S[0] * w[8] + S[CH] * w[9] + S[CH * 2] * w[10] + + S[CH * 3] * w[11]; + S += sstep; + sum += S[0] * w[12] + S[CH] * w[13] + S[CH * 2] * w[14] + + S[CH * 3] * w[15]; + S += 1 - sstep * 3; + D[k] = castOp(sum); + } + } else { + int x[4], y[4]; + if (bmode == BMode::BORDER_TRANSPARENT && + ((unsigned)(sx + 1) >= (unsigned)swidth || + (unsigned)(sy + 1) >= (unsigned)sheight)) + continue; + if (bmode == BMode::BORDER_CONSTANT && + (sx >= swidth || sx + 4 <= 0 || sy >= sheight || + sy + 4 <= 0)) { + for (size_t i = 0; i < CH; i++) { + D[i] = bvalue[i]; + } + continue; + } + for (i = 0; i < 4; i++) { + x[i] = border_interpolate(sx + i, swidth) * CH; + y[i] = border_interpolate(sy + i, sheight); + } + for (k = 0; k < CH; k++, S0++, w -= 16) { + WT cv = bvalue[k], sum = cv * ONE; + for (i = 0; i < 4; i++, w += 4) { + int yi = y[i]; + const T* S = S0 + yi * sstep; + if (yi < 0) + continue; + if (x[0] >= 0) + sum += (S[x[0]] - cv) * w[0]; + if (x[1] >= 0) + sum += (S[x[1]] - cv) * w[1]; + if (x[2] >= 0) + sum += (S[x[2]] - cv) * w[2]; + if (x[3] >= 0) + sum += (S[x[3]] - cv) * w[3]; + } + D[k] = castOp(sum); + } + S0 -= CH; + } + } + } +} + +template +static void remapBilinear(const Mat& _src, Mat& _dst, + const Mat& _xy, const Mat& _fxy, + const void* _wtab, const T* bvalue) { + MIDOUT_BEGIN(remapBilinear_bmode, midout_iv(bmode)) { + typedef typename CastOp::type1 WT; + const AT* wtab = (const AT*)_wtab; + const T* S0 = _src.ptr(); + size_t sstep = _src.step(); + int dx, dy; + CastOp castOp; + VecOp vecOp; + int swidth = _src.width(), sheight = _src.height(); + int dwidth = _dst.width(), dheight = _dst.height(); + unsigned width1 = std::max(swidth - 1, 0), + height1 = std::max(sheight - 1, 0); + for (dy = 0; dy < dheight; dy++) { + T* D = _dst.ptr(dy); + const short* XY = _xy.ptr(dy); + const ushort* FXY = _fxy.ptr(dy); + int X0 = 0; + bool prevInlier = false; 
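            // Editorial sketch, not part of the original commit: the loop
            // below walks one output row and splits it into maximal runs of
            // "inlier" pixels (whose 2x2 bilinear footprint lies fully inside
            // the source, fast path) and "outlier" pixels (border-handling
            // path). A minimal standalone version of that run-splitting idea,
            // with the hypothetical callbacks `is_inlier` and `handle_run`:
            auto for_each_run = [](int width, bool (*is_inlier)(int),
                                   void (*handle_run)(int, int, bool)) {
                int run_start = 0;
                bool run_inlier = false;
                for (int i = 0; i <= width; ++i) {
                    // at i == width the last run is forced to flush
                    bool cur = i < width ? is_inlier(i) : !run_inlier;
                    if (cur == run_inlier)
                        continue;
                    handle_run(run_start, i, run_inlier);  // run is [run_start, i)
                    run_start = i;
                    run_inlier = cur;
                }
            };
            MEGDNN_MARK_USED_VAR(for_each_run);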
+ + for (dx = 0; dx <= dwidth; dx++) { + bool curInlier = + dx < dwidth ? (unsigned)XY[dx * 2] < width1 && + (unsigned)XY[dx * 2 + 1] < height1 + : !prevInlier; + if (curInlier == prevInlier) + continue; + + int X1 = dx; + dx = X0; + X0 = X1; + prevInlier = curInlier; + + if (!curInlier) { + int len = vecOp(_src, D, XY + dx * 2, FXY + dx, wtab, X1 - dx); + D += len * CH; + dx += len; + + if (CH == 1) { + MIDOUT_BEGIN(remapBilinear_bmode, 0, 1) { + for (; dx < X1; dx++, D++) { + int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; + const AT* w = wtab + FXY[dx] * 4; + const T* S = S0 + sy * sstep + sx; + *D = castOp(WT(S[0] * w[0] + S[1] * w[1] + + S[sstep] * w[2] + S[sstep + 1] * w[3])); + } + } + MIDOUT_END(); + } else if (CH == 2) { + MIDOUT_BEGIN(remapBilinear_bmode, 0, 2) { + for (; dx < X1; dx++, D += 2) { + int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; + const AT* w = wtab + FXY[dx] * 4; + const T* S = S0 + sy * sstep + sx * 2; + WT t0 = S[0] * w[0] + S[2] * w[1] + S[sstep] * w[2] + + S[sstep + 2] * w[3]; + WT t1 = S[1] * w[0] + S[3] * w[1] + + S[sstep + 1] * w[2] + S[sstep + 3] * w[3]; + D[0] = castOp(t0); + D[1] = castOp(t1); + } + } + MIDOUT_END(); + } else if (CH == 3) + MIDOUT_BEGIN(remapBilinear_bmode, 0, 3) { + for (; dx < X1; dx++, D += 3) { + int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; + const AT* w = wtab + FXY[dx] * 4; + const T* S = S0 + sy * sstep + sx * 3; + WT t0 = S[0] * w[0] + S[3] * w[1] + S[sstep] * w[2] + + S[sstep + 3] * w[3]; + WT t1 = S[1] * w[0] + S[4] * w[1] + + S[sstep + 1] * w[2] + S[sstep + 4] * w[3]; + WT t2 = S[2] * w[0] + S[5] * w[1] + + S[sstep + 2] * w[2] + S[sstep + 5] * w[3]; + D[0] = castOp(t0); + D[1] = castOp(t1); + D[2] = castOp(t2); + } + } + MIDOUT_END(); + else + megdnn_throw("nr. of channels must be 1/2/3."); + + } else { + if (bmode == BMode::BORDER_TRANSPARENT && CH != 3) { + megdnn_throw( + "unsupported Linear InterpolationMode" + " with BORDER_TRANSPARENT and channel size 1"); + continue; + } + if (CH == 1) { + MIDOUT_BEGIN(remapBilinear_bmode, 1, 1) { + for (; dx < X1; dx++, D++) { + int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; + if (bmode == BMode::BORDER_CONSTANT && + (sx >= swidth || sx + 1 < 0 || sy >= sheight || + sy + 1 < 0)) { + D[0] = bvalue[0]; + } else { + int sx0, sx1, sy0, sy1; + T v0, v1, v2, v3; + const AT* w = wtab + FXY[dx] * 4; + if (bmode == BMode::BORDER_REPLICATE) { + sx0 = saturate(sx, 0, swidth); + sx1 = saturate(sx + 1, 0, swidth); + sy0 = saturate(sy, 0, sheight); + sy1 = saturate(sy + 1, 0, sheight); + v0 = S0[sy0 * sstep + sx0]; + v1 = S0[sy0 * sstep + sx1]; + v2 = S0[sy1 * sstep + sx0]; + v3 = S0[sy1 * sstep + sx1]; + } else { + sx0 = border_interpolate(sx, swidth); + sx1 = border_interpolate(sx + 1, swidth); + sy0 = border_interpolate(sy, sheight); + sy1 = border_interpolate(sy + 1, + sheight); + v0 = sx0 >= 0 && sy0 >= 0 + ? S0[sy0 * sstep + sx0] + : bvalue[0]; + v1 = sx1 >= 0 && sy0 >= 0 + ? S0[sy0 * sstep + sx1] + : bvalue[0]; + v2 = sx0 >= 0 && sy1 >= 0 + ? S0[sy1 * sstep + sx0] + : bvalue[0]; + v3 = sx1 >= 0 && sy1 >= 0 + ? 
S0[sy1 * sstep + sx1] + : bvalue[0]; + } + D[0] = castOp(WT(v0 * w[0] + v1 * w[1] + v2 * w[2] + + v3 * w[3])); + } + } + } + MIDOUT_END(); + } else { + for (; dx < X1; dx++, D += CH) { + int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; + if (bmode == BMode::BORDER_CONSTANT && + (sx >= swidth || sx + 1 < 0 || sy >= sheight || + sy + 1 < 0)) { + for (size_t k = 0; k < CH; k++) + D[k] = bvalue[k]; + } else { + int sx0, sx1, sy0, sy1; + const T *v0, *v1, *v2, *v3; + const AT* w = wtab + FXY[dx] * 4; + if (bmode == BMode::BORDER_REPLICATE) { + sx0 = saturate(sx, 0, swidth); + sx1 = saturate(sx + 1, 0, swidth); + sy0 = saturate(sy, 0, sheight); + sy1 = saturate(sy + 1, 0, sheight); + v0 = S0 + sy0 * sstep + sx0 * CH; + v1 = S0 + sy0 * sstep + sx1 * CH; + v2 = S0 + sy1 * sstep + sx0 * CH; + v3 = S0 + sy1 * sstep + sx1 * CH; + } else if (bmode == BMode::BORDER_TRANSPARENT && + ((unsigned)sx >= + (unsigned)(swidth - 1) || + (unsigned)sy >= + (unsigned)(sheight - 1))) + continue; + else { + sx0 = border_interpolate(sx, swidth); + sx1 = border_interpolate(sx + 1, swidth); + sy0 = border_interpolate(sy, sheight); + sy1 = border_interpolate(sy + 1, + sheight); + v0 = sx0 >= 0 && sy0 >= 0 + ? S0 + sy0 * sstep + sx0 * CH + : &bvalue[0]; + v1 = sx1 >= 0 && sy0 >= 0 + ? S0 + sy0 * sstep + sx1 * CH + : &bvalue[0]; + v2 = sx0 >= 0 && sy1 >= 0 + ? S0 + sy1 * sstep + sx0 * CH + : &bvalue[0]; + v3 = sx1 >= 0 && sy1 >= 0 + ? S0 + sy1 * sstep + sx1 * CH + : &bvalue[0]; + } + + for (size_t k = 0; k < CH; k++) { + D[k] = castOp(WT(v0[k] * w[0] + v1[k] * w[1] + + v2[k] * w[2] + v3[k] * w[3])); + } + } + } + } + } + } + } + } + MIDOUT_END(); +} + +template +static void remapLanczos4(const Mat& _src, Mat& _dst, + const Mat& _xy, const Mat& _fxy, + const void* _wtab, const T* bvalue) { + typedef typename CastOp::type1 WT; + const AT* wtab = (const AT*)_wtab; + const T* S0 = _src.ptr(); + size_t sstep = _src.step(); + int dx, dy; + CastOp castOp; + int swidth = _src.width(), sheight = _src.height(); + int dwidth = _dst.width(), dheight = _dst.height(); + unsigned width1 = std::max(swidth - 7, 0), + height1 = std::max(sheight - 7, 0); + if (_dst.is_continuous() && _xy.is_continuous() && _fxy.is_continuous()) { + dwidth *= dheight; + dheight = 1; + } + for (dy = 0; dy < dheight; dy++) { + T* D = _dst.ptr(dy); + const short* XY = _xy.ptr(dy); + const ushort* FXY = _fxy.ptr(dy); + for (dx = 0; dx < dwidth; dx++, D += CH) { + int sx = XY[dx * 2] - 3, sy = XY[dx * 2 + 1] - 3; + const AT* w = wtab + FXY[dx] * 64; + const T* S = S0 + sy * sstep + sx * CH; + size_t i, k; + if ((unsigned)sx < width1 && (unsigned)sy < height1) { + for (k = 0; k < CH; k++) { + WT sum = 0; + for (int r = 0; r < 8; r++, S += sstep, w += 8) + sum += S[0] * w[0] + S[CH] * w[1] + S[CH * 2] * w[2] + + S[CH * 3] * w[3] + S[CH * 4] * w[4] + + S[CH * 5] * w[5] + S[CH * 6] * w[6] + + S[CH * 7] * w[7]; + w -= 64; + S -= sstep * 8 - 1; + D[k] = castOp(sum); + } + } else { + int x[8], y[8]; + if (bmode == BMode::BORDER_TRANSPARENT && + ((unsigned)(sx + 3) >= (unsigned)swidth || + (unsigned)(sy + 3) >= (unsigned)sheight)) + continue; + if (bmode == BMode::BORDER_CONSTANT && + (sx >= swidth || sx + 8 <= 0 || sy >= sheight || + sy + 8 <= 0)) { + for (size_t i = 0; i < CH; i++) { + D[i] = bvalue[i]; + } + continue; + } + for (i = 0; i < 8; i++) { + x[i] = border_interpolate(sx + i, swidth) * CH; + y[i] = border_interpolate(sy + i, sheight); + } + for (k = 0; k < CH; k++, S0++, w -= 64) { + WT cv = bvalue[k], sum = cv * ONE; + for (i = 0; i < 8; i++, w += 8) { + int yi = 
y[i]; + const T* S1 = S0 + yi * sstep; + if (yi < 0) + continue; + if (x[0] >= 0) + sum += (S1[x[0]] - cv) * w[0]; + if (x[1] >= 0) + sum += (S1[x[1]] - cv) * w[1]; + if (x[2] >= 0) + sum += (S1[x[2]] - cv) * w[2]; + if (x[3] >= 0) + sum += (S1[x[3]] - cv) * w[3]; + if (x[4] >= 0) + sum += (S1[x[4]] - cv) * w[4]; + if (x[5] >= 0) + sum += (S1[x[5]] - cv) * w[5]; + if (x[6] >= 0) + sum += (S1[x[6]] - cv) * w[6]; + if (x[7] >= 0) + sum += (S1[x[7]] - cv) * w[7]; + } + D[k] = castOp(sum); + } + S0 -= CH; + } + } + } +} + +template +struct RemapFuncHolder; + +template +struct RemapFuncHolder { + static void get_funcs(RemapNNFunc& nnfunc, + RemapFunc& ifunc) { + switch (imode) { + case IMode::INTER_NEAREST: + MIDOUT_BEGIN(megdnn_warp, midout_iv(0)) { + nnfunc = remapNearest; + } + MIDOUT_END(); + break; + case IMode::INTER_LINEAR: + MIDOUT_BEGIN(megdnn_warp, midout_iv(1)) { + ifunc = remapBilinear< + FixedPtCast, + RemapVec, short, uchar, bmode, CH>; + } + MIDOUT_END(); + break; + case IMode::INTER_CUBIC: + MIDOUT_BEGIN(megdnn_warp, midout_iv(2)) { + ifunc = remapBicubic< + FixedPtCast, + short, INTER_REMAP_COEF_SCALE, uchar, bmode, CH>; + } + MIDOUT_END(); + break; + case IMode::INTER_LANCZOS4: + MIDOUT_BEGIN(megdnn_warp, midout_iv(3)) { + ifunc = remapLanczos4< + FixedPtCast, + short, INTER_REMAP_COEF_SCALE, uchar, bmode, CH>; + } + MIDOUT_END(); + break; + default: + megdnn_throw(("unrecognized interpolation mode")); + } + } +}; + +template +struct RemapFuncHolder { + static void get_funcs(RemapNNFunc& nnfunc, + RemapFunc& ifunc) { + switch (imode) { + case IMode::INTER_NEAREST: + MIDOUT_BEGIN(megdnn_warp, midout_iv(0)) { + nnfunc = remapNearest; + } + MIDOUT_END(); + break; + case IMode::INTER_LINEAR: + MIDOUT_BEGIN(megdnn_warp, midout_iv(1)) { + ifunc = remapBilinear, RemapVec, float, + float, bmode, CH>; + } + MIDOUT_END(); + break; + case IMode::INTER_CUBIC: + MIDOUT_BEGIN(megdnn_warp, midout_iv(2)) { + ifunc = remapBicubic, float, 1, float, + bmode, CH>; + } + MIDOUT_END(); + break; + case IMode::INTER_LANCZOS4: + MIDOUT_BEGIN(megdnn_warp, midout_iv(3)) { + ifunc = remapLanczos4, float, 1, float, + bmode, CH>; + } + MIDOUT_END(); + break; + default: + megdnn_throw(("unrecognized interpolation mode")); + } + } +}; + +template +#if MEGDNN_X86 +MEGDNN_ATTRIBUTE_TARGET("sse3") +#endif +void remap(const Mat& src, Mat& dst, Mat& map1, Mat& map2, + const T* bvalue) { + RemapNNFunc nnfunc = 0; + RemapFunc ifunc = 0; + bool fixpt = std::is_same::value; + const void* ctab = 0; + RemapFuncHolder::get_funcs(nnfunc, ifunc); + if (imode != IMode::INTER_NEAREST) { + ctab = InterpTable::get_table(imode, fixpt); + } + { + // remap invoker + int x, y, x1, y1; + const int buf_size = 1 << 14; + int dstcols = dst.cols(), dstrows = dst.rows(); + int brows0 = std::min(128, dstrows); + int bcols0 = std::min(buf_size / brows0, dstcols); + brows0 = std::min(buf_size / bcols0, dstrows); + Mat _bufxy(brows0, bcols0, 2); + Mat _bufa(brows0, bcols0, 1); + for (y = 0; y < dstrows; y += brows0) + for (x = 0; x < dstcols; x += bcols0) { + int brows = std::min(brows0, dstrows - y); + int bcols = std::min(bcols0, dstcols - x); + Mat dpart(dst, y, brows, x, bcols); + Mat bufxy(_bufxy, 0, brows, 0, bcols); + if (nnfunc) { + bufxy = Mat(map1, y, brows, x, bcols); + nnfunc(src, dpart, bufxy, bvalue); + continue; + } + Mat bufa(_bufa, 0, brows, 0, bcols); + for (y1 = 0; y1 < brows; ++y1) { + ushort* A = bufa.ptr(y1); + bufxy = Mat(map1, y, brows, x, bcols); + const ushort* sA = map2.ptr(y + y1) + x; + x1 = 0; +#if MEGDNN_X86 
+ __m128i sA_data, d_data; + __m128i v_INTER_TAB_SIZE2 = + _mm_set1_epi16(INTER_TAB_SIZE2 - 1); + + for (; x1 <= bcols - 8; x1 += 8) { + __m128i const* src = (__m128i const*)(sA + x1); + __m128i* dst = (__m128i*)(A + x1); + + sA_data = _mm_loadu_si128(src); + d_data = _mm_and_si128(sA_data, v_INTER_TAB_SIZE2); + _mm_storeu_si128(dst, d_data); + } +#endif + for (; x1 < bcols; ++x1) + A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2 - 1)); + } + ifunc(src, dpart, bufxy, bufa, ctab, bvalue); + } + } +} + +#define DISPATCH_CHANNEL(_imode, _bmode, _ch, _cb) \ + switch (_ch) { \ + case 1: { \ + _cb(_imode, _bmode, 1); \ + break; \ + } \ + case 2: { \ + _cb(_imode, _bmode, 2); \ + break; \ + } \ + case 3: { \ + _cb(_imode, _bmode, 3); \ + break; \ + } \ + default: { \ + megdnn_assert(0, "unsupport channels: %zu, only supprt 1/2/3", \ + _ch); \ + } \ + } + +#define DISPATCH_BMODE(_imode, _bmode, _ch, _cb) \ + switch (_bmode) { \ + case BorderMode::REPLICATE: { \ + DISPATCH_CHANNEL(_imode, BorderMode::REPLICATE, _ch, _cb); \ + break; \ + } \ + case BorderMode::REFLECT: { \ + DISPATCH_CHANNEL(_imode, BorderMode::REFLECT, _ch, _cb); \ + break; \ + } \ + case BorderMode::REFLECT_101: { \ + DISPATCH_CHANNEL(_imode, BorderMode::REFLECT_101, _ch, _cb); \ + break; \ + } \ + case BorderMode::WRAP: { \ + DISPATCH_CHANNEL(_imode, BorderMode::WRAP, _ch, _cb); \ + break; \ + } \ + case BorderMode::CONSTANT: { \ + DISPATCH_CHANNEL(_imode, BorderMode::CONSTANT, _ch, _cb); \ + break; \ + } \ + default: { megdnn_assert(0, "unsupport border mode for cv"); } \ + } + +#define DISPATCH_IMODE(_imode, _bmode, _ch, _cb) \ + switch (_imode) { \ + case InterpolationMode::NEAREST: { \ + DISPATCH_BMODE(InterpolationMode::NEAREST, _bmode, _ch, _cb); \ + break; \ + } \ + case InterpolationMode::LINEAR: { \ + DISPATCH_BMODE(InterpolationMode::LINEAR, _bmode, _ch, _cb); \ + break; \ + } \ + case InterpolationMode::AREA: { \ + DISPATCH_BMODE(InterpolationMode::AREA, _bmode, _ch, _cb); \ + break; \ + } \ + case InterpolationMode::CUBIC: { \ + DISPATCH_BMODE(InterpolationMode::CUBIC, _bmode, _ch, _cb); \ + break; \ + } \ + case InterpolationMode::LANCZOS4: { \ + DISPATCH_BMODE(InterpolationMode::LANCZOS4, _bmode, _ch, _cb); \ + break; \ + } \ + default: { megdnn_assert(0, "unsupport interpolation mode for cv"); } \ + } + +} // namespace warp +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/warp_perspective.cpp b/dnn/src/common/warp_perspective.cpp new file mode 100644 index 00000000..83a4624f --- /dev/null +++ b/dnn/src/common/warp_perspective.cpp @@ -0,0 +1,285 @@ +/** + * \file dnn/src/common/warp_perspective.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void WarpPerspectiveBase::check_layout_fwd(const TensorLayout &src, + const TensorLayout &mat, + const TensorLayout &mat_idx, + const TensorLayout &dst) +{ + megdnn_assert_contiguous(mat); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(dst); + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + + megdnn_layout_msg(mat) + ", " + + megdnn_layout_msg(mat_idx) + ", " + + megdnn_layout_msg(dst) + ", " + + param_msg(); + }; + MEGDNN_MARK_USED_VAR(errmsg); + if (param().format == param::WarpPerspective::Format::NHWCD4 || + param().format == param::WarpPerspective::Format::NCHW4) { + megdnn_assert(src.ndim == 5_z, "%s", errmsg().c_str()); + megdnn_assert(dst.ndim == 5_z, "%s", errmsg().c_str()); + + } else { + megdnn_assert(param().format == param::WarpPerspective::Format::NHWC || + param().format == param::WarpPerspective::Format::NCHW); + megdnn_assert(src.ndim == 4_z, "%s", errmsg().c_str()); + megdnn_assert(dst.ndim == 4_z, "%s", errmsg().c_str()); + } + megdnn_assert(mat.ndim == 3_z, "%s", errmsg().c_str()); + megdnn_assert(dst.shape[0] == mat.shape[0], "%s", errmsg().c_str()); + if (mat_idx.ndim) { + megdnn_assert(mat_idx.dtype == dtype::Int32() && mat_idx.ndim == 1, + "%s", errmsg().c_str()); + megdnn_assert(mat.shape[0] == mat_idx.shape[0], "%s", errmsg().c_str()); + megdnn_assert_contiguous(mat_idx); + } else { + megdnn_assert(src.shape[0] == dst.shape[0], "%s", errmsg().c_str()); + } + megdnn_assert(mat.shape[1] == 3_z, "%s", errmsg().c_str()); + megdnn_assert(mat.shape[2] == 3_z, "%s", errmsg().c_str()); + + if (param().format == param::WarpPerspective::Format::NCHW) { + megdnn_assert(src.dtype.enumv() == DTypeEnum::Float32 || + MEGDNN_FLOAT16_SELECT( + src.dtype.enumv() == DTypeEnum::Float16, + false) || + src.dtype.enumv() == DTypeEnum::Int8 || + src.dtype.enumv() == DTypeEnum::Uint8 || + (src.dtype.enumv() == DTypeEnum::QuantizedS8 || + src.dtype.enumv() == DTypeEnum::Quantized8Asymm), + "WarpPerspective NCHW input dtype should be " + "Float32/Int8/Uint8/QInt8/QUint8" MEGDNN_FLOAT16_SELECT( + "/Float16", "") "."); + megdnn_assert( + (src.dtype.category() == DTypeCategory::FLOAT && + (src.dtype == mat.dtype || + mat.dtype.enumv() == DTypeEnum::Float32)) || + ((src.dtype.category() == DTypeCategory::INT || + src.dtype.category() == DTypeCategory::QUANTIZED) && + mat.dtype.enumv() == DTypeEnum::Float32), + "The input to WarpPerspective is in NCHW format, in this " + "case, if the input dtype is floating point, the " + "transformation matrix should have same dtype as the " + "input, otherwise, it should be in Float32, %s given.", + mat.dtype.name()); + + megdnn_assert(dst.dtype == src.dtype); + megdnn_assert(src.shape[1] == dst.shape[1], "%s", errmsg().c_str()); + + megdnn_assert(param().imode == + param::WarpPerspective::InterpolationMode::LINEAR); + megdnn_assert(param().bmode != + param::WarpPerspective::BorderMode::TRANSPARENT); + megdnn_assert(param().bmode != + param::WarpPerspective::BorderMode::ISOLATED); + } else if (param().format == param::WarpPerspective::Format::NHWC) { + megdnn_assert(src.shape[3] == dst.shape[3], "%s", errmsg().c_str()); + } else if (param().format == param::WarpPerspective::Format::NCHW4) { + megdnn_assert(dst.dtype == src.dtype); + megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8, + "src expected QuantizedS8, but got %s", src.dtype.name()); + megdnn_assert(mat.dtype == dtype::Float32(), + "matrix dtype expected float, got %s", 
mat.dtype.name()); + megdnn_assert(src.shape[4] == 4 && dst.shape[4] == 4); + megdnn_assert(src.shape[1] == dst.shape[1], "%s", errmsg().c_str()); + + megdnn_assert(param().imode == + param::WarpPerspective::InterpolationMode::LINEAR); + megdnn_assert(param().bmode != + param::WarpPerspective::BorderMode::TRANSPARENT); + megdnn_assert(param().bmode != + param::WarpPerspective::BorderMode::ISOLATED); + } else { + megdnn_assert(param().format == param::WarpPerspective::Format::NHWCD4); + megdnn_assert(src.dtype == dtype::Float32() || + MEGDNN_FLOAT16_SELECT( + src.dtype == dtype::Float16(), false) || + src.dtype.enumv() == DTypeEnum::QuantizedS8 || + src.dtype.enumv() == DTypeEnum::Quantized8Asymm, + "WarpPerspective NHWCD4 input dtype should be " + "Float32" MEGDNN_FLOAT16_SELECT( + "/Float16", "") ",QunatizedS8, Quantized8Asymm."); + megdnn_assert( + (src.dtype == mat.dtype || mat.dtype == dtype::Float32()), + "The input to WarpPerspective is in NHWCD4 format, in this " + "case, if the input dtype is floating point, the " + "transformation matrix should have same dtype as the " + "input, %s given.", + mat.dtype.name()); + megdnn_assert(dst.dtype == src.dtype); + //! number of channels is same + megdnn_assert(src.shape[2] == dst.shape[2], "%s", errmsg().c_str()); + megdnn_assert(param().imode == + param::WarpPerspective::InterpolationMode::LINEAR); + megdnn_assert(param().bmode != + param::WarpPerspective::BorderMode::TRANSPARENT); + megdnn_assert(param().bmode != + param::WarpPerspective::BorderMode::ISOLATED); + } + megdnn_assert(src.format == dst.format); +} + +std::string WarpPerspectiveBase::param_msg() const +{ + std::string res; + res.append(megdnn_mangle("imode=")); + switch (param().imode) { + case InterpolationMode::NEAREST: + res.append(megdnn_mangle("NEAREST")); + break; + case InterpolationMode::LINEAR: + res.append(megdnn_mangle("LINEAR")); + break; + case InterpolationMode::AREA: + res.append(megdnn_mangle("AREA")); + break; + case InterpolationMode::CUBIC: + res.append(megdnn_mangle("CUBIC")); + break; + case InterpolationMode::LANCZOS4: + res.append(megdnn_mangle("LANCZOS4")); + break; + } + res.append(megdnn_mangle("bmode=")); + switch (param().bmode) { + case BorderMode::WRAP: + res.append(megdnn_mangle("WRAP")); + break; + case BorderMode::CONSTANT: + res.append(megdnn_mangle("CONSTANT")); + break; + case BorderMode::REFLECT: + res.append(megdnn_mangle("REFLECT")); + break; + case BorderMode::REFLECT_101: + res.append(megdnn_mangle("REFLECT_101")); + break; + case BorderMode::REPLICATE: + res.append(megdnn_mangle("REPLICATE")); + break; + case BorderMode::TRANSPARENT: + res.append(megdnn_mangle("TRANSPARENT")); + break; + case BorderMode::ISOLATED: + res.append(megdnn_mangle("ISOLATED")); + break; + } + if (param().bmode == BorderMode::CONSTANT) { + res.append(", " + std::to_string(param().border_val)); + } + return res; +} + +int WarpPerspectiveBase::get_real_coord(int p, int len) +{ + auto bmode = param().bmode; + if( (unsigned)p < (unsigned)len ) + ; + else if( bmode == BorderMode::REPLICATE ) + p = p < 0 ? 
0 : len - 1; + else if( bmode == BorderMode::REFLECT || bmode == BorderMode::REFLECT_101 ) + { + int delta = (bmode == BorderMode::REFLECT_101); + if( len == 1 ) + return 0; + do + { + if( p < 0 ) + p = -p - 1 + delta; + else + p = len - 1 - (p - len) - delta; + } + while( (unsigned)p >= (unsigned)len ); + } + else if( bmode == BorderMode::WRAP ) + { + if( p < 0 ) + p -= ((p-len+1)/len)*len; + /* + if( p >= len ) + p %= len; + */ + while (p >= len) { + p -= len; + } + } + else if( bmode == BorderMode::CONSTANT ) + p = -1; + return p; +} + +void WarpPerspectiveForward::check_exec(const TensorLayout &src, + const TensorLayout &mat, + const TensorLayout &mat_idx, + const TensorLayout &dst, + size_t workspace_in_bytes) +{ + check_exec_allow_nhwc_mat_idx(src, mat, mat_idx, dst, workspace_in_bytes); + if (param().format == Param::Format::NHWC) { + megdnn_assert(!mat_idx.ndim, + "mat_idx not supported for current format"); + } +} + +void WarpPerspectiveForward::check_exec_allow_nhwc_mat_idx( + const TensorLayout& src, const TensorLayout& mat, + const TensorLayout& mat_idx, const TensorLayout& dst, + size_t workspace_in_bytes) { + check_layout_fwd(src, mat, mat_idx, dst); + auto required_workspace_in_bytes = + get_workspace_in_bytes(src, mat, mat_idx, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + if (param().format != Param::Format::NHWC && + param().format != Param::Format::NCHW && + param().format != Param::Format::NCHW4) { + megdnn_assert(!mat_idx.ndim, + "mat_idx not supported for current format"); + } +} + +void WarpPerspectiveBackwardData::check_exec(const TensorLayout &mat, + const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_layout_fwd(grad, mat, diff); + megdnn_assert(grad.dtype == dtype::Float32(), + "Backward WarpPerspective only supports Float32."); + auto required_workspace_in_bytes = get_workspace_in_bytes(mat, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void WarpPerspectiveBackwardMat::check_exec(const TensorLayout &src, + const TensorLayout &mat, + const TensorLayout &diff, + const TensorLayout &grad, + size_t workspace_in_bytes) +{ + check_layout_fwd(src, mat, diff); + megdnn_assert_eq_layout(mat, grad); + megdnn_assert(grad.dtype == dtype::Float32(), + "Backward WarpPerspective only supports Float32."); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, + mat, diff, grad); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/warp_perspective_helper.cpp b/dnn/src/common/warp_perspective_helper.cpp new file mode 100644 index 00000000..7ddb0be1 --- /dev/null +++ b/dnn/src/common/warp_perspective_helper.cpp @@ -0,0 +1,37 @@ +/** + * \file dnn/src/common/warp_perspective_helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./warp_perspective_helper.h" + +using namespace megdnn; +bool warp_perspective::is_cv_available(const TensorLayout& src, + const TensorLayout& /*mat*/, + const TensorLayout& mat_idx, + const TensorLayout& /*dst*/, + Param param) { + return param.format == Param::Format::NHWC && + (src[3] == 1 || src[3] == 3) && !mat_idx.ndim && + (src.dtype == dtype::Float32() || src.dtype == dtype::Uint8()) && + (param.imode == Param::InterpolationMode::NEAREST || + param.imode == Param::InterpolationMode::LINEAR || + param.imode == Param::InterpolationMode::CUBIC || + param.imode == Param::InterpolationMode::LANCZOS4); +} + +bool warp_perspective::is_dnn_available(const TensorLayout& /*src*/, + const TensorLayout& /*mat*/, + const TensorLayout& /*mat_idx*/, + const TensorLayout& /*dst*/, + Param param) { + return param.imode == Param::InterpolationMode::LINEAR; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/warp_perspective_helper.h b/dnn/src/common/warp_perspective_helper.h new file mode 100644 index 00000000..70f22be7 --- /dev/null +++ b/dnn/src/common/warp_perspective_helper.h @@ -0,0 +1,26 @@ +/** + * \file dnn/src/common/warp_perspective_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/oprs.h" + +namespace megdnn { +namespace warp_perspective { +using Param = param::WarpPerspective; +bool is_cv_available(const TensorLayout& src, const TensorLayout& mat, + const TensorLayout& mat_idx, const TensorLayout& dst, + Param param); +bool is_dnn_available(const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, Param param); +} // namespace warp_perspective +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/winograd/winograd_generator.cpp b/dnn/src/common/winograd/winograd_generator.cpp new file mode 100644 index 00000000..b3c28afc --- /dev/null +++ b/dnn/src/common/winograd/winograd_generator.cpp @@ -0,0 +1,277 @@ +/** + * Copyright (c) 2018, Alibaba Group Holding Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + *---------------------------------------------------------------------------- + * + * \file dnn/src/common/winograd/winograd_generator.cpp + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). 
+ * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * ---------------------------------------------------------------- + */ + +#include "src/common/winograd/winograd_generator.h" +#include "src/common/utils.h" +#include +#include +#include + +using namespace megdnn; +using namespace winograd; + +namespace { + +WinogradGenerator::Matrix computeA(const std::vector& a, int m, int n) { + WinogradGenerator::Matrix res(n, m); + for (int y = 0; y < n; ++y) { + for (int x = 0; x < m - 1; ++x) { + if (x == 0 && y == 0) { + res.at(y, x) = 1.0f; + } else { + res.at(y, x) = ::powf(a[x], (float)y); + } + } + if (y == n - 1) { + res.at(y, m - 1) = 1.0f; + } else { + res.at(y, m - 1) = 0.0f; + } + } + return res; +} + +WinogradGenerator::Matrix computeF(const std::vector& a, int alpha) { + WinogradGenerator::Matrix res(1, alpha); + for (int x = 0; x < alpha; ++x) { + float product = 1.0f; + for (int i = 0; i < alpha; ++i) { + if (x == i) { + continue; + } + product *= (a[x] - a[i]); + } + res.at(0, x) = product; + } + return res; +} + +WinogradGenerator::Matrix computeT(const std::vector& a, int n) { + WinogradGenerator::Matrix res(n, n + 1); + for (int y = 0; y < n; ++y) { + auto line = res.data() + res.cols() * y; + std::memset(line, 0, res.cols() * sizeof(float)); + line[y] = 1.0f; + line[n] = -::powf(a[y], (float)n); + } + return res; +} + +WinogradGenerator::Matrix computeL(const std::vector& a, int n) { + megdnn_assert(n >= 1); + WinogradGenerator::Matrix res(n, n); + for (int k = 0; k < n; ++k) { + WinogradGenerator::Matrix p(1, 1); + p.at(0, 0) = 1.0f; + WinogradGenerator::Matrix p2(1, 2); + for (int i = 0; i < n; ++i) { + if (i == k) { + continue; + } + p2.at(0, 0) = -a[i]; + p2.at(0, 1) = 1.0f; + p = p.poly_multi(p2); + } + std::memcpy(res.data() + res.cols() * k, p.data(), n * sizeof(float)); + } + return res; +} + +WinogradGenerator::Matrix computeB(const std::vector& a, int alpha) { + WinogradGenerator::Matrix res; + auto L = computeL(a, alpha - 1); + auto fdiag = computeF(a, alpha - 1); + L.div_per_line(fdiag); + + L.transpose(); + + auto T = computeT(a, alpha - 1); + WinogradGenerator::Matrix BT = L.mul(T); + + WinogradGenerator::Matrix B(alpha, alpha); + for (int y = 0; y < alpha - 1; ++y) { + std::memcpy(B.data() + B.cols() * y, BT.data() + BT.cols() * y, + alpha * sizeof(float)); + } + for (int x = 0; x < alpha - 1; ++x) { + B.at(alpha - 1, x) = 0; + } + B.at(alpha - 1, alpha - 1) = 1.0f; + + return B; +} + +WinogradGenerator::Matrix computeFPlusOne(const std::vector& a, + int alpha) { + auto fdiag = computeF(a, alpha - 1); + WinogradGenerator::Matrix res(1, alpha); + for (int i = 0; i < alpha - 1; i++) { + res.at(0, i) = fdiag.at(0, i); + } + res.at(0, alpha - 1) = 1; + //! 
change sign if res[0, 0] < 0 + res.at(0, 0) = std::abs(res.at(0, 0)); + + return res; +} + +} // namespace + +float& WinogradGenerator::Matrix::at(size_t row, size_t col) { + return m_data[row * m_cols + col]; +} + +const float& WinogradGenerator::Matrix::at(size_t row, size_t col) const { + return m_data[row * m_cols + col]; +} + +void WinogradGenerator::Matrix::transpose() { + WinogradGenerator::Matrix res(m_cols, m_rows); + for (size_t r = 0; r < m_rows; r++) { + for (size_t c = 0; c < m_cols; c++) { + res.at(c, r) = m_data[r * m_cols + c]; + } + } + *this = std::move(res); +} + +void WinogradGenerator::Matrix::print(const char* msg) const { + printf("%s\n", msg); + + for (size_t y = 0; y < m_rows; ++y) { + for (size_t x = 0; x < m_cols; ++x) { + printf("%.7f\t", at(y, x)); + } + printf("\n"); + } +} + +WinogradGenerator::Matrix WinogradGenerator::Matrix::mul(const Matrix& rhs) { + WinogradGenerator::Matrix res(rows(), rhs.cols()); + for (size_t r = 0; r < res.rows(); r++) { + for (size_t c = 0; c < res.cols(); c++) { + res.at(r, c) = 0.f; + for (size_t k = 0; k < cols(); k++) { + res.at(r, c) += at(r, k) * rhs.at(k, c); + } + } + } + std::swap(m_rows, m_cols); + return res; +} + +WinogradGenerator::Matrix WinogradGenerator::Matrix::poly_multi( + const Matrix& B) { + megdnn_assert(rows() == 1 && B.rows() == 1); + auto aw = cols(); + auto bw = B.cols(); + + WinogradGenerator::Matrix res(1, aw + bw - 1); + + for (size_t i = 0; i < aw + bw - 1; ++i) { + res.at(0, i) = 0.0f; + } + for (size_t y = 0; y < bw; ++y) { + auto bValue = B.at(0, y); + for (size_t x = 0; x < aw; ++x) { + auto aValue = this->at(0, x); + res.at(0, x + y) += bValue * aValue; + } + } + return res; +} + +void WinogradGenerator::Matrix::div_per_line( + const WinogradGenerator::Matrix& line) { + megdnn_assert(line.rows() == 1 && line.cols() >= m_rows); + + for (size_t y = 0; y < m_rows; ++y) { + for (size_t x = 0; x < m_cols; ++x) { + at(y, x) /= line.at(0, y); + } + } +} + +void WinogradGenerator::Matrix::mul_per_row( + const WinogradGenerator::Matrix& line) { + megdnn_assert(line.rows() == 1 && line.cols() >= m_cols); + for (size_t y = 0; y < m_rows; ++y) { + for (size_t x = 0; x < m_cols; ++x) { + at(y, x) *= line.at(0, x); + } + } +} + + + +WinogradGenerator::WinogradGenerator(size_t m, size_t r, float interp) { + size_t alpha = m + r - 1; + + std::vector a(alpha); + a[0] = 0.0f; + int sign = 1; + for (size_t i = 0; i < alpha - 1; ++i) { + int value = 1 + i / 2; + a[i + 1] = sign * value * interp; + sign *= -1; + } + + generate(m, r, a); +} + +WinogradGenerator::WinogradGenerator(size_t m, size_t r, + const std::vector& interp_points) { + megdnn_assert(interp_points.size() == m + r - 2, + "interp_points should be %zu, but got: %zu", m + r - 2, + interp_points.size()); + + generate(m, r, interp_points); +} + +void WinogradGenerator::generate(size_t m, size_t r, + const std::vector& interp_points) { + size_t alpha = m + r - 1; + m_A = computeA(interp_points, alpha, m); + m_A.transpose(); + + auto fdiag = computeFPlusOne(interp_points, alpha); + + m_G = computeA(interp_points, alpha, r); + m_G.transpose(); + m_G.div_per_line(fdiag); + + m_B = computeB(interp_points, alpha); + m_B.mul_per_row(fdiag); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/winograd/winograd_generator.h b/dnn/src/common/winograd/winograd_generator.h new file mode 100644 index 00000000..c70417ed --- /dev/null +++ b/dnn/src/common/winograd/winograd_generator.h @@ -0,0 +1,165 @@ +/** + * Copyright (c) 2018, Alibaba Group Holding Limited + 
* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * ---------------------------------------------------------------- + * + * \file dnn/src/common/winograd/winograd_generator.h + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * ---------------------------------------------------------------- + */ + +#pragma once +#include +#include +#include +#include "src/common/utils.h" + +namespace megdnn { +namespace winograd { + +/** + * \brief generator winograd matrix, A/B/G + */ +class WinogradGenerator { +public: + WinogradGenerator(size_t m, size_t r, float interp = 0.5f); + WinogradGenerator(size_t m, size_t r, + const std::vector& interp_points); + ~WinogradGenerator() = default; + + class Matrix { + public: + Matrix(size_t rows, size_t cols) : m_rows{rows}, m_cols{cols} { + m_data.resize(rows * cols); + } + Matrix() = default; + Matrix(Matrix&& rhs) { + m_data = std::move(rhs.m_data); + m_rows = rhs.m_rows; + m_cols = rhs.m_cols; + } + Matrix& operator=(Matrix&& rhs) { + m_data = std::move(rhs.m_data); + m_rows = rhs.m_rows; + m_cols = rhs.m_cols; + return *this; + } + + Matrix(const Matrix& rhs) { + m_data = rhs.m_data; + m_rows = rhs.m_rows; + m_cols = rhs.m_cols; + } + Matrix& operator=(const Matrix& rhs) { + m_data = rhs.m_data; + m_rows = rhs.m_rows; + m_cols = rhs.m_cols; + return *this; + } + + size_t rows() const { return m_rows; } + size_t cols() const { return m_cols; } + + float& at(size_t row, size_t col); + const float& at(size_t row, size_t col) const; + float* data() { return m_data.data(); } + const float* data() const { return m_data.data(); } + + void transpose(); + void div_per_line(const Matrix& line); + Matrix mul(const Matrix& rhs); + void mul_per_row(const Matrix& line); + Matrix poly_multi(const Matrix& rhs); + void print(const char* msg) const; + + private: + std::vector m_data; + size_t m_rows; + size_t m_cols; + }; + + const Matrix& A() const { return m_A; } + const Matrix& B() const { return m_B; } + const Matrix& G() const { return m_G; } + +private: + void generate(size_t m, size_t r, const std::vector& interp_points); + Matrix m_A; + Matrix m_G; + Matrix m_B; +}; + +/////////////////////// WinogradCoeff //////////////////////////// +/** + * \brief Contains the winograd coeff + */ +template +class WinogradCoeff { + std::unique_ptr m_generator; + + std::vector generate(float rescale, + const WinogradGenerator::Matrix& m) { + std::vector ret; + for (size_t r = 0; r < m.rows(); r++) { + for (size_t c = 0; c < m.cols(); c++) { + float val = m.at(r, c) * rescale; + if 
(std::is_integral::value) { + megdnn_assert( + std::abs(val - std::round(val)) < 1e-4, + "invalid rescale args, %f(item) * %f(rescale) is " + "not near %f\n", + m.at(r, c), rescale, std::round(val)); + ret.push_back(static_cast(std::round(val))); + } else { + ret.push_back(static_cast(val)); + } + } + } + return ret; + } + +public: + WinogradCoeff(size_t m, size_t r, const std::vector& interp_points) { + m_generator = std::make_unique(m, r, interp_points); + } + + std::vector A(float rescale) { + return generate(rescale, m_generator->A()); + } + + std::vector B(float rescale) { + return generate(rescale, m_generator->B()); + } + + std::vector G(float rescale) { + return generate(rescale, m_generator->G()); + } +}; + +} // namespace winograd +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/winograd/winograd_helper.cpp b/dnn/src/common/winograd/winograd_helper.cpp new file mode 100644 index 00000000..6f1dcdd5 --- /dev/null +++ b/dnn/src/common/winograd/winograd_helper.cpp @@ -0,0 +1,662 @@ +/** + * \file dnn/src/common/winograd/winograd_helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/winograd/winograd_helper.h" +#include "src/common/winograd/winograd_generator.h" +#include "src/naive/matrix_mul/matrix_mul_helper.h" + +using namespace megdnn; +namespace { +template +struct Getter { + Getter(DType){}; + otype operator()(ctype item) { return item; } +}; + +template +struct Getter::value>> { + otype zp; + Getter(DType dtype) { + zp = dtype.param().zero_point; + } + otype operator()(ctype item) { return static_cast(item) - zp; } +}; + +template +struct OutputGetter { + OutputGetter(DType){}; + otype operator()(float item) { return static_cast(item); } +}; + +template +struct OutputGetter< + ctype, otype, + typename std::enable_if_t::value>> { + DType dtype; + OutputGetter(DType dtype) : dtype{dtype} {} + otype operator()(float item) { + return dtype.param().quantize(item).as_int8(); + } +}; + +template +struct OutputGetter< + ctype, otype, + typename std::enable_if_t::value>> { + DType dtype; + OutputGetter(DType dtype) : dtype{dtype} {} + otype operator()(float item) { + return dtype.param().quantize(item).as_uint8(); + } +}; + +} // namespace + +namespace megdnn { +namespace winograd { + +template +class StrategyHelper { +public: + static void filter(const ctype* filter, + input_filter_compute_type* filter_transform_buf, + input_filter_compute_type* transform_mid_buf, size_t OC, + size_t IC, size_t oc_start, size_t oc_end, size_t m, + size_t r, const std::vector& interp_points, + DType dtype, float rescale) { + size_t alpha = m + r - 1; + WinogradCoeff winograd_coeff(m, r, + interp_points); + + input_filter_compute_type* mid_buf1 = transform_mid_buf; + input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha; + + Getter getter(dtype); + for (size_t oc = oc_start; oc < oc_end; oc++) { + rep(ic, IC) { + const ctype* filter_ptr = filter + (oc * IC + ic) * r * r; + rep(i, r) rep(j, r) { + mid_buf1[i * r + j] = getter(filter_ptr[i * r + j]); + } + + /* tmp = Matmul(G, src) */ + megdnn::naive::run_matrix_mul_tpl( + winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2, + alpha, r, r, r, r, r, dtype, 
dtype); + /* dst = Matmul(tmp, G^T) */ + megdnn::naive::run_matrix_mul_tpl( + mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1, + alpha, alpha, r, r, r, alpha, dtype, dtype); + + rep(i, alpha) rep(j, alpha) { + filter_transform_buf[(i * alpha + j) * OC * IC + ic * OC + + oc] = mid_buf1[i * alpha + j]; + } + } + } + } + + static void input(const ctype* input, + input_filter_compute_type* input_transform_buf, + input_filter_compute_type* transform_mid_buf, + int ih_start, int iw_start, size_t IH, size_t IW, + size_t IC, size_t unit_idx, size_t nr_units_in_tile, + size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float rescale) { + size_t alpha = m + r - 1; + Getter getter(dtype); + WinogradCoeff winograd_coeff(m, r, + interp_points); + rep(ic, IC) { + input_filter_compute_type* mid_buf1 = transform_mid_buf; + input_filter_compute_type* mid_buf2 = + transform_mid_buf + alpha * alpha; + + memset(mid_buf1, 0, + alpha * alpha * sizeof(input_filter_compute_type)); + rep(i, alpha) rep(j, alpha) { + int ih = ih_start + i; + int iw = iw_start + j; + if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) { + mid_buf1[i * alpha + j] = + getter(input[ic * IH * IW + ih * IW + iw]); + } + } + megdnn::naive::run_matrix_mul_tpl( + winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha, + alpha, alpha, alpha, alpha, alpha, dtype, dtype); + megdnn::naive::run_matrix_mul_tpl( + mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha, + alpha, alpha, alpha, alpha, alpha, dtype, dtype); + rep(i, alpha) rep(j, alpha) { + input_transform_buf[(i * alpha + j) * nr_units_in_tile * IC + + unit_idx * IC + ic] = + mid_buf1[i * alpha + j]; + } + } + } + + static void output(const output_compute_type* output_transform_buf, + const output_compute_type* bias, dst_type* output, + output_compute_type* transform_mid_buf, BiasMode bmode, + NonlineMode nonline_mode, size_t oh_start, + size_t ow_start, size_t OH, size_t OW, size_t oc_start, + size_t oc_end, size_t unit_idx, size_t nr_units_in_tile, + size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float input_filter_scale, float input_filter_rescale, + float rescale) { + size_t alpha = m + r - 1; + size_t OC = oc_end - oc_start; + + OutputGetter getter(dtype); + winograd::WinogradCoeff winograd_coeff( + m, r, interp_points); + for (size_t oc = oc_start; oc < oc_end; oc++) { + size_t oc_index = oc - oc_start; + output_compute_type* mid_buf1 = transform_mid_buf; + output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha; + + // gather + rep(i, alpha) rep(j, alpha) { + mid_buf1[i * alpha + j] = + output_transform_buf[(i * alpha + j) * + nr_units_in_tile * OC + + unit_idx * OC + oc_index]; + } + /* A[alpha*m] M[alpha*alpha] */ + megdnn::naive::run_matrix_mul_tpl( + winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m, + alpha, alpha, m, alpha, alpha, dtype, dtype); + megdnn::naive::run_matrix_mul_tpl< + output_compute_type, output_compute_type, false, false>( + mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m, + alpha, alpha, m, m, dtype, dtype); + rep(i, m) rep(j, m) { + auto oh = oh_start + i; + auto ow = ow_start + j; + if (oh < OH && ow < OW) { + float val = mid_buf1[i * m + j]; + if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) { + val += bias[oc] * input_filter_rescale * + input_filter_rescale; + } else if (bmode == BiasMode::BIAS) { + val += bias[oc * OH * OW + oh * OW + ow] * + input_filter_rescale * input_filter_rescale; + } + val = val * input_filter_scale / + (input_filter_rescale * 
input_filter_rescale * + rescale * rescale); + if (nonline_mode == NonlineMode::RELU) { + val = val > 0 ? val : 0; + } else if (nonline_mode == NonlineMode::SIGMOID) { + val = 1.f / (expf(-val) + 1.f); + } else if (nonline_mode == NonlineMode::H_SWISH) { + val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f; + } else { + megdnn_assert(nonline_mode == NonlineMode::IDENTITY); + } + + output[oc * OH * OW + oh * OW + ow] = getter(val); + } + } + } + } +}; + +template +class StrategyHelper< + ctype, dst_type, input_filter_compute_type, output_compute_type, format, + std::enable_if_t> { +public: + static void filter(const ctype* filter, + input_filter_compute_type* filter_transform_buf, + input_filter_compute_type* transform_mid_buf, size_t OC, + size_t IC, size_t oc_start, size_t oc_end, size_t m, + size_t r, const std::vector& interp_points, + DType dtype, float rescale) { + size_t alpha = m + r - 1; + WinogradCoeff winograd_coeff(m, r, + interp_points); + + input_filter_compute_type* mid_buf1 = transform_mid_buf; + input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha; + + Getter getter(dtype); + size_t OCB = OC / pack_size; + size_t ICB = IC / pack_size; + for (size_t oc = oc_start; oc < oc_end; oc++) { + rep(ic, IC) { + const ctype* filter_ptr = filter + (oc * IC + ic) * r * r; + rep(i, r) rep(j, r) { + mid_buf1[i * r + j] = getter(filter_ptr[i * r + j]); + } + + /* tmp = Matmul(G, src) */ + megdnn::naive::run_matrix_mul_tpl( + winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2, + alpha, r, r, r, r, r, dtype, dtype); + /* dst = Matmul(tmp, G^T) */ + megdnn::naive::run_matrix_mul_tpl( + mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1, + alpha, alpha, r, r, r, alpha, dtype, dtype); + + size_t ocb = oc / pack_size; + size_t oc_pack = oc % pack_size; + size_t icb = ic / pack_size; + size_t ic_pack = ic % pack_size; + rep(i, alpha) rep(j, alpha) { + filter_transform_buf[(i * alpha + j) * OCB * ICB * + pack_size * pack_size + + ocb * ICB * pack_size * pack_size + + icb * pack_size * pack_size + + ic_pack * pack_size + oc_pack] = + mid_buf1[i * alpha + j]; + } + } + } + } + + static void input(const ctype* input, + input_filter_compute_type* input_transform_buf, + input_filter_compute_type* transform_mid_buf, + int ih_start, int iw_start, size_t IH, size_t IW, + size_t IC, size_t unit_idx, size_t nr_units_in_tile, + size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float rescale) { + size_t alpha = m + r - 1; + Getter getter(dtype); + WinogradCoeff winograd_coeff(m, r, + interp_points); + size_t ICB = IC / pack_size; + rep(ic, IC) { + input_filter_compute_type* mid_buf1 = transform_mid_buf; + input_filter_compute_type* mid_buf2 = + transform_mid_buf + alpha * alpha; + + memset(mid_buf1, 0, + alpha * alpha * sizeof(input_filter_compute_type)); + rep(i, alpha) rep(j, alpha) { + int ih = ih_start + i; + int iw = iw_start + j; + if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) { + mid_buf1[i * alpha + j] = + getter(input[ic * IH * IW + ih * IW + iw]); + } + } + megdnn::naive::run_matrix_mul_tpl( + winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha, + alpha, alpha, alpha, alpha, alpha, dtype, dtype); + megdnn::naive::run_matrix_mul_tpl( + mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha, + alpha, alpha, alpha, alpha, alpha, dtype, dtype); + size_t icb = ic / pack_size; + size_t ic_pack = ic % pack_size; + rep(i, alpha) rep(j, alpha) { + input_transform_buf[(i * alpha + j) * ICB * nr_units_in_tile * + pack_size + + icb * 
nr_units_in_tile * pack_size + + unit_idx * pack_size + ic_pack] = + mid_buf1[i * alpha + j]; + } + } + } + + static void output(const output_compute_type* output_transform_buf, + const output_compute_type* bias, dst_type* output, + output_compute_type* transform_mid_buf, BiasMode bmode, + NonlineMode nonline_mode, size_t oh_start, + size_t ow_start, size_t OH, size_t OW, size_t oc_start, + size_t oc_end, size_t unit_idx, size_t nr_units_in_tile, + size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float input_filter_scale, float input_filter_rescale, + float rescale) { + size_t alpha = m + r - 1; + size_t OC = oc_end - oc_start; + + OutputGetter getter(dtype); + winograd::WinogradCoeff winograd_coeff( + m, r, interp_points); + size_t OCB = OC / pack_size; + for (size_t oc = oc_start; oc < oc_end; oc++) { + output_compute_type* mid_buf1 = transform_mid_buf; + output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha; + + size_t ocb = (oc - oc_start) / pack_size; + size_t oc_pack = oc % pack_size; + // gather + rep(i, alpha) rep(j, alpha) { + mid_buf1[i * alpha + j] = output_transform_buf + [(i * alpha + j) * OCB * nr_units_in_tile * pack_size + + ocb * nr_units_in_tile * pack_size + + unit_idx * pack_size + oc_pack]; + } + /* A[alpha*m] M[alpha*alpha] */ + megdnn::naive::run_matrix_mul_tpl( + winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m, + alpha, alpha, m, alpha, alpha, dtype, dtype); + megdnn::naive::run_matrix_mul_tpl< + output_compute_type, output_compute_type, false, false>( + mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m, + alpha, alpha, m, m, dtype, dtype); + rep(i, m) rep(j, m) { + auto oh = oh_start + i; + auto ow = ow_start + j; + if (oh < OH && ow < OW) { + float val = mid_buf1[i * m + j]; + if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) { + val += bias[oc] * input_filter_rescale * + input_filter_rescale; + } else if (bmode == BiasMode::BIAS) { + val += bias[oc * OH * OW + oh * OW + ow] * + input_filter_rescale * input_filter_rescale; + } + val = val * input_filter_scale / + (input_filter_rescale * input_filter_rescale * + rescale * rescale); + if (nonline_mode == NonlineMode::RELU) { + val = val > 0 ? 
val : 0; + } else if (nonline_mode == NonlineMode::SIGMOID) { + val = 1.f / (expf(-val) + 1.f); + } else if (nonline_mode == NonlineMode::H_SWISH) { + val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f; + } else { + megdnn_assert(nonline_mode == NonlineMode::IDENTITY); + } + + output[oc * OH * OW + oh * OW + ow] = getter(val); + } + } + } + } + + static size_t pack_size; +}; + +template +size_t StrategyHelper< + ctype, dst_type, input_filter_compute_type, output_compute_type, format, + std::enable_if_t>::pack_size = + MatrixMulForward::pack_size(format); + +#define INST(_ctype, _dst_type, _input_filter_compute_type, \ + _output_compute_type) \ + template class StrategyHelper< \ + _ctype, _dst_type, _input_filter_compute_type, \ + _output_compute_type, param::MatrixMul::Format::DEFAULT>; + +INST(float, float, float, float) +MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16)) +INST(int8_t, int8_t, int16_t, int) +INST(uint8_t, uint8_t, int16_t, int) +#undef INST + +#define INST(_ctype, _dst_type, _input_filter_compute_type, \ + _output_compute_type) \ + template class StrategyHelper< \ + _ctype, _dst_type, _input_filter_compute_type, \ + _output_compute_type, param::MatrixMul::Format::MK4>; +INST(float, float, float, float) +#undef INST + +#define INST(_ctype, _dst_type, _input_filter_compute_type, \ + _output_compute_type) \ + template class StrategyHelper< \ + _ctype, _dst_type, _input_filter_compute_type, \ + _output_compute_type, param::MatrixMul::Format::MK8>; +INST(int8_t, int8_t, int16_t, int) +MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16)) +#undef INST + +template +class StrategyHelperNchwxx< + ctype, dst_type, input_filter_compute_type, output_compute_type, format, + std::enable_if_t> { +public: + static void filter(const ctype* filter, + input_filter_compute_type* filter_transform_buf, + input_filter_compute_type* transform_mid_buf, size_t OC, + size_t IC, size_t oc_start, size_t oc_end, size_t m, + size_t r, const std::vector& interp_points, + DType dtype, float rescale) { + megdnn_assert( + (oc_end - oc_start) % 8 == 0 && oc_start % 8 == 0 && + oc_end % 8 == 0 && IC % 8 == 0 && OC % 8 == 0, + "Winograd filter transform input param is not times of 8!"); + + size_t alpha = m + r - 1; + WinogradCoeff winograd_coeff(m, r, + interp_points); + + input_filter_compute_type* mid_buf1 = transform_mid_buf; + input_filter_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha; + + Getter getter(dtype); + size_t OCB = OC / pack_size; + size_t ICB = IC / pack_size; + for (size_t oc = oc_start; oc < oc_end; oc++) { + rep(ic, IC) { + size_t ocb = oc / pack_size; + size_t oc_pack = oc % pack_size; + size_t icb = ic / pack_size; + size_t ic_pack = ic % pack_size; + + const ctype* filter_ptr = + filter + (ocb * (IC / 8) + icb) * r * r * 8 * 8 + + ic_pack * 8 + oc_pack; + rep(i, r) rep(j, r) { + mid_buf1[i * r + j] = + getter(filter_ptr[(i * r + j) * 8 * 8]); + } + + /* tmp = Matmul(G, src) */ + megdnn::naive::run_matrix_mul_tpl( + winograd_coeff.G(rescale).data(), mid_buf1, mid_buf2, + alpha, r, r, r, r, r, dtype, dtype); + /* dst = Matmul(tmp, G^T) */ + megdnn::naive::run_matrix_mul_tpl( + mid_buf2, winograd_coeff.G(rescale).data(), mid_buf1, + alpha, alpha, r, r, r, alpha, dtype, dtype); + + rep(i, alpha) rep(j, alpha) { + filter_transform_buf[(i * alpha + j) * OCB * ICB * + pack_size * pack_size + + ocb * ICB * pack_size * pack_size + + icb * pack_size * pack_size + + ic_pack * pack_size + oc_pack] = + mid_buf1[i * alpha + j]; + } + } + } + 
} + + static void input(const ctype* input, + input_filter_compute_type* input_transform_buf, + input_filter_compute_type* transform_mid_buf, + int ih_start, int iw_start, size_t IH, size_t IW, + size_t IC, size_t unit_idx, size_t nr_units_in_tile, + size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float rescale) { + size_t alpha = m + r - 1; + Getter getter(dtype); + WinogradCoeff winograd_coeff(m, r, + interp_points); + size_t ICB = IC / pack_size; + rep(ic, IC) { + size_t icb = ic / pack_size; + size_t ic_pack = ic % pack_size; + input_filter_compute_type* mid_buf1 = transform_mid_buf; + input_filter_compute_type* mid_buf2 = + transform_mid_buf + alpha * alpha; + + memset(mid_buf1, 0, + alpha * alpha * sizeof(input_filter_compute_type)); + rep(i, alpha) rep(j, alpha) { + int ih = ih_start + i; + int iw = iw_start + j; + if (ih >= 0 && ih < (int)IH && iw >= 0 && iw < (int)IW) { + mid_buf1[i * alpha + j] = getter( + input[(icb * IH * IW + ih * IW + iw) * pack_size + + ic_pack]); + } + } + megdnn::naive::run_matrix_mul_tpl( + winograd_coeff.B(rescale).data(), mid_buf1, mid_buf2, alpha, + alpha, alpha, alpha, alpha, alpha, dtype, dtype); + megdnn::naive::run_matrix_mul_tpl( + mid_buf2, winograd_coeff.B(rescale).data(), mid_buf1, alpha, + alpha, alpha, alpha, alpha, alpha, dtype, dtype); + rep(i, alpha) rep(j, alpha) { + input_transform_buf[(i * alpha + j) * ICB * nr_units_in_tile * + pack_size + + icb * nr_units_in_tile * pack_size + + unit_idx * pack_size + ic_pack] = + mid_buf1[i * alpha + j]; + } + } + } + + static void output(const output_compute_type* output_transform_buf, + const output_compute_type* bias, dst_type* output, + output_compute_type* transform_mid_buf, BiasMode bmode, + NonlineMode nonline_mode, size_t oh_start, + size_t ow_start, size_t OH, size_t OW, size_t oc_start, + size_t oc_end, size_t unit_idx, size_t nr_units_in_tile, + size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float input_filter_scale, float input_filter_rescale, + float rescale) { + size_t alpha = m + r - 1; + size_t OC = oc_end - oc_start; + + OutputGetter getter(dtype); + winograd::WinogradCoeff winograd_coeff( + m, r, interp_points); + size_t OCB = OC / pack_size; + for (size_t oc = oc_start; oc < oc_end; oc++) { + output_compute_type* mid_buf1 = transform_mid_buf; + output_compute_type* mid_buf2 = transform_mid_buf + alpha * alpha; + + size_t ocb = (oc - oc_start) / pack_size; + size_t oc_pack = oc % pack_size; + // gather + rep(i, alpha) rep(j, alpha) { + mid_buf1[i * alpha + j] = output_transform_buf + [(i * alpha + j) * OCB * nr_units_in_tile * pack_size + + ocb * nr_units_in_tile * pack_size + + unit_idx * pack_size + oc_pack]; + } + /* A[alpha*m] M[alpha*alpha] */ + megdnn::naive::run_matrix_mul_tpl( + winograd_coeff.A(rescale).data(), mid_buf1, mid_buf2, m, + alpha, alpha, m, alpha, alpha, dtype, dtype); + megdnn::naive::run_matrix_mul_tpl< + output_compute_type, output_compute_type, false, false>( + mid_buf2, winograd_coeff.A(rescale).data(), mid_buf1, m, m, + alpha, alpha, m, m, dtype, dtype); + rep(i, m) rep(j, m) { + auto oh = oh_start + i; + auto ow = ow_start + j; + if (oh < OH && ow < OW) { + float val = mid_buf1[i * m + j]; + if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) { + val += bias[oc] * input_filter_rescale * + input_filter_rescale; + } else if (bmode == BiasMode::BIAS) { + val += bias[(oc / pack_size * OH * OW + oh * OW + ow) * + pack_size + + oc_pack] * + input_filter_rescale * input_filter_rescale; + } + val = val * 
input_filter_scale / + (input_filter_rescale * input_filter_rescale * + rescale * rescale); + if (nonline_mode == NonlineMode::RELU) { + val = val > 0 ? val : 0; + } else if (nonline_mode == NonlineMode::SIGMOID) { + val = 1.f / (expf(-val) + 1.f); + } else if (nonline_mode == NonlineMode::H_SWISH) { + val = val * std::min(std::max(val + 3, 0.f), 6.f) / 6.f; + } else { + megdnn_assert(nonline_mode == NonlineMode::IDENTITY); + } + + output[(oc / pack_size * OH * OW + oh * OW + ow) * + pack_size + + oc_pack] = getter(val); + } + } + } + } + + static size_t pack_size; +}; + +template +size_t StrategyHelperNchwxx< + ctype, dst_type, input_filter_compute_type, output_compute_type, format, + std::enable_if_t>::pack_size = + MatrixMulForward::pack_size(format); + +#define INST(_ctype, _dst_type, _input_filter_compute_type, \ + _output_compute_type) \ + template class StrategyHelperNchwxx< \ + _ctype, _dst_type, _input_filter_compute_type, \ + _output_compute_type, param::MatrixMul::Format::MK8>; +INST(float, float, float, float) +#undef INST + + + +} // namespace winograd +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/winograd/winograd_helper.h b/dnn/src/common/winograd/winograd_helper.h new file mode 100644 index 00000000..bdbec620 --- /dev/null +++ b/dnn/src/common/winograd/winograd_helper.h @@ -0,0 +1,107 @@ +/** + * \file dnn/src/common/winograd/winograd_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once + +#include +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" + +namespace megdnn { +namespace winograd { + +using NonlineMode = ::megdnn::ConvBias::Param::NonlineMode; +using BiasMode = ConvBiasForward::BiasMode; +/** + * \brief Strategy helper, contains some helper function for debug kernel + * implementation + * + * \warning The layout should be NCHW + */ +template +class StrategyHelper { +public: + static void filter(const ctype* filter, + input_filter_compute_type* filter_transform_buf, + input_filter_compute_type* transform_mid_buf, size_t OC, + size_t IC, size_t oc_start, size_t oc_end, size_t m, + size_t r, const std::vector& interp_points, + DType dtype, float rescale = 1.0f); + + static void input(const ctype* input, + input_filter_compute_type* input_transform_buf, + input_filter_compute_type* transform_mid_buf, + int ih_start, int iw_start, size_t IH, size_t IW, + size_t IC, size_t unit_idx, size_t nr_units_in_tile, + size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float rescale = 1.0f); + + static void + output(const output_compute_type* output_transform_buf, + const output_compute_type* bias, dst_type* output, + output_compute_type* transform_mid_buf, BiasMode bmode, + NonlineMode nonline_mode, size_t oh_start, size_t ow_start, + size_t OH, size_t OW, size_t oc_start, size_t oc_end, + size_t unit_idx, size_t nr_units_in_tile, size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float input_filter_scale = 1.0f, // input_scale * filter_scale + float input_filter_rescale = 1.0f, // input_rescale * filter_rescale + float rescale = 1.0f); +}; + +/** + * \brief Strategy helper, contains some helper function for debug kernel + * implementation + * + * \warning The layout should be NCHW88 + */ +template +class StrategyHelperNchwxx { +public: + static void filter(const ctype* filter, + input_filter_compute_type* filter_transform_buf, + input_filter_compute_type* transform_mid_buf, size_t OC, + size_t IC, size_t oc_start, size_t oc_end, size_t m, + size_t r, const std::vector& interp_points, + DType dtype, float rescale = 1.0f); + + static void input(const ctype* input, + input_filter_compute_type* input_transform_buf, + input_filter_compute_type* transform_mid_buf, + int ih_start, int iw_start, size_t IH, size_t IW, + size_t IC, size_t unit_idx, size_t nr_units_in_tile, + size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float rescale = 1.0f); + + static void + output(const output_compute_type* output_transform_buf, + const output_compute_type* bias, dst_type* output, + output_compute_type* transform_mid_buf, BiasMode bmode, + NonlineMode nonline_mode, size_t oh_start, size_t ow_start, + size_t OH, size_t OW, size_t oc_start, size_t oc_end, + size_t unit_idx, size_t nr_units_in_tile, size_t m, size_t r, + const std::vector& interp_points, DType dtype, + float input_filter_scale = 1.0f, // input_scale * filter_scale + float input_filter_rescale = 1.0f, // input_rescale * filter_rescale + float rescale = 1.0f); +}; + +} // namespace winograd +} // namespace megdnn + // vim: syntax=cpp.doxygen diff --git a/dnn/src/common/winograd_filter_preprocess.cpp b/dnn/src/common/winograd_filter_preprocess.cpp new file mode 100644 index 00000000..c3471ed9 --- /dev/null +++ b/dnn/src/common/winograd_filter_preprocess.cpp @@ -0,0 +1,141 @@ +/** + * \file dnn/src/common/winograd_filter_preprocess.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii 
Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/oprs.h" + +#include +#include "src/common/utils.h" + +using namespace megdnn; +void WinogradFilterPreprocess::deduce_layout(const TensorLayout& src, + TensorLayout& dst) { + auto errmsg = [&]() { + return "invalid filter layout:" + megdnn_layout_msg(src); + }; + MEGDNN_MARK_USED_VAR(errmsg); + //! NCHW88 weight layout include + //! dense{oc/8, ic/8, fh, fw, 8, 8}; group {g, oc/8, ic/8, fh, fw, 8, 8}; + //! channel wise{g/8, 1, 1, fh, fw, 8} + megdnn_assert( + src.ndim == 4 || src.ndim == 5 || src.ndim == 6 || src.ndim == 7, + "%s", errmsg().c_str()); + //! nchw88 channel wise conv + megdnn_assert(!(src.ndim == 6 && src[1] == 1 && src[2] == 1), + "chennel wise nchw88 can not use winograd "); + //! nchw88 group conv + size_t flt_start = 0; + size_t pack_c_size = 1; + size_t group = 1; + //! group conv + if (src.ndim == 5) { + flt_start = 1; + group = src[0]; + //! nchw88 dense conv + } else if (src.ndim == 6) { + pack_c_size = src[5]; + //! nchw88 group conv + } else if (src.ndim == 7) { + flt_start = 1; + group = src[0]; + pack_c_size = src[6]; + } + size_t OC = src[flt_start] * pack_c_size, + IC = src[flt_start + 1] * pack_c_size, FH = src[flt_start + 2], + FW = src[flt_start + 3]; + size_t m = param().output_block_size; + megdnn_assert(FH == FW, "%s", errmsg().c_str()); + + size_t alpha = FH + m - 1; + DType dst_type = src.dtype; + if (src.dtype.category() == DTypeCategory::QUANTIZED) { + megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8); + dst_type = dtype::QuantizedS16( + src.dtype.param().scale); + } + + if (src.ndim == 4 || src.ndim == 6) { + if (param().format == param::Winograd::Format::DEFAULT) { + dst = TensorLayout({alpha, alpha, IC, OC}, dst_type); + } else { + megdnn_assert(param().format == param::Winograd::Format::MK4 || + param().format == param::Winograd::Format::MK8); + size_t pack_size = MatrixMulForward::pack_size(param().format); + dst = TensorLayout({alpha, alpha, OC / pack_size, IC / pack_size, + pack_size, pack_size}, + dst_type); + } + } else { + megdnn_assert(src.ndim == 5 || src.ndim == 7); + if (param().format == param::Winograd::Format::DEFAULT) { + dst = TensorLayout({group, alpha, alpha, IC, OC}, dst_type); + } else { + megdnn_assert(param().format == param::Winograd::Format::MK4 || + param().format == param::Winograd::Format::MK8); + size_t pack_size = MatrixMulForward::pack_size(param().format); + dst = TensorLayout({group, alpha, alpha, OC / pack_size, + IC / pack_size, pack_size, pack_size}, + dst_type); + } + } +} + +void WinogradFilterPreprocess::check_exec(const TensorLayout& src, + const TensorLayout& dst, + size_t workspace_in_bytes) { + auto errmsg = [&]() { + return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(dst); + }; + MEGDNN_MARK_USED_VAR(errmsg); + megdnn_assert_contiguous(src); + megdnn_assert_contiguous(dst); + //! 
nchwxx now only support Format MKx + if (param().format == param::Winograd::Format::DEFAULT) { + megdnn_assert(src.ndim == dst.ndim && (src.ndim == 4 || src.ndim == 5), + "%s", errmsg().c_str()); + } else { + megdnn_assert( + (param().format == param::Winograd::Format::MK4 || + param().format == param::Winograd::Format::MK8) && + (src.ndim == dst.ndim - 2 || src.ndim == dst.ndim) && + (src.ndim == 4 || src.ndim == 5 || src.ndim == 6 || + src.ndim == 7), + "%s", errmsg().c_str()); + } + + TensorLayout dst_expected; + deduce_layout(src, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +size_t WinogradFilterPreprocess::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& dst) { + MEGDNN_MARK_USED_VAR(dst); + DType output_compute_dtype = src.dtype; + if (src.dtype.category() == DTypeCategory::QUANTIZED) { + megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || + src.dtype.enumv() == DTypeEnum::Quantized8Asymm); + output_compute_dtype = dtype::QuantizedS16( + src.dtype.param().scale); + } + + size_t FW = src[3]; + if (src.ndim == 5 || src.ndim == 7) { + FW = src[4]; + } + + size_t pack_size = MatrixMulForward::pack_size(param().format); + size_t alpha = param().output_block_size + FW - 1; + return 2 * alpha * alpha * output_compute_dtype.size() * pack_size * + pack_size; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/add_update/kern.cu b/dnn/src/cuda/add_update/kern.cu new file mode 100644 index 00000000..f4f32b8c --- /dev/null +++ b/dnn/src/cuda/add_update/kern.cu @@ -0,0 +1,31 @@ +/** + * \file dnn/src/cuda/add_update/kern.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" + +namespace megdnn { +namespace cuda { + +#define cb(_dtype) \ + INST_RUN_ELEMWISE( \ + AddUpdateKernOp::ctype>, \ + DTypeTrait<_dtype>::ctype, 1); \ + INST_RUN_ELEMWISE( \ + AddUpdateKernOpNonContig::ctype>, \ + DTypeTrait<_dtype>::ctype, 2); + +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +} // namespace megdnn +} // namespace cuda + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/add_update/kern.cuh b/dnn/src/cuda/add_update/kern.cuh new file mode 100644 index 00000000..49bac12b --- /dev/null +++ b/dnn/src/cuda/add_update/kern.cuh @@ -0,0 +1,113 @@ +/** + * \file dnn/src/cuda/add_update/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
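[editor note] To make the WinogradFilterPreprocess shape deduction and workspace formula above concrete: for a dense NCHW filter {OC, IC, FH, FW} with output_block_size m, the transformed layout is {alpha, alpha, IC, OC} (DEFAULT) or {alpha, alpha, OC/p, IC/p, p, p} for MK4/MK8 with p = pack_size, and the workspace holds two alpha x alpha tiles of the compute dtype per pack block. A minimal stand-alone sketch of that arithmetic; all names and the float32 dtype below are illustrative, not taken from this diff.

// Stand-alone illustration of the Winograd filter-preprocess shape/workspace
// arithmetic shown above. Names and values here are illustrative only.
#include <cstddef>
#include <cstdio>

int main() {
    // Example: dense NCHW filter 32x16x3x3, output_block_size m = 2 (F(2,3)).
    size_t OC = 32, IC = 16, FH = 3, FW = 3, m = 2;
    size_t alpha = FH + m - 1;            // 3 + 2 - 1 = 4
    (void)FW;                             // FH == FW is asserted by the opr

    // DEFAULT format: dst = {alpha, alpha, IC, OC}
    std::printf("DEFAULT dst elems: %zu\n", alpha * alpha * IC * OC);

    // MK4 format: pack_size = 4, dst = {alpha, alpha, OC/4, IC/4, 4, 4}
    size_t pack = 4;
    std::printf("MK4 dst elems:     %zu\n",
                alpha * alpha * (OC / pack) * (IC / pack) * pack * pack);

    // Workspace: two alpha x alpha tiles of the compute dtype, per pack block.
    size_t dtype_size = 4;                // e.g. float32
    std::printf("workspace bytes:   %zu\n",
                2 * alpha * alpha * dtype_size * pack * pack);
    return 0;
}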
+ */ + +#pragma once + +#include "src/cuda/utils.cuh" +#include "src/cuda/elemwise_helper.cuh" + +#if MEGDNN_CC_HOST +#include "megdnn/oprs.h" +#endif + +namespace megdnn{ +namespace cuda { + + template + struct AddUpdateKernOp { + ctype *dst; + ctype alpha, beta, bias; + + __device__ void operator() (uint32_t idx, ctype delta) { + dst[idx] = dst[idx] * alpha + delta * beta + bias; + } + +#if MEGDNN_CC_HOST + AddUpdateKernOp(const TensorND &dest, const AddUpdate::Param ¶m): + dst{dest.ptr()}, + alpha(param.alpha), beta(param.beta), bias(param.bias) + { + } +#endif + }; + + template + struct AddUpdateKernOp< + ctype, typename std::enable_if< + std::is_same::value || + std::is_same::value>::type> { + typedef typename elemwise_intl::VectTypeTrait::vect_type + vect_type; + ctype* dst; + ctype alpha, beta, bias; + __device__ void operator()(uint32_t idx, ctype delta) { + dst[idx] = dst[idx] * alpha + delta * beta + bias; + } + __device__ void operator()(uint32_t idx, vect_type delta) { + vect_type& x = *(vect_type*)(&dst[idx]); + x.x = x.x * alpha + delta.x * beta + bias; + x.y = x.y * alpha + delta.y * beta + bias; + x.z = x.z * alpha + delta.z * beta + bias; + x.w = x.w * alpha + delta.w * beta + bias; + } +#if MEGDNN_CC_HOST + AddUpdateKernOp(const TensorND& dest, const AddUpdate::Param& param) + : dst{dest.ptr()}, + alpha(param.alpha), + beta(param.beta), + bias(param.bias){}; +#endif + }; + + template + struct AddUpdateKernOpNonContig { + ctype alpha, beta, bias; + + __device__ void operator() (uint32_t /*idx*/, ctype &dst, ctype delta) { + dst = dst * alpha + delta * beta + bias; + } + +#if MEGDNN_CC_HOST + AddUpdateKernOpNonContig(const AddUpdate::Param ¶m): + alpha(param.alpha), beta(param.beta), bias(param.bias) + { + } +#endif + }; + + template + struct AddUpdateKernOpNonContig< + ctype, typename std::enable_if< + std::is_same::value || + std::is_same::value>::type> { + typedef typename elemwise_intl::VectTypeTrait::vect_type + vect_type; + ctype alpha, beta, bias; + __device__ void operator()(uint32_t, ctype& dst, ctype delta) { + dst = dst * alpha + delta * beta + bias; + } + __device__ void operator()(uint32_t, vect_type& dst, vect_type delta) { + dst.x = dst.x * alpha + delta.x * beta + bias; + dst.y = dst.y * alpha + delta.y * beta + bias; + dst.z = dst.z * alpha + delta.z * beta + bias; + dst.w = dst.w * alpha + delta.w * beta + bias; + } +#if MEGDNN_CC_HOST + AddUpdateKernOpNonContig(const AddUpdate::Param& param) + : alpha(param.alpha), beta(param.beta), bias(param.bias) {} +#endif + }; + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/add_update/opr_impl.cpp b/dnn/src/cuda/add_update/opr_impl.cpp new file mode 100644 index 00000000..f3bddbcf --- /dev/null +++ b/dnn/src/cuda/add_update/opr_impl.cpp @@ -0,0 +1,67 @@ +/** + * \file dnn/src/cuda/add_update/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
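[editor note] The AddUpdate functors above all implement the same update rule, dst = dst * alpha + delta * beta + bias; the int8/uint8 specializations only process four packed elements per thread via vect_type. A scalar CPU reference of the semantics, illustrative only and not part of the diff.

// CPU reference for the AddUpdate rule used by the CUDA functors above:
//   dst[i] = dst[i] * alpha + delta[i] * beta + bias
#include <cstddef>
#include <cstdio>
#include <vector>

template <typename T>
void add_update_ref(std::vector<T>& dst, const std::vector<T>& delta,
                    T alpha, T beta, T bias) {
    for (size_t i = 0; i < dst.size(); ++i)
        dst[i] = dst[i] * alpha + delta[i] * beta + bias;
}

int main() {
    std::vector<float> dst{1.f, 2.f, 3.f, 4.f}, delta{10.f, 20.f, 30.f, 40.f};
    add_update_ref(dst, delta, 0.5f, 2.f, 1.f);   // 0.5*dst + 2*delta + 1
    for (float v : dst) std::printf("%g ", v);    // 21.5 42 62.5 83
    std::printf("\n");
    return 0;
}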
+ */ + +#include "./kern.cuh" +#include "./opr_impl.h" + +#include "src/common/utils.h" + +using namespace megdnn; +using namespace cuda; + +void AddUpdateForwardImpl::exec( + _megdnn_tensor_inout dest, _megdnn_tensor_in delta) { + check_exec(dest.layout, delta.layout); + if (!dest.layout.is_contiguous()) { + return exec_noncontig(dest, delta); + } + ElemwiseOpParamN<1> param; + param[0] = delta; + param[0].layout = param[0].layout.broadcast(dest.layout); + param.init_from_given_tensor(); + auto stream = cuda_stream(handle()); + switch (dest.layout.dtype.enumv()) { + +#define cb(_dt) case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + return run_elemwise, ctype, 1>( \ + param, stream, {dest, m_param}); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + + default: + megdnn_throw(megdnn_mangle("unsupported dtype for AddUpdate")); + } +} + +void AddUpdateForwardImpl::exec_noncontig( + _megdnn_tensor_inout dest, _megdnn_tensor_in delta) { + + ElemwiseOpParamN<2> param = make_param(dest, delta); + auto stream = cuda_stream(handle()); + switch (dest.layout.dtype.enumv()) { + +#define cb(_dt) case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + return run_elemwise, ctype, 2>( \ + param, stream, {m_param}); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + + default: + megdnn_throw(megdnn_mangle("unsupported dtype for AddUpdate")); + } +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/add_update/opr_impl.h b/dnn/src/cuda/add_update/opr_impl.h new file mode 100644 index 00000000..19d737c1 --- /dev/null +++ b/dnn/src/cuda/add_update/opr_impl.h @@ -0,0 +1,35 @@ +/** + * \file dnn/src/cuda/add_update/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/oprs.h" +#include "src/common/add_update_helper.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +class AddUpdateForwardImpl final : public AddUpdateForwardHelper { + void exec_noncontig(_megdnn_tensor_inout dest, _megdnn_tensor_in delta); + +public: + using AddUpdateForwardHelper::AddUpdateForwardHelper; + + void exec(_megdnn_tensor_inout dest, _megdnn_tensor_in delta) override; + + bool is_thread_safe() const override { return true; } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/argmxx/argmxx.cu b/dnn/src/cuda/argmxx/argmxx.cu new file mode 100644 index 00000000..406678fb --- /dev/null +++ b/dnn/src/cuda/argmxx/argmxx.cu @@ -0,0 +1,26 @@ +/** + * \file dnn/src/cuda/argmxx/argmxx.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
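[editor note] In the contiguous path of AddUpdateForwardImpl::exec above, delta is first broadcast to dest's layout and the update then runs as a single-input elementwise kernel over dest. A rough CPU illustration of that broadcast-then-update idea for a per-row delta; the shapes are hypothetical and this is not the actual ElemwiseOpParamN machinery.

// Rough CPU illustration of "broadcast delta to dest, then apply AddUpdate".
// Here delta has shape (1, W) and is broadcast over the H rows of dest (H, W).
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const size_t H = 2, W = 3;
    std::vector<float> dest(H * W, 1.f);        // dest(H, W), all ones
    std::vector<float> delta{10.f, 20.f, 30.f}; // delta(1, W)
    float alpha = 1.f, beta = 0.1f, bias = 0.f;

    for (size_t h = 0; h < H; ++h)
        for (size_t w = 0; w < W; ++w) {
            float d = delta[w];                 // broadcast along H
            dest[h * W + w] = dest[h * W + w] * alpha + d * beta + bias;
        }

    for (size_t i = 0; i < H * W; ++i) std::printf("%g ", dest[i]); // 2 3 4 2 3 4
    std::printf("\n");
    return 0;
}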
+ */ +#include "src/common/argmxx_helper.h" + +#include "src/cuda/reduce_helper.cuh" +#include "megdnn/dtype.h" + +namespace megdnn { +namespace cuda { + +#define INST(_dt) \ + INST_REDUCE(argmxx::ArgmxxOp::ctype MEGDNN_COMMA false>, false); \ + INST_REDUCE(argmxx::ArgmxxOp::ctype MEGDNN_COMMA true>, false); \ + + MEGDNN_FOREACH_COMPUTING_DTYPE(INST) + +} // namespace argmxx +} // namespace megdnn diff --git a/dnn/src/cuda/argmxx/argmxx.cuh b/dnn/src/cuda/argmxx/argmxx.cuh new file mode 100644 index 00000000..b1787bcf --- /dev/null +++ b/dnn/src/cuda/argmxx/argmxx.cuh @@ -0,0 +1,12 @@ +/** + * \file dnn/src/cuda/argmxx/argmxx.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/argmxx/opr_impl.cpp b/dnn/src/cuda/argmxx/opr_impl.cpp new file mode 100644 index 00000000..203e23cf --- /dev/null +++ b/dnn/src/cuda/argmxx/opr_impl.cpp @@ -0,0 +1,124 @@ +/** + * \file dnn/src/cuda/argmxx/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/argmxx/opr_impl.h" + +#include "src/cuda/utils.h" +#include "src/common/reduce_helper.h" +#include "src/common/argmxx_helper.h" +#include "src/cuda/reduce_helper.cuh" + +namespace { + +using namespace megdnn; +using namespace cuda; +using namespace argmxx; + +template +size_t get_workspace_in_bytes_impl(const TensorLayout &src, + const TensorLayout & /* dst */, + size_t axis) +{ + size_t A, B, C; + reduce::get_ABC(src, A, B, C, axis); + return get_reduce_workspace_in_bytes>( + A, B, C); +} + +template +void exec_impl(const T *src, int *dst, void *workspace, + size_t A, size_t B, size_t C, + cudaStream_t stream) +{ + argmxx::ArgmxxOp opr(const_cast(src), dst, A, B, C); + run_reduce, false>( + (typename argmxx::ArgmxxOp::wtype *)workspace, + A, B, C, + stream, opr); + after_kernel_launch(); +} + +} // anonymous namespace + +namespace megdnn { +namespace cuda { + +size_t ArgmaxForwardImpl::get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) +{ +#define cb(DType) \ + if (src.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + return get_workspace_in_bytes_impl(src, dst, param().axis); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + megdnn_assert_internal(false); +} + +void ArgmaxForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) +{ + check_exec(src.layout, dst.layout, workspace.size); + size_t A, B, C; + reduce::get_ABC(src.layout, A, B, C, param().axis); + auto stream = cuda_stream(handle()); +#define cb(DType) \ + if (src.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + exec_impl(src.ptr(), \ + dst.ptr(), \ + workspace.raw_ptr, \ + A, B, C, stream); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +#undef cb +} + +size_t ArgminForwardImpl::get_workspace_in_bytes(const TensorLayout &src, + 
const TensorLayout &dst) +{ +#define cb(DType) \ + if (src.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + return get_workspace_in_bytes_impl(src, dst, param().axis); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + megdnn_assert_internal(false); +} + +void ArgminForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) +{ + check_exec(src.layout, dst.layout, workspace.size); + size_t A, B, C; + reduce::get_ABC(src.layout, A, B, C, param().axis); + auto stream = cuda_stream(handle()); +#define cb(DType) \ + if (src.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + exec_impl(src.ptr(), \ + dst.ptr(), \ + workspace.raw_ptr, \ + A, B, C, stream); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +#undef cb +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/argmxx/opr_impl.h b/dnn/src/cuda/argmxx/opr_impl.h new file mode 100644 index 00000000..167e740d --- /dev/null +++ b/dnn/src/cuda/argmxx/opr_impl.h @@ -0,0 +1,41 @@ +/** + * \file dnn/src/cuda/argmxx/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class ArgmaxForwardImpl final: public ArgmaxForward { + public: + using ArgmaxForward::ArgmaxForward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) override; +}; + +class ArgminForwardImpl: public ArgminForward { + public: + using ArgminForward::ArgminForward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) override; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/argsort/argsort.cu b/dnn/src/cuda/argsort/argsort.cu new file mode 100644 index 00000000..9e46c169 --- /dev/null +++ b/dnn/src/cuda/argsort/argsort.cu @@ -0,0 +1,169 @@ +/** + * \file dnn/src/cuda/argsort/argsort.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
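[editor note] Both argmax and argmin above flatten the input around the reduced axis via reduce::get_ABC: for a layout (d0, ..., dk, ..., dn-1) reduced along axis k, A is the product of the leading dims, B = dk, and C the product of the trailing dims. A small CPU sketch of an argmax over that (A, B, C) view; this is an illustration, not the megdnn API.

// CPU sketch of argmax over the (A, B, C) decomposition used above:
// for shape (2, 3, 4, 5) reduced along axis 2 -> A = 2*3 = 6, B = 4, C = 5,
// and dst[a, c] = argmax over b of src[a, b, c].
#include <cstddef>
#include <cstdio>
#include <vector>

void argmax_abc(const float* src, int* dst, size_t A, size_t B, size_t C) {
    for (size_t a = 0; a < A; ++a)
        for (size_t c = 0; c < C; ++c) {
            size_t best = 0;
            for (size_t b = 1; b < B; ++b)
                if (src[(a * B + b) * C + c] > src[(a * B + best) * C + c])
                    best = b;
            dst[a * C + c] = static_cast<int>(best);
        }
}

int main() {
    size_t A = 6, B = 4, C = 5;
    std::vector<float> src(A * B * C);
    for (size_t i = 0; i < src.size(); ++i) src[i] = float((i * 37) % 11);
    std::vector<int> dst(A * C);
    argmax_abc(src.data(), dst.data(), A, B, C);
    std::printf("dst[0]=%d (index along the reduced axis)\n", dst[0]);
    return 0;
}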
+ */ + +#include "./argsort.cuh" +#include "./bitonic_sort.cuh" +#include "megdnn/basic_types.h" +#include "src/cuda/utils.cuh" + +#include "src/cuda/cub/device/device_radix_sort.cuh" +#include "src/cuda/cub/device/device_segmented_radix_sort.cuh" + +using namespace megdnn; +using namespace cuda; + +namespace { +struct StridedOffsetIterator { + int bias, stride; + + StridedOffsetIterator(int bias_, int stride_) + : bias(bias_), stride(stride_) {} + + __device__ __forceinline__ int operator[](int i) const { + return stride * i + bias; + } +}; + +bool use_bitonic(uint32_t /*M*/, uint32_t N) { + // bitonic sort is preferred when N is small (alwyas faster than radix sort) + return N <= BITONIC_SORT_MAX_LENGTH; +} + +bool use_segmented(uint32_t M, uint32_t /*N*/) { + // an empirical value: + // sort(1, 1e6): 0.574ms + // segsort({1,2,8,16}, 1e6): 7-8ms + // sort(1, 1e7): 3.425ms + // segsort({1,2,8,16}, 1e7): 71-84ms + // + // segsort is about 7x-10x slower than sort on small batches, so we can + // expect it to be faster than sort when batch is large enough. + return M >= 8; +} + +template +MEGDNN_NOINLINE size_t cub_sort_pairs( + bool is_ascending, void* workspace, size_t workspace_size, + const KeyType* keys_in, KeyType* keys_out, const int* values_in, + int* values_out, uint32_t M, uint32_t N, cudaStream_t stream) { + cudaError_t err; + if (use_segmented(M, N)) { + if (is_ascending) { + err = cub::DeviceSegmentedRadixSort::SortPairs( + workspace, workspace_size, keys_in, keys_out, values_in, + values_out, N * M, M, StridedOffsetIterator(0, N), + StridedOffsetIterator(N, N), 0, sizeof(float) * 8, stream); + } else { + err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + workspace, workspace_size, keys_in, keys_out, values_in, + values_out, N * M, M, StridedOffsetIterator(0, N), + StridedOffsetIterator(N, N), 0, sizeof(float) * 8, stream); + } + } else { + if (is_ascending) { + for (size_t i = 0; i < M; ++i) { + err = cub::DeviceRadixSort::SortPairs( + workspace, workspace_size, keys_in + N * i, + keys_out + N * i, values_in + N * i, values_out + N * i, + N, 0, sizeof(float) * 8, stream); + cuda_check(err); + if (!keys_in) { + return workspace_size; + } + } + } else { + for (size_t i = 0; i < M; ++i) { + err = cub::DeviceRadixSort::SortPairsDescending( + workspace, workspace_size, keys_in + N * i, + keys_out + N * i, values_in + N * i, values_out + N * i, + N, 0, sizeof(float) * 8, stream); + cuda_check(err); + if (!keys_in) { + return workspace_size; + } + } + } + } + return workspace_size; +} + +__global__ void kern_arange(int* dst, uint32_t n, uint32_t mod) { + uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < n) { + dst[i] = i % mod; + } +} + +template +size_t get_sort_workspace(uint32_t M, uint32_t N, bool is_ascending) { + if (use_bitonic(M, N)) { + return 0; + } + return cub_sort_pairs(is_ascending, NULL, 0, NULL, NULL, NULL, NULL, + M, N, NULL); +} +} // anonymous namespace + +size_t argsort::get_fwd_workspace_in_bytes(uint32_t M, uint32_t N, DType dtype, + bool is_ascending, + bool iptr_src_given) { + size_t size = 0; + switch (dtype.enumv().ev) { +#define cb(ctype) \ + case DTypeTrait::enumv: \ + size = get_sort_workspace(M, N, is_ascending); \ + break; + ARGSORT_FOREACH_CTYPE(cb) +#undef cb + default: + megdnn_throw("argsort only supports float and int32"); + } + if (!iptr_src_given) { + size = DIVUP(size, sizeof(float)) * sizeof(float) + M * N * sizeof(int); + } + return size; +} + +template +void argsort::forward(const dtype* sptr, dtype* dptr, int* iptr, + 
void* workspace, uint32_t M, uint32_t N, + bool is_ascending, cudaStream_t stream, + const int* iptr_src) { + size_t wk_size = get_sort_workspace(M, N, is_ascending); + if (!iptr_src) { + int* ptr = reinterpret_cast(static_cast(workspace) + + DIVUP(wk_size, sizeof(float)) * + sizeof(float)); + kern_arange<<>>(ptr, M * N, N); + iptr_src = ptr; + } + + if (use_bitonic(M, N)) { + cuda_check(bitonic_sort(M, N, sptr, iptr_src, dptr, iptr, is_ascending, + stream)); + } else { + cub_sort_pairs(is_ascending, workspace, wk_size, sptr, dptr, iptr_src, + iptr, M, N, stream); + } +} + +namespace megdnn { +namespace cuda { +#define INST_FORWARD(dtype) \ + template void argsort::forward(const dtype*, dtype*, int*, void*, \ + uint32_t, uint32_t, bool, \ + cudaStream_t, const int*); +ARGSORT_FOREACH_CTYPE(INST_FORWARD) +#undef INST_FORWARD +} +} // namespace megdnn +// vim: ft=cuda syntax=cuda.doxygen + diff --git a/dnn/src/cuda/argsort/argsort.cuh b/dnn/src/cuda/argsort/argsort.cuh new file mode 100644 index 00000000..d5cc6e12 --- /dev/null +++ b/dnn/src/cuda/argsort/argsort.cuh @@ -0,0 +1,42 @@ +/** + * \file dnn/src/cuda/argsort/argsort.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include +#include +#include "megdnn/dtype.h" + +namespace megdnn { +namespace cuda { +namespace argsort { + +size_t get_fwd_workspace_in_bytes(uint32_t M, uint32_t N, DType dtype, + bool is_ascending, + bool iptr_src_given = false); + +/*! + * \param iptr_src pointer to indices; a range would be generated if it is null + */ +template +void forward(const dtype* sptr, dtype* dptr, int* iptr, void* workspace, + uint32_t M, uint32_t N, bool is_ascending, cudaStream_t stream, + const int* iptr_src = NULL); + +//! iterate over all supported data types +#define ARGSORT_FOREACH_CTYPE(cb) cb(float) cb(int32_t) + +} // namespace argsort +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/argsort/backward.cu b/dnn/src/cuda/argsort/backward.cu new file mode 100644 index 00000000..b8b3d3fa --- /dev/null +++ b/dnn/src/cuda/argsort/backward.cu @@ -0,0 +1,65 @@ +/** + * \file dnn/src/cuda/argsort/backward.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
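[editor note] get_fwd_workspace_in_bytes above follows the usual CUB pattern of calling the sort with null pointers to query its temporary-storage size, then appends room for the generated index matrix when the caller does not supply one; the bitonic path needs no CUB storage at all. A worked sketch of the resulting sizes; cub_bytes is a hypothetical stand-in for whatever the CUB query returns.

// Worked example of the argsort forward workspace layout used above:
//   [ CUB temp storage, padded to sizeof(float) ][ M * N int32 indices ]
// The second part exists only when the caller does not pass iptr_src.
#include <cstddef>
#include <cstdio>

static size_t align_up(size_t x, size_t a) { return (x + a - 1) / a * a; }

int main() {
    size_t M = 16, N = 100000;       // 16 rows of 100k keys -> radix sort path
    size_t cub_bytes = 123456;       // hypothetical size reported by CUB
    size_t ws = align_up(cub_bytes, sizeof(float)) + M * N * sizeof(int);
    std::printf("workspace = %zu bytes\n", ws);

    size_t small_N = 1024;           // N <= 2048 -> bitonic path, no CUB temp
    size_t ws_bitonic = 0 + M * small_N * sizeof(int);
    std::printf("bitonic-path workspace = %zu bytes\n", ws_bitonic);
    return 0;
}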
+ */ + +#include "./argsort.cuh" +#include "./backward.cuh" + +#include "src/cuda/utils.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace argsort; + +namespace { + +template +__global__ void backward_kernel(uint32_t dst_w, uint32_t src_w, + uint32_t src_size, T* dst, const T* src_data, + const int* src_idx) { + uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < src_size) { + uint32_t r = idx / src_w; + dst[r * dst_w + src_idx[idx]] = src_data[idx]; + } +} + +} // namespace + +template +void argsort::backward_proxy(uint32_t dst_h, uint32_t dst_w, uint32_t src_w, + T* dst, const T* src_data, const int* src_idx, + cudaStream_t stream) { + if (dst_w != src_w) { + cudaMemsetAsync(dst, 0, dst_h * dst_w * sizeof(T), stream); + } + + uint32_t src_size = dst_h * src_w; + backward_kernel<<>>( + dst_w, src_w, src_size, dst, src_data, src_idx); + after_kernel_launch(); +} + +namespace megdnn { +namespace cuda { +namespace argsort { + +#define INST(T) \ + template void backward_proxy(uint32_t dst_h, uint32_t dst_w, \ + uint32_t src_w, T* dst, const T* src_data, \ + const int* src_idx, cudaStream_t stream); +ARGSORT_FOREACH_CTYPE(INST) +#undef INST + +} // namespace argsort +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/argsort/backward.cuh b/dnn/src/cuda/argsort/backward.cuh new file mode 100644 index 00000000..c42db6b6 --- /dev/null +++ b/dnn/src/cuda/argsort/backward.cuh @@ -0,0 +1,29 @@ +/** + * \file dnn/src/cuda/argsort/backward.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include +#include + +namespace megdnn { +namespace cuda { +namespace argsort { + +template +void backward_proxy(uint32_t dst_h, uint32_t dst_w, uint32_t src_w, T* dst, + const T* src_data, const int* src_idx, cudaStream_t stream); + +} // namespace argsort +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/argsort/bitonic_sort.cu b/dnn/src/cuda/argsort/bitonic_sort.cu new file mode 100644 index 00000000..43dd2a51 --- /dev/null +++ b/dnn/src/cuda/argsort/bitonic_sort.cu @@ -0,0 +1,304 @@ +/** + * \file dnn/src/cuda/argsort/bitonic_sort.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./bitonic_sort.cuh" +#include "src/cuda/query_blocksize.cuh" + +#if __CUDACC_VER_MAJOR__ < 9 +#pragma message "warp sync disabled due to insufficient cuda version" +#define __syncwarp __syncthreads +#endif + +#include +#include + +using namespace megdnn; +using namespace cuda; + +namespace bitonic_sort_impl { + +//! load keys and init idx +template +__device__ __forceinline__ void safe_load0(T* dst, uint16_t* idx, const T* src, + uint32_t id, uint32_t size) { + dst[id] = id < size ? src[id] : CompareLess::template max(); + idx[id] = id; +} + +//! 
load values +template +__device__ __forceinline__ void safe_load1(T* dst, const T* src, uint32_t id, + uint32_t size) { + // broadcast last value to avoid out-of-bound values (for example, when + // input contains NaN) + dst[id] = src[min(id, size - 1)]; +} + +//! write keys +template +__device__ __forceinline__ void safe_write0(T* dst, const T* src, uint32_t id, + uint32_t size) { + if (id < size) { + dst[id] = src[id]; + } +} + +//! write values +template +__device__ __forceinline__ void safe_write1(T* dst, const T* src, + const uint16_t* remap, uint32_t id, + uint32_t size) { + if (id < size) { + dst[id] = src[remap[id]]; + } +} + +struct SyncWarp { + static __device__ __forceinline__ void s() { __syncwarp(); } +}; +struct SyncBlock { + static __device__ __forceinline__ void s() { __syncthreads(); } +}; + +template +struct NumTrait; +template <> +struct NumTrait { + static __device__ __forceinline__ float max() { return INFINITY; } + static __device__ __forceinline__ float min() { return -INFINITY; } +}; + +template <> +struct NumTrait { + static __device__ __forceinline__ int32_t max() { return INT_MAX; } + static __device__ __forceinline__ int32_t min() { return INT_MIN; } +}; + +struct LessThan { + template + static __device__ __forceinline__ bool cmp(Key k0, Value v0, Key k1, + Value v1) { + return k0 < k1 | ((k0 == k1) & (v0 < v1)); + } + + template + static __device__ __forceinline__ T max() { + return NumTrait::max(); + } +}; + +struct GreaterThan { + template + static __device__ __forceinline__ bool cmp(Key k0, Value v0, Key k1, + Value v1) { + return k0 > k1 | ((k0 == k1) & (v0 < v1)); + } + + template + static __device__ __forceinline__ T max() { + return NumTrait::min(); + } +}; + +template +union KVUnion { + Key key; + Value value; +}; + +template +static int get_shmem(int block_size, void* = NULL) { + return (sizeof(KVUnion) + sizeof(uint16_t)) * block_size * 4; +} + +/*! + * \brief batched bitonic sort (M, N) for small N + * + * launch configuration: + * grid(X) + * block(N/4, Y) + * + * where N / 4 == 1 << nr_th_log2 + */ +template +static __global__ void kern(uint32_t batch, uint32_t length, const Key* key_inp, + const Value* value_inp, Key* key_out, + Value* value_out) { + const uint32_t nr_th = 1 << nr_th_log2; + + // 24KiB shared memory for 4-byte keys for 1024 threads + extern __shared__ uint8_t smem_storage[]; + uint16_t* idx_storage = reinterpret_cast(smem_storage); + KVUnion* keys_storage = reinterpret_cast*>( + idx_storage + blockDim.y * (nr_th * 4)); + + uint32_t cur_batch = blockIdx.x * blockDim.y + threadIdx.y, + off = cur_batch * length; + key_inp += off; + key_out += off; + value_inp += off; + value_out += off; + + uint32_t storage_offset = threadIdx.y * (nr_th * 4); + uint16_t* values = idx_storage + storage_offset; + Key* keys = reinterpret_cast(keys_storage + storage_offset); + uint32_t tid0 = threadIdx.x, tid1 = tid0 + nr_th, + cur_length = cur_batch < batch ? 
length : 0; + safe_load0(keys, values, key_inp, tid0, cur_length); + safe_load0(keys, values, key_inp, tid0 + nr_th, cur_length); + safe_load0(keys, values, key_inp, tid0 + nr_th * 2, + cur_length); + safe_load0(keys, values, key_inp, tid0 + nr_th * 3, + cur_length); + + Sync::s(); + +#define WORK(_idx, _asc) \ + do { \ + uint32_t _id0 = (_idx), _id1 = _id0 + step; \ + Key _k0 = keys[_id0], _k1 = keys[_id1]; \ + uint16_t _v0 = values[_id0], _v1 = values[_id1]; \ + if (CompareLess::cmp(_k0, _v0, _k1, _v1) != _asc) { \ + keys[_id0] = _k1; \ + keys[_id1] = _k0; \ + values[_id0] = _v1; \ + values[_id1] = _v0; \ + } \ + } while (0) + +#pragma unroll + for (uint32_t slen_log = 0; slen_log <= (nr_th_log2 + 1); ++slen_log) { + // log2 of half of current bitonic sequence (i.e. length of its + // monotonic part) + uint32_t asc0 = !((tid0 >> slen_log) & 1), + asc1 = !((tid1 >> slen_log) & 1); +#pragma unroll + for (uint32_t j = 0; j <= slen_log; ++j) { + uint32_t step = 1 << (slen_log - j), xmask = step - 1, + ymask = ~xmask; + WORK((tid0 & xmask) + ((tid0 & ymask) << 1), asc0); + WORK((tid1 & xmask) + ((tid1 & ymask) << 1), asc1); + Sync::s(); + } + } + +#undef WORK + + if (cur_batch < batch) { + safe_write0(key_out, keys, tid0, length); + safe_write0(key_out, keys, tid0 + nr_th, length); + safe_write0(key_out, keys, tid0 + nr_th * 2, length); + safe_write0(key_out, keys, tid0 + nr_th * 3, length); + + // permute values according to sorted indices + Value* copied_values = reinterpret_cast(keys); + safe_load1(copied_values, value_inp, tid0, cur_length); + safe_load1(copied_values, value_inp, tid0 + nr_th, cur_length); + safe_load1(copied_values, value_inp, tid0 + nr_th * 2, cur_length); + safe_load1(copied_values, value_inp, tid0 + nr_th * 3, cur_length); + Sync::s(); + + safe_write1(value_out, copied_values, values, tid0, length); + safe_write1(value_out, copied_values, values, tid0 + nr_th, length); + safe_write1(value_out, copied_values, values, tid0 + nr_th * 2, length); + safe_write1(value_out, copied_values, values, tid0 + nr_th * 3, length); + } +} + +} // namespace bitonic_sort_impl + +template +cudaError_t cuda::bitonic_sort(uint32_t batch, uint32_t length, + const Key* key_inp, const Value* value_inp, + Key* key_out, Value* value_out, bool ascending, + cudaStream_t stream) { + using namespace bitonic_sort_impl; + if (length == 1) { + if (key_inp != key_out) { + cudaMemcpyAsync(key_out, key_inp, sizeof(Key) * batch, + cudaMemcpyDeviceToDevice, stream); + } + if (value_inp != value_out) { + cudaMemcpyAsync(value_out, value_inp, sizeof(Value) * batch, + cudaMemcpyDeviceToDevice, stream); + } + return cudaGetLastError(); + } + + void (*kptr)(uint32_t, uint32_t, const Key*, const Value*, Key*, Value*) = + NULL; + uint32_t l4 = (length + 3) / 4; + dim3 block; + +#define chk(s) \ + do { \ + if (!kptr && l4 <= (1 << s)) { \ + block.x = 1 << s; \ + if ((1 << s) <= 32) { \ + if (ascending) { \ + kptr = kern; \ + } else { \ + kptr = kern; \ + } \ + } else { \ + if (ascending) { \ + kptr = kern; \ + } else { \ + kptr = kern; \ + } \ + } \ + } \ + } while (0) + + chk(0); + chk(1); + chk(2); + chk(3); + chk(4); + chk(5); + chk(6); + chk(7); + chk(8); + chk(9); + + if (!kptr) { + return cudaErrorInvalidConfiguration; + } + + int suggested_block_size = + query_launch_config_for_kernel(reinterpret_cast(kptr), + get_shmem) + .block_size; + block.y = std::max(suggested_block_size / block.x, 1); + int shmem = get_shmem(block.y * block.x); + kptr<<<(batch - 1) / block.y + 1, block, shmem, stream>>>( + batch, 
length, key_inp, value_inp, key_out, value_out); + return cudaGetLastError(); +} + +namespace megdnn { +namespace cuda { + +#define INST(k, v) \ + template cudaError_t bitonic_sort(uint32_t, uint32_t, const k*, \ + const v*, k*, v*, bool, \ + cudaStream_t) + +INST(float, int); +INST(int32_t, int); +#undef INST + +} // namespace megdnn +} // namespace megdnn + +// vim: ft=cuda syntax=cuda.doxygen + diff --git a/dnn/src/cuda/argsort/bitonic_sort.cuh b/dnn/src/cuda/argsort/bitonic_sort.cuh new file mode 100644 index 00000000..bc85bd1f --- /dev/null +++ b/dnn/src/cuda/argsort/bitonic_sort.cuh @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/argsort/bitonic_sort.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include + +namespace megdnn { +namespace cuda { + +const uint32_t BITONIC_SORT_MAX_LENGTH = 2048; +// cub radix sort seems to be faster with lengths > 2048 + +/*! + * \brief bitonic sort for k/v pairs + * + * Requires \p length no larger than 4 times of cuda thread num. \p key_inp + * and \p key_out can be identical, and so are \p value_inp and \p value_out. + */ +template +cudaError_t bitonic_sort(uint32_t batch, uint32_t length, const Key* key_inp, + const Value* value_inp, Key* key_out, Value* value_out, + bool ascending, cudaStream_t stream); + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/argsort/opr_impl.cpp b/dnn/src/cuda/argsort/opr_impl.cpp new file mode 100644 index 00000000..5a56db21 --- /dev/null +++ b/dnn/src/cuda/argsort/opr_impl.cpp @@ -0,0 +1,79 @@ +/** + * \file dnn/src/cuda/argsort/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
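[editor note] For reference, the compare-exchange network the kernel above runs in shared memory is the standard bitonic sorting network. The textbook host-side form below sorts a power-of-two-length array and is only meant to illustrate the sequence-length/step structure of the kernel's loops; it is not the kernel's exact indexing, and the kernel pads with +inf/-inf via NumTrait instead of requiring a power-of-two input.

// Textbook bitonic sorting network on the host, shown only to illustrate the
// structure of the compare-exchange loops in the kernel above.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

void bitonic_sort_ref(std::vector<int>& a, bool ascending) {
    const size_t n = a.size();                     // must be a power of two
    for (size_t k = 2; k <= n; k <<= 1)            // bitonic sequence length
        for (size_t j = k >> 1; j > 0; j >>= 1)    // compare-exchange distance
            for (size_t i = 0; i < n; ++i) {
                size_t l = i ^ j;
                if (l <= i)
                    continue;                      // handle each pair once
                bool asc_here = (((i & k) == 0) == ascending);
                if (asc_here ? (a[i] > a[l]) : (a[i] < a[l]))
                    std::swap(a[i], a[l]);
            }
}

int main() {
    std::vector<int> v{7, 3, 9, 1, 6, 6, 2, 8};    // length 8 = 2^3
    bitonic_sort_ref(v, /*ascending=*/true);
    for (int x : v) std::printf("%d ", x);         // 1 2 3 6 6 7 8 9
    std::printf("\n");
    return 0;
}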
+ */ + +#include "./opr_impl.h" +#include "./argsort.cuh" +#include "./backward.cuh" + +#include "src/common/utils.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +void ArgsortForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_tensor_out indices, + _megdnn_workspace workspace) { + check_exec(src.layout, dst.layout, indices.layout, workspace.size); + auto M = src.layout.shape[0], N = src.layout.shape[1]; + auto iptr = indices.ptr(); + auto wptr = static_cast(workspace.raw_ptr); + bool is_ascending = (param().order == Order::ASCENDING); + auto stream = cuda_stream(this->handle()); + switch (src.layout.dtype.enumv()) { +#define cb(t) \ + case DTypeTrait::enumv: \ + argsort::forward(src.ptr(), dst.ptr(), iptr, wptr, M, N, \ + is_ascending, stream); \ + break; + ARGSORT_FOREACH_CTYPE(cb); +#undef cb + default: + megdnn_throw(ssprintf("unsupported argsort dtype on cuda: %s", + src.layout.dtype.name())); + } +} + +size_t ArgsortForwardImpl::get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout&, + const TensorLayout&) { + megdnn_assert(src.ndim == 2, "invalid src layout: %s", + src.to_string().c_str()); + auto M = src.shape[0], N = src.shape[1]; + auto&& dtype = src.dtype; + megdnn_assert(std::max(M, N) <= + static_cast(std::numeric_limits::max())); + return argsort::get_fwd_workspace_in_bytes( + M, N, dtype, param().order == Param::Order::ASCENDING); +} + +void ArgsortBackwardImpl::exec(_megdnn_tensor_in diff, + _megdnn_tensor_in indices, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + check_exec(diff.layout, indices.layout, grad.layout, workspace.size); + auto stream = cuda_stream(this->handle()); + switch (diff.layout.dtype.enumv()) { +#define cb(t) \ + case DTypeTrait::enumv: \ + argsort::backward_proxy(grad.layout[0], grad.layout[1], \ + diff.layout[1], grad.ptr(), diff.ptr(), \ + indices.ptr(), stream); \ + break; + ARGSORT_FOREACH_CTYPE(cb); +#undef cb + default: + megdnn_throw(ssprintf("unsupported argsort dtype on cuda: %s", + diff.layout.dtype.name())); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/argsort/opr_impl.h b/dnn/src/cuda/argsort/opr_impl.h new file mode 100644 index 00000000..fbd58e5a --- /dev/null +++ b/dnn/src/cuda/argsort/opr_impl.h @@ -0,0 +1,47 @@ +/** + * \file dnn/src/cuda/argsort/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
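[editor note] ArgsortBackward above scatters each gradient value back to the column its key originally came from; when grad is wider than diff (the top-k style case) the destination is zero-filled first. A small CPU reference of that scatter, illustrative only.

// CPU reference for the argsort backward scatter used above:
//   grad[r][indices[r][c]] = diff[r][c], with grad zero-filled when
//   grad_w != diff_w (i.e. only the diff_w kept entries carry gradient).
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

void argsort_backward_ref(size_t rows, size_t grad_w, size_t diff_w,
                          float* grad, const float* diff, const int* indices) {
    if (grad_w != diff_w)
        std::fill(grad, grad + rows * grad_w, 0.f);
    for (size_t r = 0; r < rows; ++r)
        for (size_t c = 0; c < diff_w; ++c)
            grad[r * grad_w + indices[r * diff_w + c]] = diff[r * diff_w + c];
}

int main() {
    // One row, 5 original columns, gradient only for the 3 kept sorted entries.
    std::vector<float> diff{0.1f, 0.2f, 0.3f};
    std::vector<int> indices{4, 0, 2};             // original positions
    std::vector<float> grad(1 * 5, -1.f);
    argsort_backward_ref(1, 5, 3, grad.data(), diff.data(), indices.data());
    for (float g : grad) std::printf("%g ", g);    // 0.2 0 0.3 0 0.1
    std::printf("\n");
    return 0;
}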
+ */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class ArgsortForwardImpl final: public ArgsortForward { + public: + using ArgsortForward::ArgsortForward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_tensor_out indices, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst, + const TensorLayout &indices) override; +}; + +class ArgsortBackwardImpl final: public ArgsortBackward { + public: + using ArgsortBackward::ArgsortBackward; + void exec(_megdnn_tensor_in diff, + _megdnn_tensor_in indices, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &, + const TensorLayout &) override { + return 0; + } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/batch_conv_bias/algo.cpp b/dnn/src/cuda/batch_conv_bias/algo.cpp new file mode 100644 index 00000000..705ff270 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/algo.cpp @@ -0,0 +1,65 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +BatchConvBiasForwardImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&int8_nchw4_gemm_dotprod); + all_algos.push_back(&int8_nchw4_implicit_gemm_dotprod); +} + +BatchConvBiasForwardImpl::AlgoPack BatchConvBiasForwardImpl::sm_algo_pack; + +BatchConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( + BatchConvBiasForwardImpl* o, const TensorLayout& src, + const TensorLayout& filter, const TensorLayout& bias, + const TensorLayout& z, const TensorLayout& dst) + : opr{o}, + src_layout{src}, + filter_layout{filter}, + bias_layout{bias}, + z_layout{z}, + dst_layout{dst} {} + +BatchConvBiasForwardImpl::AlgoBase::ExecArgs::ExecArgs( + BatchConvBiasForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z, + _megdnn_tensor_out dst, _megdnn_workspace workspace) + : SizeArgs(opr, src.layout, filter.layout, bias.layout, z.layout, + dst.layout), + src_tensor{&src}, + filter_tensor{&filter}, + bias_tensor{&bias}, + z_tensor{&z}, + dst_tensor{&dst}, + workspace{workspace} {} + +std::string BatchConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const { + auto&& param = opr->param(); + MEGDNN_MARK_USED_VAR(param); + return megdnn_mangle(ssprintf( + "src=%s, filter=%s, bias=%s, z=%s, dst=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, " + "dtype=(%s(src),%s(flt),%s(bias),%s(z))->(%s(dst))", + src_layout.to_string().c_str(), filter_layout.to_string().c_str(), + bias_layout.to_string().c_str(), z_layout.to_string().c_str(), + dst_layout.to_string().c_str(), param.pad_h, param.pad_w, + param.stride_h, param.stride_w, param.dilate_h, param.dilate_w, + static_cast(param.mode), src_layout.dtype.name(), + filter_layout.dtype.name(), bias_layout.dtype.name(), + z_layout.dtype.name(), dst_layout.dtype.name())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/batch_conv_bias/algo.h b/dnn/src/cuda/batch_conv_bias/algo.h 
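[editor note] The AlgoPack above registers the two int8 NCHW4 algorithms, and the AlgoBase interface declared in algo.h just below provides the usual megdnn helpers (is_available_wk, is_available_reproducible, check_workspace) for choosing between them. A hedged sketch of how such a pack is typically queried, using made-up Algo/Args stand-ins rather than the real classes; the actual selection heuristic is not part of this hunk.

// Hedged sketch of the "algo pack" selection pattern: walk the registered
// algorithms and take the first one that is available and fits the workspace
// limit. Algo and Args here are stand-ins, not the real megdnn classes.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Args {};  // stand-in for AlgoBase::SizeArgs

struct Algo {
    const char* name;
    bool available;
    size_t workspace;
    bool is_available_wk(const Args&, size_t limit) const {
        return available && workspace <= limit;
    }
};

const Algo* pick_first_available(const std::vector<const Algo*>& all,
                                 const Args& args, size_t workspace_limit) {
    for (const Algo* a : all)
        if (a->is_available_wk(args, workspace_limit))
            return a;
    return nullptr;
}

int main() {
    Algo gemm{"GEMM_DOTPROD", /*available=*/false, 0};
    Algo implicit_gemm{"IMPLICIT_GEMM_PRECOMP", true, 4096};
    std::vector<const Algo*> all{&gemm, &implicit_gemm};
    const Algo* chosen = pick_first_available(all, Args{}, 1 << 20);
    std::printf("chosen: %s\n", chosen ? chosen->name : "none");
    return 0;
}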
new file mode 100644 index 00000000..6b2668ef --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/algo.h @@ -0,0 +1,123 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +#include "src/cuda/batch_conv_bias/opr_impl.h" +#include "src/cuda/handle.h" + +namespace megdnn { +namespace cuda { + +class BatchConvBiasForwardImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + BatchConvBiasForwardImpl* opr; + TensorLayout src_layout, filter_layout, bias_layout, z_layout, + dst_layout; + + std::string to_string() const; + SizeArgs(BatchConvBiasForwardImpl* opr, const TensorLayout& src, + const TensorLayout& filter, const TensorLayout& bias, + const TensorLayout& z, const TensorLayout& dst); + }; + struct ExecArgs : public SizeArgs { + const TensorND *src_tensor, *filter_tensor, *bias_tensor, *z_tensor, + *dst_tensor; + Workspace workspace; + + ExecArgs(BatchConvBiasForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in filter, _megdnn_tensor_in bias, + _megdnn_tensor_in z, _megdnn_tensor_out dst, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "batch conv bias fwd algo %s: required workspace %zu " + "bytes, got %zu", + name(), req, workspace.size); + return *this; + } +}; + +class BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdGemm final + : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { + return "BATCH_CONV_BIAS_INT8_NCHW4_GEMM_DOTPROD"; + } +}; + +class BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemmPrecomp final + : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { + return "BATCH_CONV_BIAS_INT8_NCHW4_IMPLICIT_GEMM_PRECOMP_DOTPROD"; + } + +private: + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; +}; + +class BatchConvBiasForwardImpl::AlgoPack { + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + 
AlgoPack(); + + AlgoInt8NCHW4DotProdGemm int8_nchw4_gemm_dotprod; + AlgoInt8NCHW4DotProdImplicitGemmPrecomp int8_nchw4_implicit_gemm_dotprod; + + std::vector all_algos; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/batch_conv_bias/batch_conv_bias.cuh b/dnn/src/cuda/batch_conv_bias/batch_conv_bias.cuh new file mode 100644 index 00000000..d2c3b5da --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/batch_conv_bias.cuh @@ -0,0 +1,79 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/batch_conv_bias.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace batch_conv_bias { + +struct LaunchConfig { + int nr_threads_x; + int nr_threads_y; + int nr_threads_z; + int nr_blocks_x; + int nr_blocks_y; + int nr_blocks_z; + int smem_size_in_bytes; + LaunchConfig() + : nr_threads_x{1}, + nr_threads_y{1}, + nr_threads_z{1}, + nr_blocks_x{1}, + nr_blocks_y{1}, + nr_blocks_z{1}, + smem_size_in_bytes{1} {} +}; + +template +void do_batch_conv_bias_int8_gemm_ncdiv4hw4(const int8_t* d_src, + const int8_t* d_filter, + BiasVisitor bias, Epilogue epilogue, + const convolution::ConvParam& param, + float alpha, float beta, + cudaStream_t stream); + +template +void do_batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, int* workspace, + BiasVisitor bias, Epilogue epilogue, + const convolution::ConvParam& param, float alpha, float beta, + cudaStream_t stream); + +} // namespace batch_conv_bias +} // namespace cuda +} // namespace megdnn + +#define MARK_USED_VAR \ + MEGDNN_MARK_USED_VAR(n + ci + hi + wi + co + fh + fw + ho + wo + ph + pw + \ + sh + sw + dh + dw); + +#define UNPACK_BATCH_CONV_PARAMETER(_param) \ + size_t ph = _param.pad_h, pw = _param.pad_w; \ + size_t sh = _param.stride_h, sw = _param.stride_w; \ + size_t dh = _param.dilate_h, dw = _param.dilate_w; + +#define UNPACK_BATCH_CONV_BIAS_NCHW4_PARAM(_src, _filter, _dst, _param) \ + using Format = param::BatchConvBias::Format; \ + megdnn_assert(_param.format == Format::NCHW4); \ + size_t n = (_src)[0], ci = (_src)[1] * 4, hi = (_src)[2], wi = (_src)[3]; \ + size_t fh = (_filter)[3], fw = (_filter)[4]; \ + size_t co = (_dst)[1] * 4, ho = (_dst)[2], wo = (_dst)[3]; \ + UNPACK_BATCH_CONV_PARAMETER(_param); \ + MARK_USED_VAR + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/batch_conv_bias/gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/batch_conv_bias/gemm_int8_nchw4_dp4a.cpp new file mode 100644 index 00000000..11b8cb2c --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/gemm_int8_nchw4_dp4a.cpp @@ -0,0 +1,183 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/gemm_int8_nchw4_dp4a.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
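[editor note] UNPACK_BATCH_CONV_BIAS_NCHW4_PARAM above recovers the logical convolution sizes from NCHW4 layouts, where the channel dimension is stored as C/4 packs of 4. A concrete example of that unpacking in plain C++; the exact filter pack ordering shown here only follows the indices the macro reads (fh = filter[3], fw = filter[4]) and is otherwise illustrative.

// Worked example of the NCHW4 unpacking done by the macro above: the stored
// layouts used here are src {N, CI/4, HI, WI, 4}, a batched filter whose
// dims 3 and 4 are FH and FW, and dst {N, CO/4, HO, WO, 4}.
#include <cstddef>
#include <cstdio>

int main() {
    size_t src[]    = {32, 16, 28, 28, 4};       // N=32, CI=16*4=64
    size_t filter[] = {32, 32, 16, 1, 1, 4, 4};  // CO derived from dst, FH=FW=1
    size_t dst[]    = {32, 32, 28, 28, 4};       // HO=WO=28 for 1x1, stride 1

    size_t n = src[0], ci = src[1] * 4, hi = src[2], wi = src[3];
    size_t fh = filter[3], fw = filter[4];
    size_t co = dst[1] * 4, ho = dst[2], wo = dst[3];

    std::printf("n=%zu ci=%zu hi=%zu wi=%zu fh=%zu fw=%zu co=%zu ho=%zu wo=%zu\n",
                n, ci, hi, wi, fh, fw, co, ho, wo);
    return 0;
}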
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/utils.h" +#include "src/cuda/batch_conv_bias/algo.h" +#include "src/cuda/batch_conv_bias/batch_conv_bias.cuh" +#include "src/cuda/batch_conv_bias/opr_impl.h" +#include "src/cuda/conv_bias/helper.h" +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" +#include "src/cuda/convolution_helper/layout.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; +namespace { +template +void dispatch_kernel(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, Epilogue epilogue, + const ConvParam& param, float alpha, float beta, + cudaStream_t stream) { + void (*kern_wrapper)(const int8_t*, const int8_t*, BiasVisitor, Epilogue, + const ConvParam&, float, float, cudaStream_t); + using namespace batch_conv_bias; + int img_pixels = param.ho * param.wo; + if (img_pixels % 4 == 0) { + kern_wrapper = + do_batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128; + } else { + kern_wrapper = + do_batch_conv_bias_int8_gemm_ncdiv4hw4; + } + megdnn_assert(kern_wrapper != nullptr); + return kern_wrapper(d_src, d_filter, bias_visitor, epilogue, param, alpha, + beta, stream); +} + +template +void dispatch_nonlinear_mode(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, const int8_t* d_z, + int8_t* d_dst, const ConvParam& param, float alpha, + float beta, float gamma, float scale, + cudaStream_t stream, + param::BatchConvBias::NonlineMode nonlinear_mode) { + using NonlineMode = megdnn::param_enumv::BatchConvBias::NonlineMode; + Layout layout; + layout.init(param.n, param.co, param.ho, param.wo); + using namespace batch_conv_bias; +#define DISPATCH_CONV_INT8_EPILOGUE(_act_op) \ + do { \ + IConvEpilogue<_act_op> epilogue{d_dst, \ + d_z, \ + layout.batch_stride, \ + layout.channel_stride / 4, \ + layout.height_stride, \ + layout.width_stride, \ + gamma, \ + _act_op{scale, 1.f / scale}}; \ + dispatch_kernel>( \ + d_src, d_filter, bias_visitor, epilogue, param, alpha, beta, \ + stream); \ + return; \ + } while (0) +#define cb(_nonline_mode) \ + if (static_cast(nonlinear_mode) == NonlineMode::_nonline_mode) { \ + DISPATCH_CONV_INT8_EPILOGUE(Activation); \ + } + MEGDNN_FOREACH_NONLINE_MODE(cb); + megdnn_throw("unsupported nonlinear mode for conv bias operator"); +#undef cb +#undef DISPATCH_CONV_INT8_EPILOGUE +} + +#define INST(_visitor) \ + template void dispatch_nonlinear_mode<_visitor>( \ + const int8_t* d_src, const int8_t* d_filter, \ + _visitor bias_visitor, const int8_t* d_z, int8_t* d_dst, \ + const ConvParam& param, float alpha, float beta, float gamma, \ + float scale, cudaStream_t stream, \ + param::BatchConvBias::NonlineMode nonlinear_mode); + +INST(PerChannelBiasVisitor); + +#undef INST +} // namespace + +bool BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdGemm::is_available( + const SizeArgs& args) const { + if (args.bias_layout.ndim <= 0) + return false; + + using Param = param::BatchConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + bool available = true; + auto&& param = args.opr->param(); + if (!conv_bias::check_bias_share_in_channel(args.bias_layout, param.format)) + return false; + if (param.format != 
Format::NCHW4) + return false; + UNPACK_BATCH_CONV_BIAS_NCHW4_PARAM(args.src_layout, args.filter_layout, + args.dst_layout, param); + // TODO support group conv + available &= param.sparse == Sparse::DENSE; + // mode must be cross correlation + available &= param.mode == Mode::CROSS_CORRELATION; + // check data type + auto src_dtype = args.src_layout.dtype, + filter_dtype = args.filter_layout.dtype, + bias_dtype = args.bias_layout.dtype, dst_dtype = args.dst_layout.dtype; + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // TODO: support dialtion + available &= dh == 1 && dw == 1; + // can be treat as gemm + available &= + (fh == 1 && sh == 1 && fw == 1 && sw == 1 && ph == 0 && pw == 0); + // only support sm_61 or later, platform should have fast native int8 + // support + available &= is_compute_capability_required(6, 1); + return available; +} + +size_t +BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdGemm::get_workspace_in_bytes( + const SizeArgs& /* args */) const { + return 0; +} + +void BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdGemm::exec( + const ExecArgs& args) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + UNPACK_BATCH_CONV_BIAS_NCHW4_PARAM(args.src_layout, args.filter_layout, + args.dst_layout, param); + auto&& stream = cuda_stream(args.opr->handle()); + + ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + float src_scale = args.src_layout.dtype.param().scale, + filter_scale = + args.filter_layout.dtype.param().scale, + bias_scale = + args.bias_layout.dtype.param().scale, + dst_scale = args.dst_layout.dtype.param().scale; + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale; + int8_t* z_dev_ptr = nullptr; + float gamma = 1.f; + if (args.z_layout.ndim > 0) { + z_dev_ptr = args.z_tensor->compatible_ptr(); + float z_scale = args.z_layout.dtype.param().scale; + gamma = z_scale / dst_scale; + } + PerChannelBiasVisitor bias_visitor; + bias_visitor.bias = args.bias_tensor->compatible_ptr(); + dispatch_nonlinear_mode( + args.src_tensor->compatible_ptr(), + args.filter_tensor->compatible_ptr(), bias_visitor, + z_dev_ptr, args.dst_tensor->compatible_ptr(), kern_param, + alpha, beta, gamma, dst_scale, stream, param.nonlineMode); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/batch_conv_bias/helper.cu b/dnn/src/cuda/batch_conv_bias/helper.cu new file mode 100644 index 00000000..12890318 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/helper.cu @@ -0,0 +1,58 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/helper.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
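[editor note] The exec above folds the quantization scales into the kernel coefficients: alpha = src_scale * filter_scale / dst_scale rescales the int32 dot-product, beta = bias_scale / dst_scale rescales the int32 bias, and gamma = z_scale / dst_scale rescales the residual z input. A worked numeric example with illustrative scales.

// Worked example of the requantization coefficients computed in exec() above.
#include <cstdio>

int main() {
    float s_src = 0.02f, s_filter = 0.01f, s_bias = 0.0002f;
    float s_dst = 0.05f, s_z = 0.04f;

    float alpha = s_src * s_filter / s_dst;  // scales the int32 dot-product
    float beta  = s_bias / s_dst;            // scales the int32 bias
    float gamma = s_z / s_dst;               // scales the int8 residual input

    std::printf("alpha=%g beta=%g gamma=%g\n", alpha, beta, gamma);
    // alpha=0.004 beta=0.004 gamma=0.8
    return 0;
}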
+ */
+#include "src/cuda/batch_conv_bias/helper.cuh"
+#include "src/cuda/query_blocksize.cuh"
+#include "src/cuda/utils.cuh"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace batch_conv_bias;
+
+namespace {
+__global__ void kern_compute_offset(int* __restrict__ offset,
+                                    const convolution::ConvParam param) {
+    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
+    const int img_pixels = param.ho * param.wo;
+    const int img_pixels_ru128 = DIVUP(img_pixels, 128) * 128;
+    const int filter_pixels = param.fh * param.fw;
+    if (tid >= img_pixels_ru128 * filter_pixels)
+        return;
+    const int filter_idx = tid / img_pixels;
+    const int img_idx = tid - img_pixels * filter_idx;
+    const int oh = img_idx / param.wo;
+    const int ow = img_idx - oh * param.wo;
+    const int kh = filter_idx / param.fw;
+    const int kw = filter_idx - param.fw * kh;
+    const int ih = param.sh * oh - param.ph + kh;
+    const int iw = param.sw * ow - param.pw + kw;
+    if (img_idx < img_pixels && ih >= 0 && ih < param.hi && iw >= 0 &&
+        iw < param.wi) {
+        offset[tid] = ih * param.wi + iw;
+    } else {
+        offset[tid] = -1;
+    }
+}
+} // namespace
+
+void megdnn::cuda::batch_conv_bias::compute_offset(
+        int* offset, const convolution::ConvParam& param, cudaStream_t stream) {
+    uint32_t nr_threads = query_blocksize_for_kernel(
+            reinterpret_cast<const void*>(kern_compute_offset));
+    uint32_t img_pixels = param.ho * param.wo;
+    uint32_t img_pixels_ru128 = DIVUP(img_pixels, 128) * 128;
+    uint32_t filter_pixels = param.fh * param.fw;
+    uint32_t vthreads = img_pixels_ru128 * filter_pixels;
+    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
+    kern_compute_offset<<<nr_blocks, nr_threads, 0, stream>>>(offset, param);
+    after_kernel_launch();
+}
+
+// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/dnn/src/cuda/batch_conv_bias/helper.cuh b/dnn/src/cuda/batch_conv_bias/helper.cuh
new file mode 100644
index 00000000..2b245eb2
--- /dev/null
+++ b/dnn/src/cuda/batch_conv_bias/helper.cuh
@@ -0,0 +1,23 @@
+/**
+ * \file dnn/src/cuda/batch_conv_bias/helper.cuh
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+#include "src/cuda/convolution_helper/parameter.cuh"
+
+namespace megdnn {
+namespace cuda {
+namespace batch_conv_bias {
+void compute_offset(int* offset, const convolution::ConvParam& param,
+                    cudaStream_t stream);
+} // namespace batch_conv_bias
+} // namespace cuda
+} // namespace megdnn
+
+// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/dnn/src/cuda/batch_conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/batch_conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
new file mode 100644
index 00000000..3daa220d
--- /dev/null
+++ b/dnn/src/cuda/batch_conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
@@ -0,0 +1,168 @@
+/**
+ * \file dnn/src/cuda/batch_conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
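Editorial note on kern_compute_offset above: for every (filter tap, output pixel) pair it stores the linear input offset ih * wi + iw, or -1 when the tap falls into padding. The same table can be reproduced on the host; the sketch below is only a reference, assumes the filter-major layout used by the kernel, and leaves the 128-pixel alignment tail at -1 to keep the indexing of the meaningful entries clear. ConvGeom is a hypothetical stand-in for ConvParam.

#include <cstdint>
#include <vector>

// Host-side reference for the offset table; field names mirror ConvParam but
// this struct is only illustrative.
struct ConvGeom {
    int hi, wi;  // input spatial size
    int ho, wo;  // output spatial size
    int fh, fw;  // filter size
    int ph, pw;  // padding
    int sh, sw;  // stride
};

std::vector<int> compute_offset_ref(const ConvGeom& p) {
    const int img_pixels = p.ho * p.wo;
    const int img_pixels_ru128 = (img_pixels + 127) / 128 * 128;  // DIVUP(x, 128) * 128
    const int filter_pixels = p.fh * p.fw;
    std::vector<int> offset(size_t(img_pixels_ru128) * filter_pixels, -1);
    for (int filter_idx = 0; filter_idx < filter_pixels; ++filter_idx) {
        const int kh = filter_idx / p.fw, kw = filter_idx % p.fw;
        for (int img_idx = 0; img_idx < img_pixels; ++img_idx) {
            const int oh = img_idx / p.wo, ow = img_idx % p.wo;
            const int ih = p.sh * oh - p.ph + kh;
            const int iw = p.sw * ow - p.pw + kw;
            if (ih >= 0 && ih < p.hi && iw >= 0 && iw < p.wi)
                offset[filter_idx * img_pixels + img_idx] = ih * p.wi + iw;
            // taps that land in padding keep the -1 sentinel
        }
    }
    return offset;
}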
+ */ + +#include "megdnn/oprs/general.h" +#include "src/common/utils.h" +#include "src/cuda/batch_conv_bias/algo.h" +#include "src/cuda/batch_conv_bias/batch_conv_bias.cuh" +#include "src/cuda/batch_conv_bias/opr_impl.h" +#include "src/cuda/conv_bias/helper.h" +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" +#include "src/cuda/convolution_helper/layout.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; +namespace { +template +void dispatch_nonlinear_mode(const int8_t* d_src, const int8_t* d_filter, + int* d_workspace, BiasVisitor bias_visitor, + const int8_t* d_z, int8_t* d_dst, + const ConvParam& param, float alpha, float beta, + float gamma, float scale, cudaStream_t stream, + param::BatchConvBias::NonlineMode nonlinear_mode) { + using NonlineMode = megdnn::param_enumv::BatchConvBias::NonlineMode; + Layout layout; + layout.init(param.n, param.co, param.ho, param.wo); + using namespace batch_conv_bias; +#define DISPATCH_CONV_INT8_EPILOGUE(_act_op) \ + do { \ + IConvEpilogue<_act_op> epilogue{d_dst, \ + d_z, \ + layout.batch_stride, \ + layout.channel_stride / 4, \ + layout.height_stride, \ + layout.width_stride, \ + gamma, \ + _act_op{scale, 1.f / scale}}; \ + do_batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4< \ + BiasVisitor, IConvEpilogue<_act_op>>( \ + d_src, d_filter, d_workspace, bias_visitor, epilogue, param, \ + alpha, beta, stream); \ + return; \ + } while (0) +#define cb(_nonline_mode) \ + if (static_cast(nonlinear_mode) == NonlineMode::_nonline_mode) { \ + DISPATCH_CONV_INT8_EPILOGUE(Activation); \ + } + MEGDNN_FOREACH_NONLINE_MODE(cb); + megdnn_throw("unsupported nonlinear mode for conv bias operator"); +#undef cb +#undef DISPATCH_CONV_INT8_EPILOGUE +} + +#define INST(_visitor) \ + template void dispatch_nonlinear_mode<_visitor>( \ + const int8_t* d_src, const int8_t* d_filter, int* workspace, \ + _visitor bias_visitor, const int8_t* d_z, int8_t* d_dst, \ + const ConvParam& param, float alpha, float beta, float gamma, \ + float scale, cudaStream_t stream, \ + param::BatchConvBias::NonlineMode nonlinear_mode); + +INST(PerChannelBiasVisitor); + +#undef INST +} // namespace + +bool BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemmPrecomp:: + is_available(const SizeArgs& args) const { + if (args.bias_layout.ndim <= 0) + return false; + + using Param = param::BatchConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + bool available = true; + auto&& param = args.opr->param(); + if (!conv_bias::check_bias_share_in_channel(args.bias_layout, param.format)) + return false; + if (param.format != Format::NCHW4) + return false; + UNPACK_BATCH_CONV_BIAS_NCHW4_PARAM(args.src_layout, args.filter_layout, + args.dst_layout, param); + // TODO support group conv + available &= param.sparse == Sparse::DENSE; + // mode must be cross correlation + available &= param.mode == Mode::CROSS_CORRELATION; + // check data type + auto src_dtype = args.src_layout.dtype, + filter_dtype = args.filter_layout.dtype, + bias_dtype = args.bias_layout.dtype, dst_dtype = args.dst_layout.dtype; + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // TODO: support dialtion + available &= dh == 1 && dw == 1; + // TODO: 
support fh fw != 1 + available &= fh == 1 && fw == 1; + // only support sm_61 or later, platform should have fast native int8 + // support + available &= is_compute_capability_required(6, 1); + return available; +} + +size_t BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemmPrecomp:: + get_workspace_in_bytes(const SizeArgs& args) const { + auto&& param = args.opr->param(); + UNPACK_BATCH_CONV_BIAS_NCHW4_PARAM(args.src_layout, args.filter_layout, + args.dst_layout, param); + size_t img_pixels = ho * wo; + size_t img_pixels_ru128 = round_up(img_pixels, 128_z); + size_t filter_pixels = fh * fw; + return sizeof(int) * filter_pixels * img_pixels_ru128; +} + +void BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemmPrecomp::exec( + const ExecArgs& args) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + UNPACK_BATCH_CONV_BIAS_NCHW4_PARAM(args.src_layout, args.filter_layout, + args.dst_layout, param); + auto&& stream = cuda_stream(args.opr->handle()); + + ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + float src_scale = args.src_layout.dtype.param().scale, + filter_scale = + args.filter_layout.dtype.param().scale, + bias_scale = + args.bias_layout.dtype.param().scale, + dst_scale = args.dst_layout.dtype.param().scale; + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale; + int8_t* z_dev_ptr = nullptr; + float gamma = 1.f; + if (args.z_layout.ndim > 0) { + z_dev_ptr = args.z_tensor->compatible_ptr(); + float z_scale = args.z_layout.dtype.param().scale; + gamma = z_scale / dst_scale; + } + PerChannelBiasVisitor bias_visitor; + bias_visitor.bias = args.bias_tensor->compatible_ptr(); + dispatch_nonlinear_mode( + args.src_tensor->compatible_ptr(), + args.filter_tensor->compatible_ptr(), + reinterpret_cast(args.workspace.raw_ptr), bias_visitor, + z_dev_ptr, args.dst_tensor->compatible_ptr(), kern_param, + alpha, beta, gamma, dst_scale, stream, param.nonlineMode); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_gemm_ncdiv4hw4.cuinl b/dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_gemm_ncdiv4hw4.cuinl new file mode 100644 index 00000000..caae4cdb --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_gemm_ncdiv4hw4.cuinl @@ -0,0 +1,194 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_gemm_ncdiv4hw4.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/batch_conv_bias/batch_conv_bias.cuh" +#include "src/cuda/batch_conv_bias/helper.cuh" +#include "src/cuda/convolution_helper/activation.cuh" +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +//! 
dispatch macros +#define DISPATCH_mxnxk_CHK(hw_, co_, ci_, tx_, ty_) \ + if (img_pixels >= hw_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = ((co_) + (ty_)-1) / (ty_); \ + static constexpr int reg_n = 1; \ + static constexpr int reg_width = ((hw_) + (tx_)-1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IBatchConvTrait_f1x1s1x1< \ + true, int, typename LdgTypeTrait::ldg_type, \ + RegBlockConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = DIVUP( \ + img_pixels, ConvTrait::DataTileCount:: \ + block_tile_out_height_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.nr_blocks_z = param.n; \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_mxnxk_CHK_small(hw_, co_, ci_, tx_, ty_) \ + if (img_pixels >= hw_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = 4; \ + static constexpr int reg_n = 1; \ + static constexpr int reg_width = ((hw_) + (tx_)-1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IBatchConvTrait_f1x1s1x1< \ + true, int, typename LdgTypeTrait::ldg_type, \ + RegBlockConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = DIVUP( \ + img_pixels, ConvTrait::DataTileCount:: \ + block_tile_out_height_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.nr_blocks_z = param.n; \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_mxn_CHK(hw_, co_) \ + DISPATCH_mxnxk_CHK(hw_, co_, 4, 16, 8); \ + DISPATCH_mxnxk_CHK(hw_, co_, 8, 16, 8); \ + DISPATCH_mxnxk_CHK(hw_, co_, 16, 16, 8); + +#define DISPATCH_mxn_CHK_small(hw_) \ + DISPATCH_mxnxk_CHK_small(hw_, 4, 4, 16, 8); \ + DISPATCH_mxnxk_CHK_small(hw_, 4, 8, 16, 8); \ + DISPATCH_mxnxk_CHK_small(hw_, 4, 16, 16, 8); + +#define DISPATCH_n_CHK(hw_) \ + DISPATCH_mxn_CHK_small(hw_); \ + DISPATCH_mxn_CHK(hw_, 32); \ + DISPATCH_mxn_CHK(hw_, 64); \ + DISPATCH_mxn_CHK(hw_, 128); +#define DISPATCH_m_CHK(co_) \ + DISPATCH_mxn_CHK(1, co_); \ + DISPATCH_mxn_CHK(32, co_); \ + DISPATCH_mxn_CHK(64, co_); \ + DISPATCH_mxn_CHK(128, co_); +namespace { +template +struct LdgTypeTrait; + +template <> +struct LdgTypeTrait<4> { + using ldg_type = int32_t; +}; + +template <> +struct LdgTypeTrait<8> { + using ldg_type = int2; +}; + +template <> +struct LdgTypeTrait<16> { + using ldg_type = int4; +}; + +template +void (*get_kern(const ConvParam& param, + batch_conv_bias::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + 
Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + const int img_pixels = param.ho * param.wo; + + if (img_pixels >= 256 && param.co >= 256) { + DISPATCH_mxnxk_CHK(128, 128, 4, 16, 8); + DISPATCH_mxnxk_CHK(128, 128, 8, 16, 8); + DISPATCH_mxnxk_CHK(128, 128, 16, 16, 8); + } else if (img_pixels >= 256) { + DISPATCH_n_CHK(128); + } else if (param.co >= 256) { + DISPATCH_m_CHK(128); + } else { + DISPATCH_n_CHK(1); + DISPATCH_n_CHK(32); + DISPATCH_n_CHK(64); + DISPATCH_n_CHK(128); + } + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias"); + return kern; +} +} // namespace + +template +void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_gemm_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, float beta, + cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + batch_conv_bias::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128.cuinl b/dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128.cuinl new file mode 100644 index 00000000..91b668a0 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128.cuinl @@ -0,0 +1,260 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/batch_conv_bias/batch_conv_bias.cuh" +#include "src/cuda/batch_conv_bias/helper.cuh" +#include "src/cuda/convolution_helper/activation.cuh" +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +//! 
dispatch macros +#define DISPATCH_mxnxk_CHK(hw_, co_, ci_, tx_, ty_) \ + if (img_pixels >= hw_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = ((co_) + (ty_)-1) / (ty_); \ + static constexpr int reg_n = 1; \ + static constexpr int reg_width = ((hw_) + (tx_)-1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IBatchConvTrait_f1x1s1x1< \ + true, int4, typename LdgTypeTrait::ldg_type, \ + RegBlockConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = DIVUP( \ + img_pixels, ConvTrait::DataTileCount:: \ + block_tile_out_height_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.nr_blocks_z = param.n; \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_mxnxk_CHK_small(hw_, co_, ci_, tx_, ty_) \ + if (img_pixels >= hw_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = 4; \ + static constexpr int reg_n = 1; \ + static constexpr int reg_width = ((hw_) + (tx_)-1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IBatchConvTrait_f1x1s1x1< \ + true, int4, typename LdgTypeTrait::ldg_type, \ + RegBlockConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = DIVUP( \ + img_pixels, ConvTrait::DataTileCount:: \ + block_tile_out_height_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.nr_blocks_z = param.n; \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_mxn_CHK(hw_, co_) \ + DISPATCH_mxnxk_CHK(hw_, co_, 4, 16, 8); \ + DISPATCH_mxnxk_CHK(hw_, co_, 8, 16, 8); \ + DISPATCH_mxnxk_CHK(hw_, co_, 16, 16, 8); + +#define DISPATCH_mxn_CHK_small(hw_) \ + DISPATCH_mxnxk_CHK_small(hw_, 4, 4, 16, 8); \ + DISPATCH_mxnxk_CHK_small(hw_, 4, 8, 16, 8); \ + DISPATCH_mxnxk_CHK_small(hw_, 4, 16, 16, 8); + +#define DISPATCH_n_CHK(hw_) \ + DISPATCH_mxn_CHK_small(hw_); \ + DISPATCH_mxn_CHK(hw_, 32); \ + DISPATCH_mxn_CHK(hw_, 64); \ + DISPATCH_mxn_CHK(hw_, 128); +#define DISPATCH_m_CHK(co_) \ + DISPATCH_mxn_CHK(1, co_); \ + DISPATCH_mxn_CHK(32, co_); \ + DISPATCH_mxn_CHK(64, co_); \ + DISPATCH_mxn_CHK(128, co_); + +#define DISPATCH_mxnxk_NOCHK(hw_, co_, ci_, tx_, ty_) \ + if (img_pixels % hw_ == 0) { \ + if (param.co % co_ == 0) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = (co_) / (ty_); \ + static constexpr int reg_n = 1; \ + static constexpr int reg_width = (hw_) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig 
\ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IBatchConvTrait_f1x1s1x1< \ + false, int4, typename LdgTypeTrait::ldg_type, \ + RegBlockConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = DIVUP( \ + img_pixels, ConvTrait::DataTileCount:: \ + block_tile_out_height_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.nr_blocks_z = param.n; \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_mxn_NOCHK(hw_, co_) \ + DISPATCH_mxnxk_NOCHK(hw_, co_, 4, 16, 8); \ + DISPATCH_mxnxk_NOCHK(hw_, co_, 8, 16, 8); \ + DISPATCH_mxnxk_NOCHK(hw_, co_, 16, 16, 8) +#define DISPATCH_n_NOCHK(hw_) \ + DISPATCH_mxn_NOCHK(hw_, 32); \ + DISPATCH_mxn_NOCHK(hw_, 64); \ + DISPATCH_mxn_NOCHK(hw_, 128); +#define DISPATCH_m_NOCHK(co_) \ + DISPATCH_mxn_NOCHK(32, co_); \ + DISPATCH_mxn_NOCHK(64, co_); \ + DISPATCH_mxn_NOCHK(128, co_); +namespace { +template +struct LdgTypeTrait; + +template <> +struct LdgTypeTrait<4> { + using ldg_type = int32_t; +}; + +template <> +struct LdgTypeTrait<8> { + using ldg_type = int2; +}; + +template <> +struct LdgTypeTrait<16> { + using ldg_type = int4; +}; + +template +void (*get_kern(const ConvParam& param, + batch_conv_bias::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + const int img_pixels = param.ho * param.wo; + + if (img_pixels >= 256 && param.co >= 256) { + if (img_pixels % 128 == 0 && param.co % 128 == 0) { + DISPATCH_mxnxk_NOCHK(128, 128, 4, 16, 8); + DISPATCH_mxnxk_NOCHK(128, 128, 8, 16, 8); + DISPATCH_mxnxk_NOCHK(128, 128, 16, 16, 8); + } else { + DISPATCH_mxnxk_CHK(128, 128, 4, 16, 8); + DISPATCH_mxnxk_CHK(128, 128, 8, 16, 8); + DISPATCH_mxnxk_CHK(128, 128, 16, 16, 8); + } + } else if (img_pixels >= 256) { + if (img_pixels % 128 == 0 && param.co % 32 == 0) { + DISPATCH_n_NOCHK(128); + } else { + DISPATCH_n_CHK(128); + } + } else if (param.co >= 256) { + if (img_pixels % 32 == 0 && param.co % 128 == 0) { + DISPATCH_m_NOCHK(128); + } else { + DISPATCH_m_CHK(128); + } + } else { + DISPATCH_n_CHK(1); + DISPATCH_n_CHK(32); + DISPATCH_n_CHK(64); + DISPATCH_n_CHK(128); + DISPATCH_n_NOCHK(32); + DISPATCH_n_NOCHK(64); + DISPATCH_n_NOCHK(128); + } + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias"); + return kern; +} +} // namespace + +template +void megdnn::cuda::batch_conv_bias:: + do_batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + batch_conv_bias::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + 
smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4.cuinl b/dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4.cuinl new file mode 100644 index 00000000..02c97f1d --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4.cuinl @@ -0,0 +1,198 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/batch_conv_bias/batch_conv_bias.cuh" +#include "src/cuda/batch_conv_bias/helper.cuh" +#include "src/cuda/convolution_helper/activation.cuh" +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +//! 
dispatch macros +#define DISPATCH_mxnxk_CHK(hw_, co_, ci_, tx_, ty_) \ + if (img_pixels >= hw_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = ((co_) + (ty_)-1) / (ty_); \ + static constexpr int reg_n = 1; \ + static constexpr int reg_width = ((hw_) + (tx_)-1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IBatchConvTrait::ldg_type, \ + RegBlockConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel_precomp_offset< \ + ConvTrait, BiasVisitor, Epilogue>; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = DIVUP( \ + img_pixels, ConvTrait::DataTileCount:: \ + block_tile_out_height_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.nr_blocks_z = param.n; \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_mxnxk_CHK_small(hw_, co_, ci_, tx_, ty_) \ + if (img_pixels >= hw_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = 4; \ + static constexpr int reg_n = 1; \ + static constexpr int reg_width = ((hw_) + (tx_)-1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IBatchConvTrait::ldg_type, \ + RegBlockConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel_precomp_offset< \ + ConvTrait, BiasVisitor, Epilogue>; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = DIVUP( \ + img_pixels, ConvTrait::DataTileCount:: \ + block_tile_out_height_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.nr_blocks_z = param.n; \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_mxn_CHK(hw_, co_) \ + DISPATCH_mxnxk_CHK(hw_, co_, 4, 16, 8); \ + DISPATCH_mxnxk_CHK(hw_, co_, 8, 16, 8); \ + DISPATCH_mxnxk_CHK(hw_, co_, 16, 16, 8); +#define DISPATCH_mxn_CHK_small(hw_) \ + DISPATCH_mxnxk_CHK_small(hw_, 4, 4, 16, 8); \ + DISPATCH_mxnxk_CHK_small(hw_, 4, 8, 16, 8); \ + DISPATCH_mxnxk_CHK_small(hw_, 4, 16, 16, 8); +#define DISPATCH_n_CHK(hw_) \ + DISPATCH_mxn_CHK_small(hw_); \ + DISPATCH_mxn_CHK(hw_, 32); \ + DISPATCH_mxn_CHK(hw_, 64); \ + DISPATCH_mxn_CHK(hw_, 128); +#define DISPATCH_m_CHK(co_) \ + DISPATCH_mxn_CHK(1, co_); \ + DISPATCH_mxn_CHK(32, co_); \ + DISPATCH_mxn_CHK(64, co_); \ + DISPATCH_mxn_CHK(128, co_); +namespace { +template +struct LdgTypeTrait; + +template <> +struct LdgTypeTrait<4> { + using ldg_type = int32_t; +}; + +template <> +struct LdgTypeTrait<8> { + using ldg_type = int2; +}; + +template <> +struct LdgTypeTrait<16> { + using ldg_type = int4; +}; + +template +void (*get_kern(const ConvParam& param, + batch_conv_bias::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, + const 
int* __restrict__ offset, BiasVisitor, Epilogue, ConvParam, float, + float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + const int* __restrict__, BiasVisitor, Epilogue, ConvParam, + float, float); + kern = nullptr; + const int img_pixels = param.ho * param.wo; + if (img_pixels >= 256 && param.co >= 256) { + DISPATCH_mxnxk_CHK(128, 128, 4, 16, 8); + DISPATCH_mxnxk_CHK(128, 128, 8, 16, 8); + DISPATCH_mxnxk_CHK(128, 128, 16, 16, 8); + } else if (img_pixels >= 256) { + DISPATCH_n_CHK(128); + } else if (param.co >= 256) { + DISPATCH_m_CHK(128); + } else { + DISPATCH_n_CHK(1); + DISPATCH_n_CHK(32); + DISPATCH_n_CHK(64); + DISPATCH_n_CHK(128); + } + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias"); + return kern; +} +} // namespace + +template +void megdnn::cuda::batch_conv_bias:: + do_batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, int* workspace, + BiasVisitor bias, Epilogue epilogue, const ConvParam& param, + float alpha, float beta, cudaStream_t stream) { + compute_offset(workspace, param, stream); + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + const int* __restrict__, BiasVisitor, Epilogue, ConvParam, + float, float); + batch_conv_bias::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, workspace, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_hswish.cu b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_hswish.cu new file mode 100644 index 00000000..a8fc2574 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
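Editorial note tying the precomputed-offset pieces together: the launcher above first fills an int table of size filter_pixels x round_up(img_pixels, 128) in the workspace (matching get_workspace_in_bytes earlier), then derives the grid from the block tile sizes: x over output pixels, y over output channels, z over the batch. A small bookkeeping sketch follows; divup, precomp_workspace_bytes and the block_tile_* parameters are illustrative placeholders, the real tile extents come from ConvTrait and the DISPATCH_* macros.

#include <cstddef>
#include <cstdint>

constexpr uint32_t divup(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

// Workspace holding the precomputed offsets: one int per (filter tap, padded pixel).
size_t precomp_workspace_bytes(uint32_t ho, uint32_t wo, uint32_t fh, uint32_t fw) {
    uint32_t img_pixels = ho * wo;
    uint32_t img_pixels_ru128 = divup(img_pixels, 128) * 128;
    return sizeof(int) * size_t(fh) * fw * img_pixels_ru128;
}

// Grid layout used by the launcher: x over output pixels, y over output
// channels, z over the batch dimension.
struct GridDim { uint32_t x, y, z; };
GridDim make_grid(uint32_t img_pixels, uint32_t co, uint32_t n,
                  uint32_t block_tile_pixels, uint32_t block_tile_channels) {
    return {divup(img_pixels, block_tile_pixels),
            divup(co, block_tile_channels), n};
}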
+ */ +// generated by gen_batch_cuda_conv_bias_kern_impls.py +#include "../batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128.cuinl" + +template void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_id.cu b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_id.cu new file mode 100644 index 00000000..e1f8ab0c --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_batch_cuda_conv_bias_kern_impls.py +#include "../batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128.cuinl" + +template void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_relu.cu b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_relu.cu new file mode 100644 index 00000000..57b72571 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_batch_cuda_conv_bias_kern_impls.py +#include "../batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128.cuinl" + +template void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_gemm_ncdiv4hw4_ldg_128>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_hswish.cu b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_hswish.cu new file mode 100644 index 00000000..c30bc345 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_batch_cuda_conv_bias_kern_impls.py +#include "../batch_conv_bias_int8_gemm_ncdiv4hw4.cuinl" + +template void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_gemm_ncdiv4hw4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_id.cu b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_id.cu new file mode 100644 index 00000000..2aee0207 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_batch_cuda_conv_bias_kern_impls.py +#include "../batch_conv_bias_int8_gemm_ncdiv4hw4.cuinl" + +template void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_gemm_ncdiv4hw4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_relu.cu b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_relu.cu new file mode 100644 index 00000000..6ced3ae9 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_gemm_ncdiv4hw4_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_batch_cuda_conv_bias_kern_impls.py +#include "../batch_conv_bias_int8_gemm_ncdiv4hw4.cuinl" + +template void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_gemm_ncdiv4hw4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_hswish.cu b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_hswish.cu new file mode 100644 index 00000000..6207fd63 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_hswish.cu @@ -0,0 +1,24 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_batch_cuda_conv_bias_kern_impls.py +#include "../batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4.cuinl" + +template void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4>>( + const int8_t* d_src, + const int8_t* d_filter, +int* d_workspace, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_id.cu b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_id.cu new file mode 100644 index 00000000..026640fa --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_id.cu @@ -0,0 +1,24 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_batch_cuda_conv_bias_kern_impls.py +#include "../batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4.cuinl" + +template void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4>>( + const int8_t* d_src, + const int8_t* d_filter, +int* d_workspace, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_relu.cu b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_relu.cu new file mode 100644 index 00000000..c5dfd679 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_relu.cu @@ -0,0 +1,24 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/int8/kimpl/batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_batch_cuda_conv_bias_kern_impls.py +#include "../batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4.cuinl" + +template void megdnn::cuda::batch_conv_bias::do_batch_conv_bias_int8_implicit_gemm_precomp_ncdiv4hw4>>( + const int8_t* d_src, + const int8_t* d_filter, +int* d_workspace, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/batch_conv_bias/opr_impl.cpp b/dnn/src/cuda/batch_conv_bias/opr_impl.cpp new file mode 100644 index 00000000..dedc52b9 --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/opr_impl.cpp @@ -0,0 +1,76 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
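Editorial note on the kimpl files above: they are emitted by gen_batch_cuda_conv_bias_kern_impls.py, and each one includes the shared .cuinl template body and pins down a single (bias visitor, activation) combination with an explicit instantiation, so every variant lives in its own small translation unit. A generic sketch of that pattern with hypothetical names (run_kernel, PerChannelBias, ReluEpilogue are stand-ins, not the real symbols):

#include <cstdint>

// Hypothetical stand-ins for the real visitor/epilogue types.
struct PerChannelBias {};
struct ReluEpilogue {};

// Shared template body; in the real tree this role is played by a .cuinl file
// that every generated kimpl .cu includes.
template <typename BiasVisitor, typename Epilogue>
void run_kernel(const int8_t* /*src*/, const int8_t* /*filter*/,
                BiasVisitor /*bias*/, Epilogue /*epilogue*/) {
    // ... kernel selection and launch would go here ...
}

// Each generated file then contains exactly one explicit instantiation like
// this, so the template is compiled once per (visitor, activation) pair:
template void run_kernel<PerChannelBias, ReluEpilogue>(
        const int8_t*, const int8_t*, PerChannelBias, ReluEpilogue);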
+ */ +#include "src/cuda/batch_conv_bias/opr_impl.h" +#include "src/common/algo_chooser.h" +#include "src/cuda/batch_conv_bias/algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +/* ============== BatchConvBiasForwardImpl ============== */ +BatchConvBiasForwardImpl::Algorithm* +BatchConvBiasForwardImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, src, filter, bias, z, dst); + if (sm_algo_pack.int8_nchw4_gemm_dotprod.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.int8_nchw4_gemm_dotprod; + } + if (sm_algo_pack.int8_nchw4_implicit_gemm_dotprod.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.int8_nchw4_implicit_gemm_dotprod; + } + megdnn_throw(megdnn_mangle( + ssprintf("no %s batch conv bias algorithm with args(%s) and " + "workspace limit (%zu bytes)", + reproducible ? "reproducible" : "usable", + args.to_string().c_str(), workspace_limit_in_bytes))); +} + +std::vector +BatchConvBiasForwardImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst) { + AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; + return megdnn::get_all_algorithms(args); +} + +size_t BatchConvBiasForwardImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst) { + AlgoBase::SizeArgs args(this, src, filter, bias, z, dst); + return get_algorithm(this, src, filter, bias, z, dst) + ->get_workspace_in_bytes(args); +} + +void BatchConvBiasForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_in bias, _megdnn_tensor_in z, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace); + auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, + z.layout, dst.layout); + algo->check_workspace(args, workspace).exec(args); +} + +const char* BatchConvBiasForwardImpl::get_algorithm_set_name() const { + return "CUDA_BATCH_CONV_BIAS"; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/batch_conv_bias/opr_impl.h b/dnn/src/cuda/batch_conv_bias/opr_impl.h new file mode 100644 index 00000000..4ad3faaa --- /dev/null +++ b/dnn/src/cuda/batch_conv_bias/opr_impl.h @@ -0,0 +1,57 @@ +/** + * \file dnn/src/cuda/batch_conv_bias/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
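Editorial note on get_algorithm_heuristic above: it walks a fixed preference order and returns the first algorithm that is available, reproducible when required, and within the workspace limit, throwing otherwise. The same control flow, reduced to a stand-alone sketch with a hypothetical Algo interface (not the MegDNN AlgoBase classes):

#include <cstddef>
#include <stdexcept>
#include <vector>

// Minimal stand-in for the algorithm interface.
struct Algo {
    virtual ~Algo() = default;
    virtual bool is_available() const = 0;
    virtual bool is_reproducible() const = 0;
    virtual size_t workspace_bytes() const = 0;
};

// Mirrors the heuristic: the first candidate usable under the limit wins.
Algo* pick_heuristic(const std::vector<Algo*>& preference_order,
                     size_t workspace_limit, bool require_reproducible) {
    for (Algo* a : preference_order) {
        if (!a->is_available()) continue;
        if (require_reproducible && !a->is_reproducible()) continue;
        if (a->workspace_bytes() > workspace_limit) continue;
        return a;
    }
    throw std::runtime_error("no usable algorithm under the given workspace limit");
}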
+ */ +#pragma once +#include "megdnn/oprs.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +class BatchConvBiasForwardImpl : public BatchConvBiasForward { +public: + using BatchConvBiasForward::BatchConvBiasForward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_in bias, _megdnn_tensor_in z, + _megdnn_tensor_out dst, _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst) override; + std::vector get_all_algorithms( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoInt8NCHW4DotProdGemm; + class AlgoInt8NCHW4DotProdImplicitGemmPrecomp; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/batch_normalization/opr_impl.cpp b/dnn/src/cuda/batch_normalization/opr_impl.cpp new file mode 100644 index 00000000..faa054ae --- /dev/null +++ b/dnn/src/cuda/batch_normalization/opr_impl.cpp @@ -0,0 +1,120 @@ +/** + * \file dnn/src/cuda/batch_normalization/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
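Editorial note on the operator interface declared above: callers are expected to query get_workspace_in_bytes for the chosen layouts, allocate that much scratch memory, and pass it to exec. A schematic sketch of that workspace protocol only; OprLike is a placeholder concept rather than a MegDNN type, and a host vector stands in for the device allocation the CUDA backend would use.

#include <cstddef>
#include <cstdint>
#include <vector>

// Schematic driver for any operator following the workspace protocol above.
template <typename OprLike, typename... Layouts>
std::vector<uint8_t> prepare_workspace(OprLike& opr, const Layouts&... layouts) {
    // 1) ask the operator how much scratch memory the selected algorithm needs
    size_t bytes = opr.get_workspace_in_bytes(layouts...);
    // 2) the caller owns the allocation; the returned buffer is then handed to
    //    opr.exec(...) together with the input/output tensors
    return std::vector<uint8_t>(bytes);
}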
+ */ +#include "./opr_impl.h" + +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +namespace batch_normalization { + +void BNTensorDescHolder::setup(const TensorLayout& x, + const ParamDim& param_dim) { + TensorShape xy_shape(x); + + switch (param_dim) { + case ParamDim::DIM_11HW: + // xy: N, C, H, W --> (N*C), 1, H, W + xy_shape.shape[0] = xy_shape.shape[0] * xy_shape.shape[1]; + xy_shape.shape[1] = 1; + bn_mode = CUDNN_BATCHNORM_PER_ACTIVATION; + break; + case ParamDim::DIM_1CHW: + bn_mode = CUDNN_BATCHNORM_PER_ACTIVATION; + break; + case ParamDim::DIM_1C11: + bn_mode = CUDNN_BATCHNORM_SPATIAL; + break; + default: + megdnn_throw(megdnn_mangle( + "Unknown param dim type of batch normalization.")); + } + xy_desc.set(TensorLayout(xy_shape, x.dtype)); + param_desc.set(xy_desc.desc, bn_mode); +} + +} // namespace batch_normalization + +void BNForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in bn_scale, + _megdnn_tensor_in bn_bias, _megdnn_tensor_out mean, + _megdnn_tensor_out variance, + _megdnn_tensor_out batch_mean, + _megdnn_tensor_out batch_inv_variance, + _megdnn_tensor_out dst, _megdnn_workspace workspace) { + check_exec(src.layout, bn_scale.layout, bn_bias.layout, mean.layout, + variance.layout, batch_mean.layout, batch_inv_variance.layout, + dst.layout, workspace.size); + auto handle = cudnn_handle(this->handle()); + m_tensor_desc.setup(src.layout, m_param.param_dim); + + float alpha = 1.0f, beta = 0.0f; + switch (m_param.fwd_mode) { + case param::BN::FwdMode::TRAINING: + cudnn_check(cudnnBatchNormalizationForwardTraining( + handle, m_tensor_desc.bn_mode, + &alpha, &beta, + m_tensor_desc.xy_desc.desc, // xDesc + src.raw_ptr, // x + m_tensor_desc.xy_desc.desc, // yDesc + dst.raw_ptr, // y + m_tensor_desc.param_desc.desc, // bnScaleBiasMeanVarDesc + bn_scale.raw_ptr, bn_bias.raw_ptr, m_param.avg_factor, + mean.raw_ptr, variance.raw_ptr, m_param.epsilon, + batch_mean.raw_ptr, batch_inv_variance.raw_ptr)); + + break; + case param::BN::FwdMode::INFERENCE: + cudnn_check(cudnnBatchNormalizationForwardInference( + handle, m_tensor_desc.bn_mode, + &alpha, &beta, + m_tensor_desc.xy_desc.desc, src.raw_ptr, + m_tensor_desc.xy_desc.desc, dst.raw_ptr, + m_tensor_desc.param_desc.desc, bn_scale.raw_ptr, + bn_bias.raw_ptr, mean.raw_ptr, variance.raw_ptr, + m_param.epsilon)); + break; + default: + megdnn_throw(megdnn_mangle( + "Unknown forward mode type of batch normalization.")); + } +} + +void BNBackwardImpl::exec(_megdnn_tensor_in x, _megdnn_tensor_in dy, + _megdnn_tensor_in saved_batch_mean, + _megdnn_tensor_in saved_batch_inv_variance, + _megdnn_tensor_in bn_scale, + _megdnn_tensor_out d_bn_scale, + _megdnn_tensor_out d_bn_bias, + _megdnn_tensor_out dx, _megdnn_workspace workspace) { + check_exec(x.layout, dy.layout, saved_batch_mean.layout, + saved_batch_inv_variance.layout, bn_scale.layout, + d_bn_scale.layout, d_bn_bias.layout, dx.layout, + workspace.size); + auto handle = cudnn_handle(this->handle()); + m_tensor_desc.setup(x.layout, m_param.param_dim); + + float alpha = 1.0, beta = 0.0; + cudnn_check(cudnnBatchNormalizationBackward( + handle, m_tensor_desc.bn_mode, + &alpha, &beta, &alpha, &beta, + m_tensor_desc.xy_desc.desc, x.raw_ptr, + m_tensor_desc.xy_desc.desc, dy.raw_ptr, + m_tensor_desc.xy_desc.desc, dx.raw_ptr, + m_tensor_desc.param_desc.desc, bn_scale.raw_ptr, + d_bn_scale.raw_ptr, d_bn_bias.raw_ptr, m_param.epsilon, + saved_batch_mean.raw_ptr, saved_batch_inv_variance.raw_ptr)); +} + +} // namespace cuda +} // namespace megdnn + +// vim: 
syntax=cpp.doxygen diff --git a/dnn/src/cuda/batch_normalization/opr_impl.h b/dnn/src/cuda/batch_normalization/opr_impl.h new file mode 100644 index 00000000..fc6e37ac --- /dev/null +++ b/dnn/src/cuda/batch_normalization/opr_impl.h @@ -0,0 +1,79 @@ +/** + * \file dnn/src/cuda/batch_normalization/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +#include "src/cuda/cudnn_wrapper.h" + +namespace megdnn { +namespace cuda { + +namespace batch_normalization { + +struct BNTensorDescHolder { + using ParamDim = param::BN::ParamDim; + + TensorDesc xy_desc; + BNParamDesc param_desc; + cudnnBatchNormMode_t bn_mode; + + void setup(const TensorLayout& x, const ParamDim& param_dim); +}; + +} // namespace batch_normalization + +class BNForwardImpl final : public BNForward { +public: + using BNForward::BNForward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in bn_scale, + _megdnn_tensor_in bn_bias, _megdnn_tensor_out mean, + _megdnn_tensor_out variance, _megdnn_tensor_out batch_mean, + _megdnn_tensor_out batch_inv_variance, _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, + const TensorLayout&, + const TensorLayout&) override { + return 0; + } + +private: + batch_normalization::BNTensorDescHolder m_tensor_desc; +}; + +class BNBackwardImpl final : public BNBackward { +public: + using BNBackward::BNBackward; + void exec(_megdnn_tensor_in x, _megdnn_tensor_in dy, + _megdnn_tensor_in saved_batch_mean, + _megdnn_tensor_in saved_batch_inv_variance, + _megdnn_tensor_in bn_scale, _megdnn_tensor_out d_bn_scale, + _megdnn_tensor_out d_bn_bias, _megdnn_tensor_out dx, + _megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, + const TensorLayout&, + const TensorLayout&) override { + return 0; + } + +private: + batch_normalization::BNTensorDescHolder m_tensor_desc; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/batched_matrix_mul/algo.cpp b/dnn/src/cuda/batched_matrix_mul/algo.cpp new file mode 100644 index 00000000..da8d396b --- /dev/null +++ b/dnn/src/cuda/batched_matrix_mul/algo.cpp @@ -0,0 +1,63 @@ +/** + * \file dnn/src/cuda/batched_matrix_mul/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
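Editorial note on BNTensorDescHolder::setup above: DIM_1C11 maps to CUDNN_BATCHNORM_SPATIAL, while DIM_11HW and DIM_1CHW map to CUDNN_BATCHNORM_PER_ACTIVATION, with DIM_11HW additionally folding N and C into one axis so the per-activation statistics are shared across channels. A plain C++ sketch of just that shape bookkeeping; the enums and Shape4 alias below are local stand-ins for cudnnBatchNormMode_t, param::BN::ParamDim and TensorShape.

#include <array>
#include <cstddef>
#include <stdexcept>

enum class BnMode { PerActivation, Spatial };
enum class ParamDim { DIM_11HW, DIM_1CHW, DIM_1C11 };
using Shape4 = std::array<size_t, 4>;  // N, C, H, W

BnMode setup_xy_shape(ParamDim param_dim, Shape4& xy) {
    switch (param_dim) {
        case ParamDim::DIM_11HW:
            // statistics are per (H, W) location, shared over channels:
            // view N,C,H,W as (N*C),1,H,W before handing it to cuDNN
            xy[0] *= xy[1];
            xy[1] = 1;
            return BnMode::PerActivation;
        case ParamDim::DIM_1CHW:
            return BnMode::PerActivation;
        case ParamDim::DIM_1C11:
            // classic spatial BN: one mean/variance per channel
            return BnMode::Spatial;
    }
    throw std::invalid_argument("unknown param dim");
}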
+ */ +#include "./algo.h" +#include +#include "src/cuda/utils.h" +#if CUDA_VERSION >= 10010 +#include +#endif + +using namespace megdnn; +using namespace cuda; + +BatchedMatrixMulForwardImpl::AlgoPack BatchedMatrixMulForwardImpl::sm_algo_pack; + +std::string BatchedMatrixMulForwardImpl::AlgoBase::SizeArgs::to_string() const { + auto&& param = opr->param(); + size_t m = layout_a.shape[0], n = layout_b.shape[1], + k = layout_a.shape[param.transposeA ? 0 : 1]; + MEGDNN_MARK_USED_VAR(m); + MEGDNN_MARK_USED_VAR(n); + MEGDNN_MARK_USED_VAR(k); + return megdnn_mangle(ssprintf( + "A={%zux%zu},B={%zux%zu},C={%zux%zu},Transpose A=%d,Transpose " + "B=%d,ldA=%zu,ldB=%zu,ldC=%zu", + m, k, k, n, m, n, param.transposeA, param.transposeB, + layout_a.stride[0], layout_b.stride[0], layout_c.stride[0])); +} + +BatchedMatrixMulForwardImpl::AlgoBase::SizeArgs::SizeArgs( + BatchedMatrixMulForwardImpl* o, const TensorLayout& A, + const TensorLayout& B, const TensorLayout& C) + : opr(o), layout_a(A), layout_b(B), layout_c(C){}; + +BatchedMatrixMulForwardImpl::AlgoBase::ExecArgs::ExecArgs( + BatchedMatrixMulForwardImpl* o, _megdnn_tensor_in A, + _megdnn_tensor_in B, _megdnn_tensor_in C, _megdnn_workspace workspace) + : SizeArgs(o, A.layout, B.layout, C.layout), + tensor_a{A}, + tensor_b{B}, + tensor_c{C}, + workspace{workspace} {} + +BatchedMatrixMulForwardImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&cublas); +#if CUDA_VERSION >= 10010 + all_algos.push_back(&cublasLt); +#endif + all_algos.push_back(&int8x8x32); + for (auto& algo : mm_pack.all_algos) { + brute_force_algos.emplace_back(AlgoBruteForce(algo)); + } + for (auto& algo : brute_force_algos) { + all_algos.push_back(&algo); + } +} diff --git a/dnn/src/cuda/batched_matrix_mul/algo.h b/dnn/src/cuda/batched_matrix_mul/algo.h new file mode 100644 index 00000000..83597f5d --- /dev/null +++ b/dnn/src/cuda/batched_matrix_mul/algo.h @@ -0,0 +1,141 @@ +/** + * \file dnn/src/cuda/batched_matrix_mul/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
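Editorial note on the AlgoPack constructor above: the brute-force wrappers are first all emplaced into brute_force_algos, and only afterwards are their addresses pushed into all_algos. Doing both in a single loop would be unsafe, since a later emplace_back can reallocate the vector and invalidate pointers taken earlier. A reduced sketch of that two-phase pattern with hypothetical types (BaseAlgo, BruteForceAlgo and Pack are stand-ins):

#include <vector>

struct BaseAlgo {};                  // stand-in for a single-matrix algorithm
struct BruteForceAlgo {              // stand-in for the batched wrapper
    explicit BruteForceAlgo(BaseAlgo* wrapped) : wrapped_(wrapped) {}
    BaseAlgo* wrapped_;
};

struct Pack {
    std::vector<BruteForceAlgo> brute_force_algos;   // owns the wrappers
    std::vector<BruteForceAlgo*> all_algos;          // non-owning pointers

    explicit Pack(const std::vector<BaseAlgo*>& base_algos) {
        // phase 1: create every wrapper first (the vector may reallocate here)
        for (BaseAlgo* a : base_algos)
            brute_force_algos.emplace_back(a);
        // phase 2: addresses are stable now, so storing them is safe
        for (BruteForceAlgo& algo : brute_force_algos)
            all_algos.push_back(&algo);
    }
};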
+ */ + +#pragma once +#include +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" +#include "src/common/utils.h" +#include "src/cuda/batched_matrix_mul/opr_impl.h" +#include "src/cuda/matrix_mul/cublasLt_wrapper.h" +#if CUDA_VERSION >= 10010 +#include +#endif + +namespace megdnn { +namespace cuda { + +class BatchedMatrixMulForwardImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + BatchedMatrixMulForwardImpl* opr; + TensorLayout layout_a, layout_b, layout_c; + std::string to_string() const; + SizeArgs(BatchedMatrixMulForwardImpl* o, const TensorLayout& A, + const TensorLayout& B, const TensorLayout& C); + bool can_be_treated_as_int8x8x32() const { + return layout_a.dtype.enumv() == layout_b.dtype.enumv() && + (layout_a.dtype.enumv() == DTypeEnum::Int8 || + layout_a.dtype.enumv() == DTypeEnum::QuantizedS8) && + (layout_c.dtype.enumv() == DTypeEnum::Int32 || + layout_c.dtype.enumv() == DTypeEnum::QuantizedS32) && + opr->param().format == param::MatrixMul::Format::DEFAULT; + } + }; + struct ExecArgs : public SizeArgs { + TensorND tensor_a, tensor_b, tensor_c; + Workspace workspace; + ExecArgs(BatchedMatrixMulForwardImpl* o, _megdnn_tensor_in A, + _megdnn_tensor_in B, _megdnn_tensor_in C, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + virtual const char* name() const = 0; + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "batched matrix mul fwd algo %s: required workspace %zu " + "bytes, got %zu", + name(), req, workspace.size); + return *this; + } +}; +class BatchedMatrixMulForwardImpl::AlgoBruteForce final + : public BatchedMatrixMulForwardImpl::AlgoBase { + using Param = MatrixMulForward::Param; + +private: + std::string m_name; + MatrixMulForwardImpl::AlgoBase* m_algorithm = nullptr; + WorkspaceBundle get_workspace_bundle(); + +public: + AlgoBruteForce(MatrixMulForwardImpl::AlgoBase* algo); + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& /*args*/) const override; + void exec(const ExecArgs& args) const final; + bool is_reproducible() const override { return true; } + const char* name() const override { return m_name.c_str(); } +}; +class BatchedMatrixMulForwardImpl::AlgoCublas final + : public BatchedMatrixMulForwardImpl::AlgoBase { +public: + AlgoCublas() = default; + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& /*args*/) const override; + void exec(const ExecArgs& args) const final; + bool is_reproducible() const override { return true; } + const char* name() const override { return "CUBLAS"; } +}; +#if CUDA_VERSION >= 10010 +class BatchedMatrixMulForwardImpl::AlgoCublasLt final : public AlgoBase { +public: + AlgoCublasLt() = default; + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& /*args*/) const override; + void exec(const 
ExecArgs& args) const final;
+    bool is_reproducible() const override { return true; }
+    const char* name() const override { return "CUBLAS_LT"; }
+};
+#endif
+class BatchedMatrixMulForwardImpl::AlgoInt8x8x32 final
+        : public BatchedMatrixMulForwardImpl::AlgoBase {
+public:
+    AlgoInt8x8x32() = default;
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& /*args*/) const override;
+    void exec(const ExecArgs& args) const final;
+    bool is_reproducible() const override { return true; }
+    const char* name() const override { return "INT8x8x32"; }
+};
+class BatchedMatrixMulForwardImpl::AlgoPack {
+    MatrixMulForwardImpl::AlgoPack mm_pack;
+    AlgoPack(const AlgoPack&) = delete;
+    AlgoPack& operator=(const AlgoPack&) = delete;
+
+public:
+    AlgoPack();
+
+    AlgoCublas cublas;
+#if CUDA_VERSION >= 10010
+    AlgoCublasLt cublasLt;
+#endif
+    AlgoInt8x8x32 int8x8x32;
+    std::vector<AlgoBase*> all_algos;
+    std::vector<AlgoBruteForce> brute_force_algos;
+};
+}  // namespace cuda
+}  // namespace megdnn
diff --git a/dnn/src/cuda/batched_matrix_mul/brute_force.cpp b/dnn/src/cuda/batched_matrix_mul/brute_force.cpp
new file mode 100644
index 00000000..0da6aa14
--- /dev/null
+++ b/dnn/src/cuda/batched_matrix_mul/brute_force.cpp
@@ -0,0 +1,67 @@
+/**
+ * \file dnn/src/cuda/batched_matrix_mul/brute_force.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "./algo.h"
+#include "src/cuda/handle.h"
+#include "src/cuda/utils.h"
+
+using namespace megdnn;
+using namespace cuda;
+
+BatchedMatrixMulForwardImpl::AlgoBruteForce::AlgoBruteForce(
+        MatrixMulForwardImpl::AlgoBase* algo)
+        : m_algorithm(algo) {
+    m_name = ssprintf("BRUTE_FORCE-%s", algo->name());
+}
+bool BatchedMatrixMulForwardImpl::AlgoBruteForce::is_available(
+        const SizeArgs& args) const {
+    MatrixMulForwardImpl mm{args.opr->handle()};
+    mm.param() = {args.opr->param().transposeA, args.opr->param().transposeB};
+    mm.execution_policy() = {m_algorithm};
+
+    auto mm_layout_a = args.layout_a.remove_axis(0);
+    auto mm_layout_b = args.layout_b.remove_axis(0);
+    auto mm_layout_c = args.layout_c.remove_axis(0);
+
+    MatrixMulForwardImpl::AlgoBase::SizeArgs mm_args{&mm, mm_layout_a,
+                                                     mm_layout_b, mm_layout_c};
+    return m_algorithm->is_available(mm_args);
+}
+size_t BatchedMatrixMulForwardImpl::AlgoBruteForce::get_workspace_in_bytes(
+        const SizeArgs& args) const {
+    auto mm_opr = args.opr->handle()->create_operator<MatrixMulForward>();
+    mm_opr->param() = {args.opr->param().transposeA,
+                       args.opr->param().transposeB};
+    mm_opr->execution_policy() = {m_algorithm};
+
+    return mm_opr->get_workspace_in_bytes(args.layout_a, args.layout_b,
+                                          args.layout_c);
+}
+void BatchedMatrixMulForwardImpl::AlgoBruteForce::exec(
+        const ExecArgs& args) const {
+    auto N = args.layout_a.shape[0];
+    auto&& mm_opr = args.opr->handle()->create_operator<MatrixMulForward>();
+    mm_opr->param() = {args.opr->param().transposeA,
+                       args.opr->param().transposeB};
+    mm_opr->execution_policy() = {m_algorithm};
+    rep(n, N) {
+        TensorND A_, B_, C_;
+        auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) {
+            out.raw_ptr = static_cast<void*>(static_cast<dt_byte*>(in.raw_ptr) +
+                                             n * in.layout.stride[0] *
+                                                     in.layout.dtype.size());
+            out.layout = in.layout.remove_axis(0);
+        };
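+        // The lambda above shifts raw_ptr by n * stride[0] elements and drops
+        // the batch axis, so A_, B_ and C_ view the n-th matrices of the
+        // batched tensors; the wrapped single-matrix algorithm then runs once
+        // per batch.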
tensor_n_from_batch(args.tensor_a, A_); + tensor_n_from_batch(args.tensor_b, B_); + tensor_n_from_batch(args.tensor_c, C_); + mm_opr->exec(A_, B_, C_, args.workspace); + } +} diff --git a/dnn/src/cuda/batched_matrix_mul/cublas.cpp b/dnn/src/cuda/batched_matrix_mul/cublas.cpp new file mode 100644 index 00000000..84836e0b --- /dev/null +++ b/dnn/src/cuda/batched_matrix_mul/cublas.cpp @@ -0,0 +1,139 @@ +/** + * \file dnn/src/cuda/batched_matrix_mul/cublas.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./algo.h" +#include "./helper.cuh" +#include "src/common/utils.cuh" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace batched_matrix_mul; + +bool BatchedMatrixMulForwardImpl::AlgoCublas::is_available( + const SizeArgs& args) const { + auto dtype = args.layout_a.dtype; + auto&& param = args.opr->param(); + auto&& handle = concrete_handle(args.opr->handle()); + if (dtype == dtype::Float32()) + return true; + if (dtype != dtype::Float16()) + return false; + else { + auto&& cuda_cap = handle->device_prop(); + if (param.compute_mode == Param::ComputeMode::FLOAT32) { +#if CUDART_VERSION >= 9010 + return cuda_cap.major >= 5; +#else + MEGDNN_MARK_USED_VAR(cuda_cap); + return false; +#endif + } else { +#if CUDART_VERSION >= 9000 + return cuda_cap.major >= 6; +#else + MEGDNN_MARK_USED_VAR(cuda_cap); + return false; +#endif + } + } +} +size_t BatchedMatrixMulForwardImpl::AlgoCublas::get_workspace_in_bytes( + const SizeArgs& args) const { + return args.layout_a.shape[0] * 3 * sizeof(uintptr_t); +} +void BatchedMatrixMulForwardImpl::AlgoCublas::exec(const ExecArgs& args) const { + auto param = args.opr->param(); + auto dtype = args.layout_a.dtype; + auto handle = concrete_handle(args.opr->handle()); + auto cublas_handle = handle->cublas_handle(); + auto stream = cuda_stream(handle); + auto batch = args.layout_a.shape[0]; + auto m = args.layout_c.shape[1], n = args.layout_c.shape[2]; + auto k = args.layout_a.shape[param.transposeA ? 1 : 2]; + auto workspace = args.workspace; + + uintptr_t* As = static_cast(static_cast( + workspace.raw_ptr + 0 * batch * sizeof(uintptr_t))); + uintptr_t* Bs = static_cast(static_cast( + workspace.raw_ptr + 1 * batch * sizeof(uintptr_t))); + uintptr_t* Cs = static_cast(static_cast( + workspace.raw_ptr + 2 * batch * sizeof(uintptr_t))); + + arange(As, reinterpret_cast(args.tensor_a.raw_ptr), + args.layout_a.stride[0] * dtype.size(), batch, stream); + arange(Bs, reinterpret_cast(args.tensor_b.raw_ptr), + args.layout_b.stride[0] * dtype.size(), batch, stream); + arange(Cs, reinterpret_cast(args.tensor_c.raw_ptr), + args.layout_c.stride[0] * dtype.size(), batch, stream); + + auto io32_c32 = [&]() { + auto zero = handle->zero_device(); + auto one = handle->one_device(); + cublas_check(cublasSgemmBatched( + cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, + param.transposeA ? 
CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one, + reinterpret_cast(Bs), + args.layout_b.stride[1], + reinterpret_cast(As), + args.layout_a.stride[1], zero, + reinterpret_cast(Cs), args.layout_c.stride[1], + batch)); + }; + +#if CUDART_VERSION >= 9010 + auto io16_c32 = [&]() { + cublas_check(cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH)); + auto zero = handle->zero_device(); + auto one = handle->one_device(); + cublas_check(cublasGemmBatchedEx( + cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, + param.transposeA ? CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one, + reinterpret_cast(Bs), CUDA_R_16F, + args.layout_b.stride[1], reinterpret_cast(As), + CUDA_R_16F, args.layout_a.stride[1], zero, + reinterpret_cast(Cs), CUDA_R_16F, + args.layout_c.stride[1], batch, CUDA_R_32F, + CUBLAS_GEMM_DEFAULT)); + cublas_check(cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH)); + }; +#endif + +#if CUDART_VERSION >= 9000 + auto io16_c16 = [&]() { + cublas_check(cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH)); + auto zero = handle->zero_device_h(); + auto one = handle->one_device_h(); + cublas_check(cublasHgemmBatched( + cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, + param.transposeA ? CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one, + reinterpret_cast(Bs), args.layout_b.stride[1], + reinterpret_cast(As), args.layout_a.stride[1], + zero, reinterpret_cast<__half**>(Cs), args.layout_c.stride[1], + batch)); + cublas_check(cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH)); + }; +#endif + + if (dtype == dtype::Float32()) { + io32_c32(); + } else { + if (param.compute_mode == Param::ComputeMode::FLOAT32) { +#if CUDART_VERSION >= 9010 + io16_c32(); +#endif + } else { +#if CUDART_VERSION >= 9000 + io16_c16(); +#endif + } + } +} diff --git a/dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp b/dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp new file mode 100644 index 00000000..9d3f9620 --- /dev/null +++ b/dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp @@ -0,0 +1,147 @@ +/** + * \file dnn/src/cuda/batched_matrix_mul/cublas_lt.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./algo.h" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" +#include "src/cuda/matrix_mul/cublasLt_wrapper.h" + +using namespace megdnn; +using namespace cuda; + +#if CUDA_VERSION >= 10010 +static inline CUBLASLTMatmulDesc::SizeArgs from_local_size_args( + const BatchedMatrixMulForwardImpl::AlgoBase::SizeArgs& args) { + auto&& param = args.opr->param(); + auto&& handle = concrete_handle(args.opr->handle()); + bool transA = param.transposeA; + bool transB = param.transposeB; + return {handle, transA, transB, + args.layout_a, args.layout_b, args.layout_c}; +} +bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available( + const SizeArgs& args) const { + auto cublasLt_args = from_local_size_args(args); + auto&& dev_prop = current_device_prop(); + bool is_dev_support = dev_prop.major >= 7; + bool res = is_dev_support && CUBLASLTMatmulDesc(cublasLt_args, true) + .is_available(cublasLt_args, INT_MAX); + return res; +} +size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes( + const SizeArgs& args) const { + auto cublasLt_args = from_local_size_args(args); + cublasLtMatmulAlgo_t algo; + CUBLASLTMatmulDesc desc(cublasLt_args, true); + desc.get_algorithm_heuristic(cublasLt_args, INT_MAX, algo); + return desc.get_workspace_bundle(cublasLt_args, algo).total_size_in_bytes(); +} +void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( + const ExecArgs& args) const { + auto cublasLt_args = from_local_size_args(args); + cublasLtMatmulAlgo_t algo; + CUBLASLTMatmulDesc desc(cublasLt_args, true); + desc.get_algorithm_heuristic(cublasLt_args, INT_MAX, algo); + auto ws_bundle = desc.get_workspace_bundle(cublasLt_args, algo); + auto&& handle = concrete_handle(args.opr->handle()); + auto&& stream = handle->stream(); + auto&& cublasLt_handle = handle->cublasLt_handle(); + auto batched_hgemm = [&]() { + auto zero_half = handle->zero_device_h(); + auto one_half = handle->one_device_h(); + megdnn_assert(ws_bundle.nr_workspace() == 1, + "workspace bundle size should be 1(ws_algo)"); + cublas_check(cublasLtMatmul( + cublasLt_handle, desc.matmul_desc, one_half, + static_cast(args.tensor_b.raw_ptr), + desc.layout_b, + static_cast(args.tensor_a.raw_ptr), + desc.layout_a, zero_half, + static_cast(args.tensor_c.raw_ptr), + desc.layout_c, static_cast<__half*>(args.tensor_c.raw_ptr), + desc.layout_c, &algo, ws_bundle.get(0), ws_bundle.get_size(0), + stream)); + }; + auto batched_sgemm = [&]() { + auto zero = handle->zero_device(); + auto one = handle->one_device(); + auto dev_b = + (desc.dt_b == CUDA_R_16F) + ? static_cast(args.tensor_b.ptr()) + : static_cast(args.tensor_b.ptr()); + auto dev_a = + (desc.dt_a == CUDA_R_16F) + ? 
static_cast(args.tensor_a.ptr()) + : static_cast(args.tensor_a.ptr()); + auto dev_c = static_cast(args.tensor_c.raw_ptr); + megdnn_assert(ws_bundle.nr_workspace() == 1, + "workspace bundle size should be 1(ws_algo)"); + cublas_check(cublasLtMatmul(cublasLt_handle, desc.matmul_desc, one, + dev_b, desc.layout_b, dev_a, desc.layout_a, + zero, dev_c, desc.layout_c, dev_c, + desc.layout_c, &algo, ws_bundle.get(0), + ws_bundle.get_size(0), stream)); + }; + auto batched_igemm = [&]() { + auto zero = handle->zero_device(); + auto one = handle->one_device(); + megdnn_assert( + ws_bundle.nr_workspace() == 4, + "workspace bundle size should be 4(ws_algo, ws_a, ws_b, ws_c)"); + void* ws_b = ws_bundle.get(1); + void* ws_a = ws_bundle.get(2); + void* ws_c = ws_bundle.get(3); + int32_t pm = CUBLAS_POINTER_MODE_DEVICE; + cublasOperation_t trans_a = CUBLAS_OP_T, trans_c = CUBLAS_OP_N; + cublasLtMatrixTransformDesc_t transform_desc = nullptr; + cublas_check( + cublasLtMatrixTransformDescCreate(&transform_desc, CUDA_R_32F)); + cublas_check(cublasLtMatrixTransformDescSetAttribute( + transform_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, + &pm, sizeof(pm))); + cublas_check(cublasLtMatrixTransform( + cublasLt_handle, transform_desc, one, args.tensor_b.raw_ptr, + desc.layout_b, zero, nullptr, nullptr, ws_b, + desc.layout_trans_b, stream)); + cublas_check(cublasLtMatrixTransformDescSetAttribute( + transform_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &trans_a, + sizeof(trans_a))); + cublas_check(cublasLtMatrixTransform( + cublasLt_handle, transform_desc, one, args.tensor_a.raw_ptr, + desc.layout_a, zero, nullptr, nullptr, ws_a, + desc.layout_trans_a, stream)); + cublas_check(cublasLtMatmul( + cublasLt_handle, desc.matmul_desc, one, ws_b, + desc.layout_trans_b, ws_a, desc.layout_trans_a, zero, ws_c, + desc.layout_trans_c, ws_c, desc.layout_trans_c, &algo, + ws_bundle.get(0), ws_bundle.get_size(0), stream)); + cublas_check(cublasLtMatrixTransformDescSetAttribute( + transform_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &trans_c, + sizeof(trans_c))); + cublas_check(cublasLtMatrixTransform( + cublasLt_handle, transform_desc, one, ws_c, desc.layout_trans_c, + zero, nullptr, nullptr, args.tensor_c.raw_ptr, desc.layout_c, + stream)); + cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc)); + }; + + ws_bundle.set(args.workspace.raw_ptr); + if (desc.dt_compute == CUDA_R_32I) { + batched_igemm(); + } else if (desc.dt_compute == CUDA_R_16F) { + batched_hgemm(); + } else if (desc.dt_compute == CUDA_R_32F) { + batched_sgemm(); + } else { + megdnn_throw( + megdnn_mangle("compute_type must be int32/float16/float32")); + } +} +#endif diff --git a/dnn/src/cuda/batched_matrix_mul/helper.cu b/dnn/src/cuda/batched_matrix_mul/helper.cu new file mode 100644 index 00000000..959a7846 --- /dev/null +++ b/dnn/src/cuda/batched_matrix_mul/helper.cu @@ -0,0 +1,47 @@ +/** + * \file dnn/src/cuda/batched_matrix_mul/helper.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */
+#include "src/cuda/batched_matrix_mul/helper.cuh"
+
+namespace {
+
+template <typename T>
+__global__ void kernel(T *Xs, T start, uint32_t step, uint32_t n)
+{
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        Xs[i] = start + i*step;
+    }
+}
+
+} // anonymous namespace
+
+namespace megdnn {
+namespace cuda {
+namespace batched_matrix_mul {
+
+template <typename T>
+void arange(T *Xs, T start, uint32_t step, uint32_t n, cudaStream_t stream)
+{
+    uint32_t threads = NR_THREADS;
+    uint32_t blocks = DIVUP(n, threads);
+    kernel<T><<<blocks, threads, 0, stream>>>(Xs, start, step, n);
+    after_kernel_launch();
+}
+
+template void arange<uintptr_t>(uintptr_t *, uintptr_t,
+        uint32_t, uint32_t, cudaStream_t);
+
+} // namespace batched_matrix_mul
+} // namespace cuda
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
+
diff --git a/dnn/src/cuda/batched_matrix_mul/helper.cuh b/dnn/src/cuda/batched_matrix_mul/helper.cuh
new file mode 100644
index 00000000..a7837770
--- /dev/null
+++ b/dnn/src/cuda/batched_matrix_mul/helper.cuh
@@ -0,0 +1,25 @@
+/**
+ * \file dnn/src/cuda/batched_matrix_mul/helper.cuh
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+#include "src/cuda/utils.cuh"
+
+namespace megdnn {
+namespace cuda {
+namespace batched_matrix_mul {
+
+template <typename T>
+void arange(T* Xs, T start, uint32_t step, uint32_t n, cudaStream_t stream);
+
+} // namespace batched_matrix_mul
+} // namespace cuda
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/cuda/batched_matrix_mul/int8x8x32.cpp b/dnn/src/cuda/batched_matrix_mul/int8x8x32.cpp
new file mode 100644
index 00000000..5d466235
--- /dev/null
+++ b/dnn/src/cuda/batched_matrix_mul/int8x8x32.cpp
@@ -0,0 +1,58 @@
+/**
+ * \file dnn/src/cuda/batched_matrix_mul/int8x8x32.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "./int8x8x32.cuh"
+#include
+#include "./algo.h"
+#include "./helper.cuh"
+#include "src/common/utils.cuh"
+#include "src/cuda/handle.h"
+#include "src/cuda/utils.h"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace batched_matrix_mul;
+
+bool BatchedMatrixMulForwardImpl::AlgoInt8x8x32::is_available(
+        const SizeArgs& args) const {
+    return args.can_be_treated_as_int8x8x32();
+}
+
+void BatchedMatrixMulForwardImpl::AlgoInt8x8x32::exec(
+        const ExecArgs& args) const {
+    auto&& param = args.opr->param();
+    auto batch_count = args.layout_a.shape[0];
+    auto m = args.tensor_c.layout.shape[1], n = args.tensor_c.layout.shape[2],
+         k = args.tensor_a.layout.shape[param.transposeA ?
1 : 2]; + auto LDA = args.tensor_a.layout.stride[0], + LDB = args.tensor_b.layout.stride[0], + LDC = args.tensor_c.layout.stride[0]; + + auto STA = args.tensor_a.layout.stride[1], + STB = args.tensor_b.layout.stride[1], + STC = args.tensor_c.layout.stride[1]; + + int8_t* A = args.tensor_a.compatible_ptr(); + int8_t* B = args.tensor_b.compatible_ptr(); + int32_t* C = args.tensor_c.compatible_ptr(); + + auto&& handle = concrete_handle(args.opr->handle()); + exec_igemm_8x8x32(A, B, C, batch_count, m, n, k, LDA, LDB, LDC, STA, STB, + STC, param.transposeA, param.transposeB, + cuda_stream(handle)); +} + +size_t BatchedMatrixMulForwardImpl::AlgoInt8x8x32::get_workspace_in_bytes( + const SizeArgs&) const { + return 0; +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/batched_matrix_mul/int8x8x32.cu b/dnn/src/cuda/batched_matrix_mul/int8x8x32.cu new file mode 100644 index 00000000..ea6e80fc --- /dev/null +++ b/dnn/src/cuda/batched_matrix_mul/int8x8x32.cu @@ -0,0 +1,362 @@ +/** + * \file dnn/src/cuda/batched_matrix_mul/int8x8x32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include +#include "./int8x8x32.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +template +__device__ __forceinline__ void Global2SharedMem::gmem2reg_cpy() { + if (tr) { + int32_t cpy_reg[SmemConfig::smem_row][SmemConfig::smem_col / 4]; + if (aligned) { + if (SmemConfig::smem_row <= check_bound_row && + SmemConfig::smem_col <= check_bound_col) { +#pragma unroll + for (int row = 0; row < SmemConfig::smem_row; ++row) { +#pragma unroll + for (int col = 0; col < SmemConfig::smem_col / 4; ++col) { + cpy_reg[row][col] = *(reinterpret_cast( + &g_ptr[row * ld_src + col * 4])); + } + } + } else { +#pragma unroll + for (int row = 0; row < SmemConfig::smem_row; ++row) { +#pragma unroll + for (int col = 0; col < SmemConfig::smem_col / 4; ++col) { + int32_t val = 0; + if (row < check_bound_row && col * 4 < check_bound_col) + val = *(reinterpret_cast( + &g_ptr[row * ld_src + col * 4])); + cpy_reg[row][col] = val; + } + } + } + } else { +#pragma unroll + for (int row = 0; row < SmemConfig::smem_row; ++row) { +#pragma unroll + for (int col = 0; col < SmemConfig::smem_col / 4; ++col) { + int32_t val = 0; + if (row < check_bound_row && col * 4 < check_bound_col) + val = (int32_t)0xff & g_ptr[row * ld_src + col * 4]; + if (row < check_bound_row && + (col * 4 + 1) < check_bound_col) + val |= (((int32_t)0xff & + g_ptr[row * ld_src + col * 4 + 1]) + << 8); + if (row < check_bound_row && + (col * 4 + 2) < check_bound_col) + val |= (((int32_t)0xff & + g_ptr[row * ld_src + col * 4 + 2]) + << 16); + if (row < check_bound_row && + (col * 4 + 3) < check_bound_col) + val |= (((int32_t)0xff & + g_ptr[row * ld_src + col * 4 + 3]) + << 24); + cpy_reg[row][col] = val; + } + } + } +#pragma unroll + for (int col = 0; col < SmemConfig::smem_col / 4; ++col) { +#pragma unroll + for (int row = 0; row < SmemConfig::smem_row / 4; ++row) { + int32_t src0 = cpy_reg[row * 4][col], + src1 = cpy_reg[row * 4 + 1][col], + src2 = cpy_reg[row * 4 + 2][col], + src3 = cpy_reg[row * 4 + 3][col]; + reg[col * 4 + 3][row] = ((src3 >> 24 & 0xff) << 24) | + ((src2 >> 24 & 0xff) << 16) | + ((src1 >> 24 & 0xff) << 8) | + 
(src0 >> 24 & 0xff); + reg[col * 4 + 2][row] = ((src3 >> 16 & 0xff) << 24) | + ((src2 >> 16 & 0xff) << 16) | + ((src1 >> 16 & 0xff) << 8) | + (src0 >> 16 & 0xff); + reg[col * 4 + 1][row] = ((src3 >> 8 & 0xff) << 24) | + ((src2 >> 8 & 0xff) << 16) | + ((src1 >> 8 & 0xff) << 8) | + (src0 >> 8 & 0xff); + reg[col * 4][row] = ((src3 & 0xff) << 24) | + ((src2 & 0xff) << 16) | + ((src1 & 0xff) << 8) | (src0 & 0xff); + } + } + } else { + if (aligned) { + if (SmemConfig::smem_row <= check_bound_row && + SmemConfig::smem_col <= check_bound_col) { +#pragma unroll + for (int col = 0; col < SmemConfig::smem_col; ++col) { +#pragma unroll + for (int row = 0; row < SmemConfig::smem_row / 4; ++row) { + reg[col][row] = *(reinterpret_cast( + &g_ptr[col * ld_src + row * 4])); + } + } + } else { +#pragma unroll + for (int col = 0; col < SmemConfig::smem_col; ++col) { +#pragma unroll + for (int row = 0; row < SmemConfig::smem_row / 4; ++row) { + int32_t val = 0; + if (row * 4 < check_bound_row && col < check_bound_col) + val = *(reinterpret_cast( + &g_ptr[col * ld_src + row * 4])); + reg[col][row] = val; + } + } + } + } else { +#pragma unroll + for (int col = 0; col < SmemConfig::smem_col; ++col) { +#pragma unroll + for (int row = 0; row < SmemConfig::smem_row / 4; ++row) { + int32_t val = 0; + if (col < check_bound_col && row * 4 < check_bound_row) + val = (int32_t)0xff & g_ptr[col * ld_src + row * 4]; + if (col < check_bound_col && + (row * 4 + 1) < check_bound_row) + val |= (((int32_t)0xff & + g_ptr[col * ld_src + row * 4 + 1]) + << 8); + if (col < check_bound_col && + (row * 4 + 2) < check_bound_row) + val |= (((int32_t)0xff & + g_ptr[col * ld_src + row * 4 + 2]) + << 16); + if (col < check_bound_col && + (row * 4 + 3) < check_bound_row) + val |= (((int32_t)0xff & + g_ptr[col * ld_src + row * 4 + 3]) + << 24); + reg[col][row] = val; + } + } + } + } +} + +template +__device__ __forceinline__ void Global2SharedMem::reg2smem_cpy() { +#pragma unroll + for (int col = 0; col < SmemConfig::smem_col; ++col) { +#pragma unroll + for (int row = 0; row < SmemConfig::smem_row / 4; ++row) { + if (smem_off + row < smem_bound) + smem[smem_off + col * ld_dst + row] = reg[col][row]; + } + } +} + +template +__device__ __forceinline__ void Global2SharedMem::iter_forward() { + g_ptr += step; +} + +template +__global__ void batched_8x8x32_kern(const int8_t* a, int lda, int sta, bool tra, + const int8_t* b, int ldb, int stb, bool trb, + int32_t* c, int ldc, int stc, int m, int n, + int k) { + typedef UnrollConfig_ UnrollConfig; + typedef ThreadConfig_ ThreadConfig; + int off_batch = blockIdx.z, off_m = blockIdx.x, off_n = blockIdx.y, + off_w = threadIdx.x, off_h = threadIdx.y, + tid_x = off_m * ThreadConfig::thread_x + off_w, + tid_y = off_n * ThreadConfig::thread_y + off_h; + static int const unroll = UnrollConfig::unroll, + thread_k = UnrollConfig::thread_k, + load_m = UnrollConfig::load_m, + load_n = UnrollConfig::load_n; + + typedef SmemConfig SmemA; + typedef SmemConfig SmemB; + typedef Global2SharedMem gl2sh_type_a; + typedef Global2SharedMem gl2sh_type_b; + + extern __shared__ int32_t smem[]; + int idx_m = off_h / thread_k * load_m + tid_x * UnrollConfig::unroll_m, + idx_n = off_w / thread_k * load_n + tid_y * UnrollConfig::unroll_n, + idx_k_a = off_h % thread_k, idx_k_b = off_w % thread_k; + int off_a = tra ? (off_batch * lda + idx_m + idx_k_a * unroll * sta) + : (off_batch * lda + idx_m * sta + idx_k_a * unroll); + int off_b = trb ? 
(off_batch * ldb + idx_n * stb + idx_k_b * unroll) + : (off_batch * ldb + idx_n + idx_k_b * unroll * stb); + int off_c = off_batch * ldc + tid_x * UnrollConfig::unroll_m * stc + + tid_y * UnrollConfig::unroll_n; + int32_t* ptr_c = nullptr; + int32_t* smem_a = reinterpret_cast(smem); + int32_t* smem_b = reinterpret_cast( + &smem_a[(UnrollConfig::unroll_k / 4) * UnrollConfig::block_m]); + + int off_smem_a = + (off_w * UnrollConfig::unroll_m + (off_h / thread_k) * load_m) * + UnrollConfig::unroll_k / 4, + off_smem_b = + (off_h * UnrollConfig::unroll_n + (off_w / thread_k) * load_n) * + UnrollConfig::unroll_k / 4; + int a_col = load_m; + if (a_col > m - idx_m) + a_col = m - idx_m; + if (a_col < 0) { + off_a = off_batch * lda; + off_c = -1; + a_col = 0; + } + int a_row = unroll; + if (a_row > k - idx_k_a * unroll) + a_row = k - idx_k_a * unroll; + if (a_row < 0) { + off_smem_a = 0; + a_row = 0; + } + int b_col = load_n; + if (b_col > n - idx_n) { + b_col = n - idx_n; + } + if (b_col < 0) { + off_b = off_batch * ldb; + off_c = -1; + b_col = 0; + } + int b_row = unroll; + if (b_row > k - idx_k_b * unroll) + b_row = k - idx_k_b * unroll; + if (b_row < 0) { + off_smem_b = 0; + b_row = 0; + } + if (off_c != -1) + ptr_c = &c[off_c]; + int step_a = tra ? UnrollConfig::unroll_k * sta : UnrollConfig::unroll_k, + step_b = trb ? UnrollConfig::unroll_k : UnrollConfig::unroll_k * stb; + bool al_a = tra ? (m % 4 == 0) : (k % 4 == 0), + al_b = trb ? (k % 4 == 0) : (n % 4 == 0); + + gl2sh_type_a gl2sh_a(&smem_a[off_smem_a], idx_k_a * unroll / 4, + UnrollConfig::unroll_k / 4, sta, + UnrollConfig::unroll_k / 4, a_row, a_col, step_a, tra, + al_a); + gl2sh_type_b gl2sh_b(&smem_b[off_smem_b], idx_k_b * unroll / 4, + UnrollConfig::unroll_k / 4, stb, + UnrollConfig::unroll_k / 4, b_row, b_col, step_b, !trb, + al_b); + + gl2sh_a.g_ptr = &a[off_a]; + gl2sh_b.g_ptr = &b[off_b]; + + gl2sh_a.gmem2reg_cpy(); + gl2sh_b.gmem2reg_cpy(); + + int32_t sum[UnrollConfig::unroll_m * UnrollConfig::unroll_n]; +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_m; ++i) +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) + sum[i * UnrollConfig::unroll_n + j] = 0; + + for (int k_out = k; k_out > 0; k_out -= UnrollConfig::unroll_k) { + gl2sh_a.reg2smem_cpy(); + gl2sh_b.reg2smem_cpy(); + if (k_out > UnrollConfig::unroll_k) { + gl2sh_a.iter_forward(); + gl2sh_b.iter_forward(); + if (gl2sh_a.check_bound_row > + k_out - UnrollConfig::unroll_k - idx_k_a * unroll) { + gl2sh_a.check_bound_row = + k_out - UnrollConfig::unroll_k - idx_k_a * unroll; + if (gl2sh_a.check_bound_row < 0) + gl2sh_a.check_bound_row = 0; + } + if (gl2sh_b.check_bound_row > + k_out - UnrollConfig::unroll_k - idx_k_b * unroll) { + gl2sh_b.check_bound_row = + k_out - UnrollConfig::unroll_k - idx_k_b * unroll; + if (gl2sh_b.check_bound_row < 0) + gl2sh_b.check_bound_row = 0; + } + gl2sh_a.gmem2reg_cpy(); + gl2sh_b.gmem2reg_cpy(); + } + __syncthreads(); + if (off_c != -1) { + int32_t reg_a[UnrollConfig::unroll_m], + reg_b[UnrollConfig::unroll_n]; +#pragma unroll + for (int k_in = 0; + k_in < UnrollConfig::unroll_k / 4 && k_in * 4 < k_out; + ++k_in) { +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_m; ++i) + reg_a[i] = smem_a[(off_w * UnrollConfig::unroll_m + i) * + UnrollConfig::unroll_k / 4 + + k_in]; +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) + reg_b[j] = smem_b[(off_h * UnrollConfig::unroll_n + j) * + UnrollConfig::unroll_k / 4 + + k_in]; +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_m; ++i) +#pragma 
unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) { + dot_prod(reg_a[i], reg_b[j], + sum[i * UnrollConfig::unroll_n + j], + sum[i * UnrollConfig::unroll_n + j]); + } + } + } + __syncthreads(); + } + if (off_c != -1) { +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_m; ++i) +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) + if (tid_x * UnrollConfig::unroll_m + i < m && + tid_y * UnrollConfig::unroll_n + j < n) + *(ptr_c + i * stc + j) = + sum[i * UnrollConfig::unroll_n + j]; + } +} + +void exec_igemm_8x8x32(const int8_t* A, const int8_t* B, int32_t* C, + const int batch_count, const int m, const int n, + const int k, int ldA, int ldB, int ldC, int stA, int stB, + int stC, bool transA, bool transB, cudaStream_t stream) { + static int const unroll_m = 8, unroll_n = 8, unroll_k = 32, unroll = 4; + typedef ThreadConfig<8, 8> Thread; + typedef UnrollConfig Unroll; + dim3 block(Thread::thread_x, Thread::thread_y); + dim3 grid; + grid.x = (m + Unroll::block_m - 1) / Unroll::block_m; + grid.y = (n + Unroll::block_n - 1) / Unroll::block_n; + grid.z = batch_count; + static uint32_t shared_storage = (Unroll::block_m + Unroll::block_n) * + Unroll::unroll_k * sizeof(int8_t); + + void (*kern)(const int8_t* a, int lda, int sta, bool tra, const int8_t* b, + int ldb, int stb, bool trb, int32_t* c, int ldc, int stc, + int m, int n, int k) = batched_8x8x32_kern; + kern<<>>( + A, ldA, stA, transA, B, ldB, stB, transB, C, ldC, stC, m, n, k); + after_kernel_launch(); +} + +} // namespace cuda +} // namespace megdnn + // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/batched_matrix_mul/int8x8x32.cuh b/dnn/src/cuda/batched_matrix_mul/int8x8x32.cuh new file mode 100644 index 00000000..945d0a78 --- /dev/null +++ b/dnn/src/cuda/batched_matrix_mul/int8x8x32.cuh @@ -0,0 +1,95 @@ +/** + * \file dnn/src/cuda/batched_matrix_mul/int8x8x32.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +template +struct UnrollConfig { + typedef ThreadConfig_ ThreadConfig; + static int const unroll_m = m_; + static int const unroll_n = n_; + static int const block_m = ThreadConfig::thread_x * m_; + static int const block_n = ThreadConfig::thread_y * n_; + static int const unroll_k = k_tot; + static int const unroll = k_; + static int const thread_k = k_tot / k_; + static int const load_m = + (m_ / 4) / (ThreadConfig::thread_y / thread_k) * 4; + static int const load_n = + (n_ / 4) / (ThreadConfig::thread_x / thread_k) * 4; +}; + +template +struct ThreadConfig { + static int const thread_x = x_; + static int const thread_y = y_; +}; + +template +struct SmemConfig { + static int const smem_row = row; + static int const smem_col = col; +}; + +template +struct Global2SharedMem { + typedef SmemConfig_ SmemConfig; + const int8_t* g_ptr; + int32_t* smem; + int smem_off; + int smem_bound; + int32_t reg[SmemConfig::smem_col][SmemConfig::smem_row / 4]; + int ld_src; + int ld_dst; + int check_bound_row; + int check_bound_col; + int step; + bool tr; + bool aligned; + + __device__ __forceinline__ Global2SharedMem(int32_t* smem_, int s_off, + int s_bound, int ld_src_, + int ld_dst_, int b_r_, int b_c_, + int step_, bool tr_, bool al_) + : smem(smem_), + smem_off(s_off), + smem_bound(s_bound), + ld_src(ld_src_), + ld_dst(ld_dst_), + check_bound_row(b_r_), + check_bound_col(b_c_), + step(step_), + tr(tr_), + aligned(al_) {} + + __device__ __forceinline__ void gmem2reg_cpy(); + __device__ __forceinline__ void reg2smem_cpy(); + __device__ __forceinline__ void iter_forward(); +}; + +template +__global__ void batched_8x8x32_kern(const int8_t* a, int lda, int sta, bool tra, + const int8_t* b, int ldb, int stb, bool trb, + int32_t* c, int ldc, int stc, int m, int n, + int k); + +void exec_igemm_8x8x32(const int8_t* A, const int8_t* B, int32_t* C, + const int batch_count, const int m, const int n, + const int k, int ldA, int ldB, int ldC, int stA, int stB, + int stC, bool transA, bool transB, cudaStream_t stream); + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/batched_matrix_mul/opr_impl.cpp b/dnn/src/cuda/batched_matrix_mul/opr_impl.cpp new file mode 100644 index 00000000..37d7dca7 --- /dev/null +++ b/dnn/src/cuda/batched_matrix_mul/opr_impl.cpp @@ -0,0 +1,96 @@ +/** + * \file dnn/src/cuda/batched_matrix_mul/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/batched_matrix_mul/opr_impl.h" +#include "src/cuda/batched_matrix_mul/algo.h" +#include "src/cuda/batched_matrix_mul/helper.cuh" + +#include "src/common/algo_chooser.h" +#include "src/common/utils.cuh" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +using Algorithm = BatchedMatrixMulForwardImpl::Algorithm; + +void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, + _megdnn_tensor_out C, + _megdnn_workspace workspace) { + using namespace batched_matrix_mul; + //! + //! \Note (int8, int8) => int32 is supported + //! auto dtype=A.layout.dtype; + //! 
megdnn_assert(dtype.category() == DTypeCategory::FLOAT); + AlgoBase::ExecArgs args(this, A, B, C, workspace); + check_exec(A.layout, B.layout, C.layout, workspace.size); + auto&& algo = megdnn::get_algorithm(this, A.layout, B.layout, C.layout); + algo->check_workspace(args, workspace).exec(args); +} + +size_t BatchedMatrixMulForwardImpl::get_workspace_in_bytes( + const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { + AlgoBase::SizeArgs args(this, A, B, C); + return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); +} + +std::vector BatchedMatrixMulForwardImpl::get_all_algorithms( + const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { + std::vector ret; + AlgoBase::SizeArgs args(this, A, B, C); + for (auto&& algo : sm_algo_pack.all_algos) { + if (algo->is_available(args)) + ret.push_back(algo); + } + return ret; +} + +Algorithm* BatchedMatrixMulForwardImpl::get_algorithm_heuristic( + const TensorLayout& A, const TensorLayout& B, const TensorLayout& C, + size_t workspace_limit_in_bytes, bool reproducible) { + AlgoBase::SizeArgs args(this, A, B, C); + std::vector brute_force_algos; + + if (sm_algo_pack.cublas.is_available_reproducible(args, reproducible)) { + return &sm_algo_pack.cublas; + } +#if CUDA_VERSION >= 10010 + else if (sm_algo_pack.cublasLt.is_available_reproducible(args, + reproducible)) { + return &sm_algo_pack.cublasLt; + } +#endif + else if (sm_algo_pack.int8x8x32.is_available_reproducible(args, + reproducible)) { + return &sm_algo_pack.int8x8x32; + } else { + for (auto& algo : sm_algo_pack.brute_force_algos) { + if (algo.is_available_reproducible(args, reproducible)) { + return &algo; + } + } + } + + for (auto& algo : sm_algo_pack.brute_force_algos) + brute_force_algos.push_back(&algo); + + if (reproducible) { + return megdnn::get_reproducible_algo( + brute_force_algos, args, workspace_limit_in_bytes, + "batched matrix mul"); + } else { + return megdnn::get_usable_algo( + brute_force_algos, args, workspace_limit_in_bytes, + "batched matrix mul"); + } +}; + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/batched_matrix_mul/opr_impl.h b/dnn/src/cuda/batched_matrix_mul/opr_impl.h new file mode 100644 index 00000000..c38da62b --- /dev/null +++ b/dnn/src/cuda/batched_matrix_mul/opr_impl.h @@ -0,0 +1,57 @@ +/** + * \file dnn/src/cuda/batched_matrix_mul/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "megdnn/oprs.h" +#include "src/cuda/matrix_mul/cublasLt_wrapper.h" +namespace megdnn { +namespace cuda { + +class BatchedMatrixMulForwardImpl : public BatchedMatrixMulForward { +public: + using BatchedMatrixMulForward::BatchedMatrixMulForward; + BatchedMatrixMulForwardImpl(Handle* handle) : BatchedMatrixMul(handle) {} + + class AlgoBase; + class AlgoBruteForce; + class AlgoCublas; +#if CUDA_VERSION >= 10010 + class AlgoCublasLt; +#endif + class AlgoInt8x8x32; + class AlgoPack; + + void exec(_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout& A, const TensorLayout& B, + const TensorLayout& C) override; + std::vector get_all_algorithms(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C, + size_t workspace_limit_in_bytes, + bool reproducible) override; + const char* get_algorithm_set_name() const override { + return "BATCHED_MATMUL"; + } + + bool is_thread_safe() const override { return true; } + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/checksum/kern.cu b/dnn/src/cuda/checksum/kern.cu new file mode 100644 index 00000000..e8c04bb0 --- /dev/null +++ b/dnn/src/cuda/checksum/kern.cu @@ -0,0 +1,77 @@ +/** + * \file dnn/src/cuda/checksum/kern.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./kern.cuh" + +#include "src/cuda/utils.cuh" +#include "src/cuda/reduce_helper.cuh" + +namespace { + struct ChecksumOp { + typedef uint32_t wtype; + const uint32_t *src; + uint32_t *dst; + + static const uint32_t INIT = 0; + + __host__ __device__ void write(uint32_t idx, uint32_t val) { + dst[idx] = val; + } + + __host__ __device__ static uint32_t apply(uint32_t a, uint32_t b) { + return a + b; + } + }; + + struct NonFourAlignedChecksumOp : ChecksumOp { + __host__ __device__ uint32_t read(uint32_t idx) { + uint8_t* data = (uint8_t*) (src + idx); + return (data[0] | ((uint32_t) data[1] << 8) | + ((uint32_t) data[2] << 16) | ((uint32_t) data[3] << 24)) * + (idx + 1); + } + }; + + struct FourAlignedChecksumOp : ChecksumOp { + __host__ __device__ uint32_t read(uint32_t idx) { + return src[idx] * (idx + 1); + } + }; + + +} // anonymous namespace + +void megdnn::cuda::checksum::calc( + uint32_t *dest, + const uint32_t *buf, + uint32_t *workspace, + size_t nr_elem, cudaStream_t stream) { + if (!nr_elem) + return; + if (reinterpret_cast(buf) & 0b11) { + NonFourAlignedChecksumOp op; + op.src = buf; + op.dst = dest; + run_reduce(workspace, + 1, nr_elem, 1, stream, op); + } else { + FourAlignedChecksumOp op; + op.src = buf; + op.dst = dest; + run_reduce(workspace, + 1, nr_elem, 1, stream, op); + } +} + +size_t megdnn::cuda::checksum::get_workspace_in_bytes(size_t nr_elem) +{ + return get_reduce_workspace_in_bytes(1, nr_elem, 1); +} +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/checksum/kern.cuh b/dnn/src/cuda/checksum/kern.cuh new file mode 100644 index 00000000..4f6bb964 --- /dev/null +++ b/dnn/src/cuda/checksum/kern.cuh @@ -0,0 +1,32 @@ +/** + * \file dnn/src/cuda/checksum/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/cuda/utils.cuh" + +namespace megdnn{ +namespace cuda { +namespace checksum { + +void calc( + uint32_t *dest, const uint32_t *buf, uint32_t *workspace, + size_t nr_elem, + cudaStream_t stream); + +size_t get_workspace_in_bytes(size_t nr_elem); + +} +} +} + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/checksum/opr_impl.cpp b/dnn/src/cuda/checksum/opr_impl.cpp new file mode 100644 index 00000000..25daea31 --- /dev/null +++ b/dnn/src/cuda/checksum/opr_impl.cpp @@ -0,0 +1,70 @@ +/** + * \file dnn/src/cuda/checksum/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./kern.cuh" +#include "./opr_impl.h" + +#include "src/cuda/reduce_helper.cuh" +#include "src/common/utils.h" + +#include + +using namespace megdnn; +using namespace cuda; + +namespace { + +WorkspaceBundle get_wbundle(const TensorLayout &data) +{ + size_t size_all = data.shape[0], + size_ints = size_all / sizeof(uint32_t); + size_t part1 = checksum::get_workspace_in_bytes(size_ints); + size_t part2 = sizeof(ChecksumForward::Result::checksum); + return {nullptr, {part1, part2}}; +} + +} // anonymous namespace + +size_t ChecksumForwardImpl::get_workspace_in_bytes(const TensorLayout &data) { + auto wbundle = get_wbundle(data); + return wbundle.total_size_in_bytes(); +} + + +ChecksumForward::Result ChecksumForwardImpl::exec( + _megdnn_tensor_in data, _megdnn_workspace workspace) { + auto wbundle = get_wbundle(data.layout); + wbundle.set(workspace.raw_ptr); + Result result; + memset(&result, 0, sizeof(result)); + check_exec(data.layout, workspace.size); + auto stream = cuda_stream(handle()); + + auto ptr = static_cast(data.raw_ptr); + size_t size_all = data.layout.shape[0], + size_ints = size_all / sizeof(uint32_t); + auto last_val_size = std::min(size_all, 4); + cuda_check(cudaMemcpyAsync( + &result.last_val, ptr + size_all - last_val_size, last_val_size, + cudaMemcpyDeviceToHost, stream)); + if (size_ints) { + checksum::calc(static_cast(wbundle.get(1)), + static_cast(data.raw_ptr), + static_cast(wbundle.get(0)), + size_ints, stream); + cuda_check(cudaMemcpyAsync(&result.checksum, wbundle.get(1), + sizeof(result.checksum), cudaMemcpyDeviceToHost, stream)); + } + cuda_check(cudaStreamSynchronize(stream)); + return result; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/checksum/opr_impl.h b/dnn/src/cuda/checksum/opr_impl.h new file mode 100644 index 00000000..d5e5ef5e --- /dev/null +++ b/dnn/src/cuda/checksum/opr_impl.h @@ -0,0 +1,39 @@ +/** + * \file dnn/src/cuda/checksum/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/oprs.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +class ChecksumForwardImpl final: public ChecksumForward { + public: + using ChecksumForward::ChecksumForward; + + size_t get_workspace_in_bytes(const TensorLayout &) override; + + bool is_thread_safe() const override { + return true; + } + + Result exec(_megdnn_tensor_in data, _megdnn_workspace workspace) + override; +}; + +} +} + +// vim: syntax=cpp.doxygen + + diff --git a/dnn/src/cuda/concat/concat.cu b/dnn/src/cuda/concat/concat.cu new file mode 100644 index 00000000..40043eca --- /dev/null +++ b/dnn/src/cuda/concat/concat.cu @@ -0,0 +1,77 @@ +/** + * \file dnn/src/cuda/concat/concat.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/concat/concat.cuh" + +#include "src/cuda/utils.cuh" +#include "megdnn/dtype.h" + +namespace megdnn { +namespace cuda { +namespace concat { + +template +__global__ void forward_kernel(const T **srcs, T *dst, + size_t nr_srcs, + size_t A, size_t B, size_t C, + const size_t *Bv, + const size_t *table_outer, + const size_t *table_inner) +{ + size_t addr = threadIdx.x + blockIdx.x * blockDim.x; + if (addr < A*B*C) { + size_t c = addr % C; + size_t b = addr / C % B; + size_t a = addr / (B*C); + size_t i = table_outer[b]; + size_t B_src = Bv[i]; + size_t b_src = table_inner[b]; + size_t addr_src = (a*B_src + b_src)*C + c; + dst[addr] = srcs[i][addr_src]; + } +} + +template +void forward_proxy(const T **srcs, + T *dst, + size_t nr_srcs, + size_t A, size_t B, size_t C, + const size_t *Bv, + const size_t *table_outer, + const size_t *table_inner, + cudaStream_t stream) +{ + size_t total_nr_elem = A * B * C; + size_t NR_BLOCKS = DIVUP(total_nr_elem, NR_THREADS); + forward_kernel<<>>(srcs, dst, + nr_srcs, + A, B, C, + Bv, + table_outer, + table_inner); + after_kernel_launch(); +} + +#define INST(T) \ +template void forward_proxy(const T**, T *, size_t, size_t, size_t, size_t, \ + const size_t *, const size_t *, const size_t *, cudaStream_t); +#define cb(DType) INST(typename DTypeTrait::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +#undef cb +#undef INST + +} // namespace concat +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/concat/concat.cuh b/dnn/src/cuda/concat/concat.cuh new file mode 100644 index 00000000..c0ce9830 --- /dev/null +++ b/dnn/src/cuda/concat/concat.cuh @@ -0,0 +1,33 @@ +/** + * \file dnn/src/cuda/concat/concat.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include + +namespace megdnn { +namespace cuda { +namespace concat { + +template +void forward_proxy(const T **srcs, + T *dst, + size_t nr_srcs, + size_t A, size_t B, size_t C, + const size_t *Bv, + const size_t *table_outer, + const size_t *table_inner, + cudaStream_t stream); + +} // namespace concat +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/concat/opr_impl.cpp b/dnn/src/cuda/concat/opr_impl.cpp new file mode 100644 index 00000000..188024b3 --- /dev/null +++ b/dnn/src/cuda/concat/opr_impl.cpp @@ -0,0 +1,146 @@ +/** + * \file dnn/src/cuda/concat/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/concat/opr_impl.h" +#include "src/cuda/utils.h" +#include "src/cuda/concat/concat.cuh" + +namespace megdnn { +namespace cuda { + +size_t ConcatForwardImpl::get_workspace_in_bytes( + const TensorLayoutArray &srcs, + const TensorLayout &dst) +{ + auto B = dst.shape[param().axis]; + // Please refer to the comment in ConcatForwardImpl::exec for detail. 
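+    // The four chunks below mirror the device-side arrays built in
+    // exec_internal: per-src data pointers, the per-src extent of the
+    // concatenated axis (Bv), and the two lookup tables of length B
+    // (table_outer and table_inner).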
+    WorkspaceBundle bundle(nullptr, {
+            sizeof(uintptr_t) * srcs.size(),
+            sizeof(size_t) * srcs.size(),
+            sizeof(size_t) * B,
+            sizeof(size_t) * B});
+    return bundle.total_size_in_bytes();
+}
+
+template <typename T>
+void ConcatForwardImpl::exec_internal(
+        _megdnn_in const TensorNDArray &srcs,
+        _megdnn_tensor_out dst,
+        _megdnn_workspace workspace)
+{
+    auto srcs_layout = apply_vector(m_get_layout, srcs);
+    auto srcs_shape = apply_vector(m_get_shape, srcs_layout);
+    check_exec(srcs_layout, dst.layout, workspace.size);
+    size_t A, B, C;
+    auto stream = cuda_stream(this->handle());
+
+    // Pre-calculate B to determine cpu-side workspace size.
+    B = dst.layout.shape[param().axis];
+
+    // workspace_cpu will be freed by cuda callback.
+    SmallVector<size_t> workspace_sizes{
+        sizeof(const T *) * srcs.size(),
+        sizeof(size_t) * srcs.size(),
+        sizeof(size_t) * B,
+        sizeof(size_t) * B,
+    };
+
+    // What we need:
+    // 1. A const T* array of length srcs.size(), the i-th element of
+    //    which stores the address of the i-th src.
+    // 2. A size_t array of length srcs.size(), the i-th element of which
+    //    stores the extent of the param().axis-th axis of the i-th src.
+    // 3. A size_t array of length B, the i-th element of which stores the
+    //    index of the src tensor that the i-th element along the
+    //    param().axis-th axis of dst belongs to.
+    // 4. A size_t array of length B, the i-th element of which stores the
+    //    intra-offset inside the corresponding src tensor of the i-th element
+    //    along the param().axis-th axis of dst.
+    //
+    // These temporary spaces reside on the device side.
+    // The actual work is delegated to concat::forward_proxy.
+    WorkspaceBundle workspace_cpu(nullptr, workspace_sizes),
+                    workspace_gpu(nullptr, workspace_sizes);
+    auto total_workspace_size = workspace_cpu.total_size_in_bytes();
+    void *workspace_cpu_raw = malloc(total_workspace_size);
+    megdnn_assert_internal(workspace_cpu_raw);
+    void *workspace_gpu_raw = workspace.raw_ptr;
+    workspace_cpu = WorkspaceBundle(workspace_cpu_raw, workspace_sizes);
+    workspace_gpu = WorkspaceBundle(workspace_gpu_raw, workspace_sizes);
+    // srcs
+    auto srcs_cpu = static_cast<const T**>(workspace_cpu.get(0));
+    auto srcs_gpu = static_cast<const T**>(workspace_gpu.get(0));
+    for (size_t i = 0; i < srcs.size(); ++i) {
+        srcs_cpu[i] = srcs[i].ptr<T>();
+    }
+
+    // Bv
+    auto Bv_cpu = static_cast<size_t*>(workspace_cpu.get(1));
+    auto Bv_gpu = static_cast<size_t*>(workspace_gpu.get(1));
+    get_ABC(srcs_shape, A, Bv_cpu, C);
+
+    // table_outer
+    auto table_outer_cpu = static_cast<size_t*>(workspace_cpu.get(2));
+    auto table_outer_gpu = static_cast<size_t*>(workspace_gpu.get(2));
+    auto table_inner_cpu = static_cast<size_t*>(workspace_cpu.get(3));
+    auto table_inner_gpu = static_cast<size_t*>(workspace_gpu.get(3));
+    {
+        size_t outer_idx = 0, inner_idx = 0;
+
+        for (size_t i = 0; i < B; ++i) {
+            table_outer_cpu[i] = outer_idx;
+            table_inner_cpu[i] = inner_idx;
+            ++inner_idx;
+            if (inner_idx == Bv_cpu[outer_idx]) {
+                ++outer_idx;
+                inner_idx = 0;
+            }
+        }
+    }
+    for (size_t i = 0; i < workspace_cpu.nr_workspace(); ++i) {
+        cuda_check(cudaMemcpyAsync(workspace_gpu.get(i),
+                                   workspace_cpu.get(i),
+                                   workspace_cpu.get_size(i),
+                                   cudaMemcpyHostToDevice,
+                                   stream));
+    }
+    /*
+    CUDA_CK(cudaMemcpyAsync(workspace_gpu_raw, workspace_cpu_raw,
+                            workspace_cpu.total_size_in_bytes(),
+                            cudaMemcpyHostToDevice,
+                            stream));
+    */
+    cuda_check(cudaStreamAddCallback(stream, callback_free,
+                                     static_cast<void*>(workspace_cpu_raw), 0));
+    concat::forward_proxy(srcs_gpu, dst.ptr<T>(), srcs.size(),
+                          A, B, C,
+                          Bv_gpu,
+                          table_outer_gpu,
+                          table_inner_gpu,
+                          stream);
+}
+
+void 
ConcatForwardImpl::exec(_megdnn_in const TensorNDArray &srcs, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) +{ +#define cb(DType) \ + if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + exec_internal(srcs, dst, workspace); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/concat/opr_impl.h b/dnn/src/cuda/concat/opr_impl.h new file mode 100644 index 00000000..3625c51e --- /dev/null +++ b/dnn/src/cuda/concat/opr_impl.h @@ -0,0 +1,36 @@ +/** + * \file dnn/src/cuda/concat/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class ConcatForwardImpl: public ConcatForward { + public: + using ConcatForward::ConcatForward; + void exec(_megdnn_in const TensorNDArray &srcs, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayoutArray &, + const TensorLayout &) override; + private: + template + void exec_internal(_megdnn_in const TensorNDArray &srcs, + _megdnn_tensor_out dst, + _megdnn_workspace workspace); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/cond_take/kern.cu b/dnn/src/cuda/cond_take/kern.cu new file mode 100644 index 00000000..159e334f --- /dev/null +++ b/dnn/src/cuda/cond_take/kern.cu @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/cond_take/kern.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" +#include "src/cuda/cumsum/kern_impl.cuinl" +#include "src/cuda/query_blocksize.cuh" +#include "src/common/cond_take/predicate.cuh" +#include + +using namespace megdnn; +using namespace megdnn::cond_take; +using namespace megdnn::cuda::cond_take; + +size_t cuda::cond_take::gen_idx_get_workspace_size(size_t size) { + megdnn_assert(size < std::numeric_limits::max()); + return cumsum::get_workspace_in_bytes(1, size, 1, sizeof(IdxType)); +} + +// vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/cond_take/kern.cuh b/dnn/src/cuda/cond_take/kern.cuh new file mode 100644 index 00000000..3cbc251b --- /dev/null +++ b/dnn/src/cuda/cond_take/kern.cuh @@ -0,0 +1,56 @@ +/** + * \file dnn/src/cuda/cond_take/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "megdnn/dtype.h" +#include "src/common/cond_take/predicate.cuh" +#include + +namespace megdnn { +namespace cuda { +namespace cond_take { + +typedef dt_int32 IdxType; + +/*! + * \brief generate indices to take according to mask + * \param dest_idx output index, must be size+1 long + * \param size number of elements in mask + * \return output size; i.e. number of elements taken + */ +template +size_t gen_idx( + void *workspace, size_t workspace_size, + IdxType *dest_idx, const T *mask, size_t size, + uint32_t mode, const megdnn::cond_take::KParam &kparam, + cudaStream_t stream); + +//! get workspace size in bytes for gen_idx() +size_t gen_idx_get_workspace_size(size_t size); + +/*! + * \brief copy to final output + * \param[out] dest_data data output, size is returned by gen_idx() + * \param[out] dest_idx index output, size is returned by gen_idx() + * \param src_data data input + * \param src_idx index input, must have been filled by gen_idx() + * \param size size of original mask + */ +template +void copy_output(T *dest_data, IdxType *dest_idx, + const T *src_data, IdxType *src_idx, uint32_t size, + cudaStream_t stream); + +} // namespace cond_take +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/cond_take/kern.inl b/dnn/src/cuda/cond_take/kern.inl new file mode 100644 index 00000000..75f55de8 --- /dev/null +++ b/dnn/src/cuda/cond_take/kern.inl @@ -0,0 +1,131 @@ +/** + * \file dnn/src/cuda/cond_take/kern.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" +#include "src/cuda/cumsum/kern_impl.cuinl" +#include "src/cuda/query_blocksize.cuh" +#include "src/common/cond_take/predicate.cuh" +#include + +using namespace megdnn; +using namespace megdnn::cond_take; +using namespace megdnn::cuda::cond_take; + +namespace { + + //! 
cumsum opr to get output index + template + struct IdxGetter { + typedef ::megdnn::cuda::cumsum::SumOp ContigOp; + + const T * data; + Pred pred; + + IdxGetter(const T *d, const ::megdnn::cond_take::KParam &p): + data(d), pred(p) + {} + + __host__ __device__ static IdxType init() { + return 0; + } + + __device__ static IdxType apply(IdxType lhs, IdxType rhs) { + return lhs + rhs; + } + + __device__ IdxType visit(uint32_t idx) const { + return pred(data[idx]); + } + + static ContigOp make_contig(const IdxType *data) { + return ContigOp(data); + } + }; + + template + __global__ void copy_kern( + T *dest_data, IdxType *dest_idx, + const T *src_data, const IdxType *src_idx, uint32_t size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < size && src_idx[tid] > src_idx[tid - 1]) { + uint32_t v = src_idx[tid] - 1; + dest_data[v] = src_data[tid]; + dest_idx[v] = tid; + } + } + + // set zero for the first element + __global__ void set_zero(IdxType *dest) { + dest[0] = 0; + } + +} // anonymous namespace + +template +size_t cuda::cond_take::gen_idx( + void *workspace, size_t workspace_size, + IdxType *dest_idx, const T *mask, size_t size, + uint32_t mode, const KParam &kparam, cudaStream_t stream) { + + switch (mode) { +#define cb(_m) case PEnum::_m: \ + { \ + typedef IdxGetter Op; \ + cuda::cumsum::run_kern( \ + dest_idx + 1, workspace, workspace_size, \ + 1, size, 1, Op(mask, kparam), stream); \ + break; \ + } + MEGDNN_FOREACH_COND_TAKE_MODE(cb) +#undef cb + default: + megdnn_trap(); + } + + IdxType host_sum_size; + cuda_check(cudaMemcpyAsync(&host_sum_size, dest_idx + size, sizeof(IdxType), + cudaMemcpyDeviceToHost, stream)); + cuda_check(cudaStreamSynchronize(stream)); + return host_sum_size; +} + +template +void cuda::cond_take::copy_output(T *dest_data, IdxType *dest_idx, + const T *src_data, IdxType *src_idx, uint32_t size, + cudaStream_t stream) { + int nr_thread = query_blocksize_for_kernel(copy_kern); + int nr_block = DIVUP(size, nr_thread); + set_zero <<< 1, 1, 0, stream >>> (src_idx); + copy_kern <<< nr_block, nr_thread, 0, stream >>> ( + dest_data, dest_idx, src_data, src_idx + 1, size); + after_kernel_launch(); +} + +namespace megdnn { +namespace cuda { +namespace cond_take { + +#define inst_genidx(dt) \ + template size_t gen_idx( \ + void*, size_t, IdxType*, const DTypeTrait
<dt>::ctype*, \ + size_t, uint32_t, const KParam &, cudaStream_t); + +#define inst_copy_(ct) \ + template void copy_output<ct>(ct*, IdxType*, const ct*, \ + IdxType*, uint32_t, cudaStream_t); +#define inst_copy(dt) inst_copy_(DTypeTrait<dt>
::ctype) + +} // namespace cond_take +} // namespace cuda +} // namespace megdnn + + +// vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/cond_take/kimpl/dt_float16.cu b/dnn/src/cuda/cond_take/kimpl/dt_float16.cu new file mode 100644 index 00000000..d8da9cab --- /dev/null +++ b/dnn/src/cuda/cond_take/kimpl/dt_float16.cu @@ -0,0 +1,29 @@ +/** + * \file dnn/src/cuda/cond_take/kimpl/dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cond_take_kern_impls.py +#include "../kern.inl" + +#if !MEGDNN_DISABLE_FLOAT16 +namespace megdnn { +namespace cuda { +namespace cond_take { + +inst_genidx(::megdnn::dtype::Float16) +#undef inst_genidx + +inst_copy(::megdnn::dtype::Float16) +#undef inst_copy +#undef inst_copy_ + +} // cond_take +} // cuda +} // megdnn +#endif diff --git a/dnn/src/cuda/cond_take/kimpl/dt_float32.cu b/dnn/src/cuda/cond_take/kimpl/dt_float32.cu new file mode 100644 index 00000000..991e650d --- /dev/null +++ b/dnn/src/cuda/cond_take/kimpl/dt_float32.cu @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/cond_take/kimpl/dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cond_take_kern_impls.py +#include "../kern.inl" + +namespace megdnn { +namespace cuda { +namespace cond_take { + +inst_genidx(::megdnn::dtype::Float32) +#undef inst_genidx + +inst_copy(::megdnn::dtype::Float32) +#undef inst_copy +#undef inst_copy_ + +} // cond_take +} // cuda +} // megdnn diff --git a/dnn/src/cuda/cond_take/kimpl/dt_int16.cu b/dnn/src/cuda/cond_take/kimpl/dt_int16.cu new file mode 100644 index 00000000..8836c0e3 --- /dev/null +++ b/dnn/src/cuda/cond_take/kimpl/dt_int16.cu @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/cond_take/kimpl/dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cond_take_kern_impls.py +#include "../kern.inl" + +namespace megdnn { +namespace cuda { +namespace cond_take { + +inst_genidx(::megdnn::dtype::Int16) +#undef inst_genidx + +inst_copy(::megdnn::dtype::Int16) +#undef inst_copy +#undef inst_copy_ + +} // cond_take +} // cuda +} // megdnn diff --git a/dnn/src/cuda/cond_take/kimpl/dt_int32.cu b/dnn/src/cuda/cond_take/kimpl/dt_int32.cu new file mode 100644 index 00000000..86f58156 --- /dev/null +++ b/dnn/src/cuda/cond_take/kimpl/dt_int32.cu @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/cond_take/kimpl/dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cond_take_kern_impls.py +#include "../kern.inl" + +namespace megdnn { +namespace cuda { +namespace cond_take { + +inst_genidx(::megdnn::dtype::Int32) +#undef inst_genidx + +inst_copy(::megdnn::dtype::Int32) +#undef inst_copy +#undef inst_copy_ + +} // cond_take +} // cuda +} // megdnn diff --git a/dnn/src/cuda/cond_take/kimpl/dt_int8.cu b/dnn/src/cuda/cond_take/kimpl/dt_int8.cu new file mode 100644 index 00000000..8e0d6ad5 --- /dev/null +++ b/dnn/src/cuda/cond_take/kimpl/dt_int8.cu @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/cond_take/kimpl/dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cond_take_kern_impls.py +#include "../kern.inl" + +namespace megdnn { +namespace cuda { +namespace cond_take { + +inst_genidx(::megdnn::dtype::Int8) +#undef inst_genidx + +inst_copy(::megdnn::dtype::Int8) +#undef inst_copy +#undef inst_copy_ + +} // cond_take +} // cuda +} // megdnn diff --git a/dnn/src/cuda/cond_take/kimpl/dt_uint8.cu b/dnn/src/cuda/cond_take/kimpl/dt_uint8.cu new file mode 100644 index 00000000..08e3ff8c --- /dev/null +++ b/dnn/src/cuda/cond_take/kimpl/dt_uint8.cu @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/cond_take/kimpl/dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cond_take_kern_impls.py +#include "../kern.inl" + +namespace megdnn { +namespace cuda { +namespace cond_take { + +inst_genidx(::megdnn::dtype::Uint8) +#undef inst_genidx + +inst_copy(::megdnn::dtype::Uint8) +#undef inst_copy +#undef inst_copy_ + +} // cond_take +} // cuda +} // megdnn diff --git a/dnn/src/cuda/cond_take/opr_impl.cpp b/dnn/src/cuda/cond_take/opr_impl.cpp new file mode 100644 index 00000000..4e5191c7 --- /dev/null +++ b/dnn/src/cuda/cond_take/opr_impl.cpp @@ -0,0 +1,92 @@ +/** + * \file dnn/src/cuda/cond_take/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./opr_impl.h" +#include "./kern.cuh" +#include "src/common/utils.h" +#include "src/common/cond_take/predicate.cuh" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace cuda::cond_take; +using namespace megdnn::cond_take; + +using Param = CondTake::Param; + +WorkspaceBundle CondTakeImpl::make_bundle(size_t nr_item) { + cuda_check(cudaSetDevice(concrete_handle(handle())->device_id())); + auto gen_idx_wk_size = gen_idx_get_workspace_size(nr_item); + return {nullptr, + {(nr_item + 1) * sizeof(IdxType), gen_idx_wk_size}, + handle()->alignment_requirement()}; +} + +size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) { + return make_bundle(data.total_nr_elems()).total_size_in_bytes(); +} + +CondTakeImpl::Output CondTakeImpl::exec( + _megdnn_tensor_in data, _megdnn_tensor_in mask, + _megdnn_workspace workspace, + DynOutMallocPolicyCall malloc_policy) { + size_t size = check_exec_get_size(data.layout, mask.layout, workspace.size); + auto wk_bundle = make_bundle(size); + wk_bundle.set(workspace.raw_ptr); + + auto idx_tmp = static_cast(wk_bundle.get(0)); + + KParam kparam(param()); + auto stream = cuda_stream(handle()); + size_t out_size; + switch (mask.layout.dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + out_size = gen_idx(wk_bundle.get(1), wk_bundle.get_size(1), \ + idx_tmp, mask.ptr(), \ + size, static_cast(param().mode), kparam, \ + stream); \ + break; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + default: + megdnn_throw("bad mask dtype"); + } + + auto out_data = malloc_policy.alloc_output(0, + data.layout.dtype, {out_size}); + auto out_idx = malloc_policy.alloc_output(1, dtype::Int32(), {out_size}); + auto out_idx_ptr = out_idx.ptr(); + + switch (data.layout.dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + auto out_data_ptr = out_data.ptr(); \ + auto data_ptr = data.ptr(); \ + copy_output( \ + out_data_ptr, out_idx_ptr, data_ptr, idx_tmp, size, \ + stream); \ + break; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + default: + megdnn_throw("bad data dtype"); + } + + return {{out_data, out_idx}}; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/cond_take/opr_impl.h b/dnn/src/cuda/cond_take/opr_impl.h new file mode 100644 index 00000000..1dcdad20 --- /dev/null +++ b/dnn/src/cuda/cond_take/opr_impl.h @@ -0,0 +1,35 @@ +/** + * \file dnn/src/cuda/cond_take/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "megdnn/oprs/general.h" +#include "src/common/utils.h" + +namespace megdnn { +namespace cuda { + +class CondTakeImpl final: public CondTake { + WorkspaceBundle make_bundle(size_t nr_item); + + public: + using CondTake::CondTake; + Output exec( + _megdnn_tensor_in data, _megdnn_tensor_in mask, + _megdnn_workspace workspace, + DynOutMallocPolicyCall malloc_policy) override; + + size_t get_workspace_in_bytes(const TensorLayout& data) override; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/1x1.cpp b/dnn/src/cuda/conv_bias/1x1.cpp new file mode 100644 index 00000000..0dab0071 --- /dev/null +++ b/dnn/src/cuda/conv_bias/1x1.cpp @@ -0,0 +1,113 @@ +/** + * \file dnn/src/cuda/conv_bias/1x1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/handle.h" +#include "src/cuda/utils.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +bool ConvBiasForwardImpl::Algo1x1::is_available(const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; + + auto&& fm = args.filter_meta; + return fm.format == Param::Format::NCHW && + (fm.dtype.enumv() == DTypeEnum::Float32 || + fm.dtype.enumv() == DTypeEnum::Float16) && + fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 && + fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 && + fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 && + fm.stride[1] == 1; +} + +void ConvBiasForwardImpl::Algo1x1::extract_matmul_layouts(const SizeArgs& args, + TensorLayout& A, + TensorLayout& B, + TensorLayout& C) { + auto&& fm = args.filter_meta; + A = {{fm.ocpg, fm.icpg}, fm.dtype}; + B.ndim = 2; + B.shape[0] = args.src_layout->shape[1]; + B.shape[1] = args.src_layout->shape[2] * args.src_layout->shape[3]; + B.stride[0] = args.src_layout->stride[1]; + B.stride[1] = 1; + B.dtype = args.src_layout->dtype; + C = {{args.dst_layout->shape[1], B.shape[1]}, args.dst_layout->dtype}; +} + +WorkspaceBundle ConvBiasForwardImpl::Algo1x1::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + SmallVector sizes; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + sizes.push_back(dst_layout.span().dist_byte()); + } + + SizeArgs conv_args = args; + conv_args.dst_layout = &dst_layout; + TensorLayout A, B, C; + extract_matmul_layouts(conv_args, A, B, C); + sizes.insert(sizes.begin(), + args.handle->matmul_opr()->get_workspace_in_bytes(A, B, C)); + return {ptr, std::move(sizes)}; +} + +size_t ConvBiasForwardImpl::Algo1x1::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void ConvBiasForwardImpl::Algo1x1::exec(const ExecArgs& args) const { + auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); + auto conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + 
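+        // The dst dtype requested by the caller differs from the bias dtype,
+        // so the raw convolution result is first written into the extra
+        // buffer reserved as slot 1 of the workspace bundle (see
+        // get_workspace_bundle()), using the dtype deduced from src/filter;
+        // handle_bias_and_nonlinear() at the end of exec() then adds the
+        // bias, applies the activation and converts into the real dst tensor.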
conv_dst_tensor.raw_ptr = bundle.get(1); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + + ExecArgs conv_args = args; + conv_args.dst_tensor = &conv_dst_tensor; + conv_args.dst_layout = &conv_dst_tensor.layout; + { + TensorND A, B, C; + extract_matmul_layouts(conv_args, A.layout, B.layout, C.layout); + A.raw_ptr = conv_args.filter_tensor->raw_ptr; + B.raw_ptr = conv_args.src_tensor->raw_ptr; + C.raw_ptr = conv_args.dst_tensor->raw_ptr; + size_t batch = conv_args.src_layout->shape[0]; + auto mm = conv_args.handle->matmul_opr(); + auto strd_B = conv_args.src_layout->stride[0] * + conv_args.src_layout->dtype.size(), + strd_C = conv_args.dst_layout->stride[0] * + conv_args.dst_layout->dtype.size(); + for (size_t i = 0; i < batch; ++i) { + mm->exec(A, B, C, bundle.get_workspace(0)); + incr_voidp(B.raw_ptr, strd_B); + incr_voidp(C.raw_ptr, strd_C); + } + } + handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp new file mode 100644 index 00000000..fa015d33 --- /dev/null +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -0,0 +1,256 @@ +/** + * \file dnn/src/cuda/conv_bias/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +ConvBiasForwardImpl::AlgoPack::AlgoPack() { + non_cudnn_algos.push_back(&chanwise); + non_cudnn_algos.push_back(&chanwise_small); + + non_cudnn_algos.push_back(&inplace_matmul); + non_cudnn_algos.push_back(&matmul); + non_cudnn_algos.push_back(&matmul8x8x32); + non_cudnn_algos.push_back(&batched_matmul); + non_cudnn_algos.push_back(&a1x1); + + fill_cudnn_algos(); + for (auto&& algo : cudnn_conv_bias_activations) { + all_algos.push_back(&algo); + } + + //! add conv+nonlinear algos + std::vector conv_algos; + conv_algos.push_back(&chanwise); + conv_algos.push_back(&chanwise_small); + conv_algos.push_back(&chanwise8x8x32); + for (auto&& algo : cudnn_convs) { + conv_algos.push_back(&algo); + } + conv_algos.push_back(&inplace_matmul); + conv_algos.push_back(&matmul); + conv_algos.push_back(&matmul8x8x32); + conv_algos.push_back(&batched_matmul); + conv_algos.push_back(&a1x1); + + conv_algos.reserve(conv_algos.size() * 2); + //! 
add gconv algos by AlgoGroupConvGeneral + size_t algo_size = conv_algos.size(); + for (size_t i = 3; i < algo_size; ++ i) { + gconv_refhold.emplace_back(new AlgoGroupConvGeneral(conv_algos[i])); + algo2gconv[conv_algos[i]] = gconv_refhold.back().get(); + conv_algos.push_back(gconv_refhold.back().get()); + } + + for (auto&& algo : conv_algos) { + all_algos.push_back(algo); + } + non_cudnn_algos.push_back(all_algos.rbegin()[4]); // group inplace_matmul + non_cudnn_algos.push_back(all_algos.rbegin()[3]); // group matmul + non_cudnn_algos.push_back(all_algos.rbegin()[2]); // group matmul_8x8x32 + non_cudnn_algos.push_back(all_algos.rbegin()[1]); // group batched_matmul + non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group 1x1 + + size_t all_algo_size = all_algos.size(); +#if CUDA_VERSION >= 10000 + fill_imma_algos(); + all_algos.push_back(&wmma_quint4x4x32); + for (auto&& algo : int8_nchw4_imma) { + all_algos.push_back(&algo); + } + for (auto&& algo : int8_chwn4_imma) { + all_algos.push_back(&algo); + } + for (auto&& algo : int8_chwn4_imma_reorder_filter) { + all_algos.push_back(&algo); + } + for (auto&& algo : int8_chwn4_imma_unroll_width) { + all_algos.push_back(&algo); + } +#endif + all_algos.push_back(&int8_nchw4_dotprod); + all_algos.push_back(&int8_chwn4_dotprod); + for (size_t i = all_algo_size; i < all_algos.size(); ++i) { + non_cudnn_algos.push_back(all_algos[i]); + } +} + +ConvBiasForwardImpl::AlgoPack ConvBiasForwardImpl::sm_algo_pack; + +ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvBiasForwardImpl* o, + const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst) + : SizeArgs(o, src, filter, o->check_layout_fwd(src, filter, dst), bias, + z, dst) {} + +ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( + ConvBiasForwardImpl* o, const TensorLayout& src, + const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst) + : BiasForwardSizeArgs{concrete_handle(o->handle()), + &src, + &filter, + &bias, + &z, + filter_meta, + &dst, + o->param().nonlineMode}, + opr{o} {} + +ConvBiasForwardImpl::AlgoBase::ExecArgs::ExecArgs( + ConvBiasForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z, + _megdnn_tensor_out dst, _megdnn_workspace workspace) + : SizeArgs(opr, src.layout, filter.layout, bias.layout, z.layout, + dst.layout), + src_tensor{&src}, + filter_tensor{&filter}, + bias_tensor{&bias}, + z_tensor{&z}, + dst_tensor{&dst}, + workspace{workspace} {} + +std::string ConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const { + auto&& fm = filter_meta; + MEGDNN_MARK_USED_VAR(fm); + std::string nonlinear_mode_str; + switch (nonlinear_mode) { + case param::ConvBias::NonlineMode::RELU: + nonlinear_mode_str = "RELU"; + break; + case param::ConvBias::NonlineMode::SIGMOID: + nonlinear_mode_str = "SIGMOID"; + break; + case param::ConvBias::NonlineMode::IDENTITY: + nonlinear_mode_str = "IDENTITY"; + break; + default: + megdnn_throw("invalid conv bias nonlinear mode"); + } + return megdnn_mangle(ssprintf( + "src=%s, filter=%u{%u,%u,%u,%u}, dst=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s, " + "nonlinear_mode=%s", + src_layout->to_string().c_str(), fm.group, fm.ocpg, fm.icpg, + fm.spatial[0], fm.spatial[1], dst_layout->to_string().c_str(), + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + fm.dilation[0], fm.dilation[1], !fm.should_flip, 
+ src_layout->dtype.name(), dst_layout->dtype.name(), + nonlinear_mode_str.c_str())); +} + +void ConvBiasForwardImpl::AlgoPack::fill_cudnn_algos() { +#define V1(v) #v +#define V(v) V1(v) + +#define DEF_ALGO(NAME, REPROD) \ + cudnn_conv_bias_activations.push_back( \ + {REPROD, \ + "CUDNN:ConvBiasActivation:" #NAME \ + "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL), \ + NAME}); \ + cudnn_convs.push_back( \ + {REPROD, \ + "CUDNN:Convolution:" #NAME \ + "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL), \ + NAME}) + + DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true); + DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true); + DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_GEMM, true); + DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, true); + DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT, true); + DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true); + +#if CUDNN_MAJOR >= 5 + DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, true); +#if CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1 + DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, true); +#endif +#endif + +#if !(CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1) +#pragma message "not latest cudnn" +#endif + +#undef DEF_ALGO + +#undef V +#undef V1 +} + +#if CUDA_VERSION >= 10000 +void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { + int8_chwn4_imma.push_back( + {AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize::IMMA16x16x16}); + int8_chwn4_imma.push_back( + {AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize::IMMA32x8x16}); + int8_chwn4_imma.push_back( + {AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize::IMMA8x32x16}); + int8_nchw4_imma.push_back( + {AlgoInt8NCHW4IMMAImplicitGemm::MMATileSize::IMMA16x16x16}); + int8_nchw4_imma.push_back( + {AlgoInt8NCHW4IMMAImplicitGemm::MMATileSize::IMMA32x8x16}); + int8_nchw4_imma.push_back( + {AlgoInt8NCHW4IMMAImplicitGemm::MMATileSize::IMMA8x32x16}); + int8_chwn4_imma_reorder_filter.push_back( + {AlgoInt8CHWN4IMMAImplicitGemmReorderFilter::MMATileSize:: + IMMA16x16x16}); + int8_chwn4_imma_reorder_filter.push_back( + {AlgoInt8CHWN4IMMAImplicitGemmReorderFilter::MMATileSize:: + IMMA32x8x16}); + int8_chwn4_imma_reorder_filter.push_back( + {AlgoInt8CHWN4IMMAImplicitGemmReorderFilter::MMATileSize:: + IMMA8x32x16}); + int8_chwn4_imma_unroll_width.push_back( + {AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth::MMATileSize:: + IMMA16x16x16}); + int8_chwn4_imma_unroll_width.push_back( + {AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth::MMATileSize:: + IMMA32x8x16}); + int8_chwn4_imma_unroll_width.push_back( + {AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth::MMATileSize:: + IMMA8x32x16}); +} +#endif + +ConvBiasForwardImpl::AlgoBase* +ConvBiasForwardImpl::AlgoPack::cudnn_conv_from_enum( + cudnnConvolutionFwdAlgo_t algo) { + for (auto&& i : cudnn_convs) { + if (i.cudnn_enum() == algo) + return &i; + } + megdnn_throw( + megdnn_mangle(ssprintf("can not find cudnn conv fwd algorithm %d", + static_cast(algo)))); +} + +ConvBiasForwardImpl::AlgoBase* +ConvBiasForwardImpl::AlgoPack::cudnn_conv_bias_act_from_enum( + cudnnConvolutionFwdAlgo_t algo) { + for (auto&& i : cudnn_conv_bias_activations) { + if (i.cudnn_enum() == algo) + return &i; + } + megdnn_throw(megdnn_mangle( + ssprintf("can not find cudnn conv bias act algorithm %d", + static_cast(algo)))); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h new file mode 100644 index 00000000..7b7a042e --- /dev/null +++ b/dnn/src/cuda/conv_bias/algo.h @@ -0,0 +1,550 @@ +/** + * \file dnn/src/cuda/conv_bias/algo.h + * MegEngine is Licensed under the 
Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +#include "src/cuda/conv_bias/helper.h" +#include "src/cuda/conv_bias/opr_impl.h" +#include "src/cuda/handle.h" +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" + +#include +#include +#include + +namespace megdnn { +namespace cuda { + +/*! + * \brief base class for conv bias algos + * + * All the algo impls should try to support non-contiguous batch dim, for group + * conv execution. + */ +class ConvBiasForwardImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs : public conv_bias::BiasForwardSizeArgs { + ConvBiasForwardImpl* opr; + + std::string to_string() const; + SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src, + const TensorLayout& filter, const TensorLayout& bias, + const TensorLayout& z, const TensorLayout& dst); + SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src, + const TensorLayout& filter, + const CanonizedFilterMeta& filter_meta, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst); + + void init_conv_bias_desc(conv_bias::CUDNNForwardDescs& desc) const { + desc.set_conv_bias(*src_layout, filter_meta, *dst_layout, + *bias_layout, *z_layout, opr->param()); + } + + void init_conv_desc(conv_bias::CUDNNForwardDescs& desc) const { + desc.set_conv(*src_layout, filter_meta, *dst_layout, opr->param()); + } + }; + struct ExecArgs : public SizeArgs { + const TensorND *src_tensor, *filter_tensor, *bias_tensor, *z_tensor, + *dst_tensor; + Workspace workspace; + + ExecArgs(ConvBiasForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in filter, _megdnn_tensor_in bias, + _megdnn_tensor_in z, _megdnn_tensor_out dst, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert( + req <= workspace.size, + "conv bias fwd algo %s: required workspace %zu bytes, got %zu", + name(), req, workspace.size); + return *this; + } + + virtual bool is_cudnn() const { return false; } +}; + +class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation final : public AlgoBase { +public: + AlgoCUDNNConvBiasActivation(bool is_reproducible, const char* name, + cudnnConvolutionFwdAlgo_t cudnn_enum) + : m_is_reproducible(is_reproducible), + m_name(ConvBiasForward::algo_name(name, {})), + m_cudnn_enum(cudnn_enum) {} + + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + param::Convolution get_param_convolution(const SizeArgs& args) const; + bool 
is_available(const SizeArgs&) const override; + + const char* name() const override { return m_name.c_str(); } + + bool is_reproducible() const override { return m_is_reproducible; } + + cudnnConvolutionFwdAlgo_t cudnn_enum() { return m_cudnn_enum; } + + bool is_cudnn() const override { return true; } + +private: + bool m_is_reproducible; + std::string m_name; + cudnnConvolutionFwdAlgo_t m_cudnn_enum; +}; + +class ConvBiasForwardImpl::AlgoChanwise final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { + if (m_name.empty()) { + m_name = + ConvBiasForward::algo_name("CHANNEL_WISE", {}); + } + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } + +private: + mutable std::string m_name; +}; + +class ConvBiasForwardImpl::AlgoChanwiseSmall final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { + if (m_name.empty()) { + m_name = ConvBiasForward::algo_name( + "CHANNEL_WISE_SMALL", {}); + } + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } + +private: + mutable std::string m_name; +}; + +class ConvBiasForwardImpl::AlgoChanwise8x8x32 final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { + if (m_name.empty()) { + m_name = ConvBiasForward::algo_name( + "CHANNEL_WISE_8X8X32", {}); + } + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } + +private: + mutable std::string m_name; +}; + +class ConvBiasForwardImpl::AlgoCUDNNConv final : public AlgoBase { +public: + AlgoCUDNNConv(bool is_reproducible, const char* name, + cudnnConvolutionFwdAlgo_t cudnn_enum) + : m_is_reproducible(is_reproducible), + m_name(ConvBiasForward::algo_name(name, {})), + m_cudnn_enum(cudnn_enum) {} + + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return m_is_reproducible; } + + const char* name() const override { return m_name.c_str(); } + + cudnnConvolutionFwdAlgo_t cudnn_enum() const { return m_cudnn_enum; } + + bool is_cudnn() const override { return true; } +private: + bool m_is_reproducible; + std::string m_name; + cudnnConvolutionFwdAlgo_t m_cudnn_enum; + + WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; +}; + +//! compute small matmul in the kernel +class ConvBiasForwardImpl::AlgoInplaceMatmul final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { + if (m_name.empty()) { + m_name = ConvBiasForward::algo_name( + "INPLACE_MATMUL", {}); + } + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } + +private: + mutable std::string m_name; +}; + +//! 
im2col and matmul, with dilation +class ConvBiasForwardImpl::AlgoMatmul final : public AlgoBase { + template + static void exec_internal(const ExecArgs& args, + const WorkspaceBundle& bundle); + +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { + if (m_name.empty()) { + m_name = ConvBiasForward::algo_name( + "MATMUL", {}); + } + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } + +private: + WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; + + mutable std::string m_name; +}; + +class ConvBiasForwardImpl::AlgoMatmul8x8x32 final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { + if (m_name.empty()) { + m_name = ConvBiasForward::algo_name( + "MATMUL8X8X32", {}); + } + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } + +private: + bool need_src_unroll(const SizeArgs& args) const; + bool need_filter_reshape(const SizeArgs& args) const; + template + WorkspaceBundle get_bundle(const SizeArgs& args) const; + template + void exec_internal(const ExecArgs& args) const; + mutable std::string m_name; +}; + +//! optimized 1x1 conv +class ConvBiasForwardImpl::Algo1x1 final : public AlgoBase { + static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A, + TensorLayout& B, TensorLayout& C); + +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { + if (m_name.empty()) { + m_name = ConvBiasForward::algo_name( + "MATMUL1X1", {}); + } + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } + +private: + WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; + mutable std::string m_name; +}; + +class ConvBiasForwardImpl::AlgoBatchedMatmul final : public AlgoBase { + static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A, + TensorLayout& B, TensorLayout& C); + +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { + if (m_name.empty()) { + m_name = ConvBiasForward::algo_name( + "BATCHEDMATMUL", {}); + } + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } + +private: + WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; + mutable std::string m_name; +}; + +//! 
implement group conv by another algo +class ConvBiasForwardImpl::AlgoGroupConvGeneral final : public AlgoBase { +public: + AlgoGroupConvGeneral(AlgoBase* impl); + + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return m_name.c_str(); } + + bool is_reproducible() const override { return m_impl->is_reproducible(); } + + static void modify_size_args(SizeArgs& args, TensorLayout& src_pg, + TensorLayout& dst_pg, TensorLayout& bias_pg); + +private: + WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; + AlgoBase* m_impl; + std::string m_name; +}; + +#if CUDA_VERSION >= 10000 +class ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA final : public AlgoBase { +public: + AlgoQUInt4x4x32WMMA() = default; + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { return "QUINT4x4x32_WMMA"; } + bool is_reproducible() const override { return true; } +private: + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const; + bool use_kernel_fhxfw(const SizeArgs& args) const; + size_t get_workspace_in_bytes_do_conv(const SizeArgs& args) const; +}; +#endif + +class ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm final + : public AlgoBase { +public: + AlgoInt8CHWN4DotProdImplicitGemm() = default; + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { + return "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"; + } + bool is_reproducible() const override { return true; } + template + static void dispatch_nonlinear_mode( + const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, const int8_t* d_z, int8_t* d_dst, + const convolution::ConvParam& param, float alpha, float beta, + float gamma, float scale, cudaStream_t stream, + param::ConvBias::NonlineMode nonlinear_mode); +}; + +class ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm final + : public AlgoBase { +public: + AlgoInt8NCHW4DotProdImplicitGemm() = default; + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { + return "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"; + } + bool is_reproducible() const override { return true; } + +private: + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; +}; + +#if CUDA_VERSION >= 10000 +class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm final + : public AlgoBase { +public: + enum class MMATileSize : uint32_t { + IMMA16x16x16, + IMMA32x8x16, + IMMA8x32x16 + }; + AlgoInt8CHWN4IMMAImplicitGemm(MMATileSize mma_tile_size) + : m_mma_tile_size{mma_tile_size}, + m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_" + + to_string(m_mma_tile_size)} {} + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } + template + static void dispatch_nonlinear_mode( + const int8_t* d_src, const 
int8_t* d_filter, + BiasVisitor bias_visitor, int8_t* d_z, int8_t* d_dst, + const convolution::ConvParam& param, float alpha, float beta, + float gamma, float scale, cudaStream_t stream, + param::ConvBias::NonlineMode nonlinear_mode, + MMATileSize mma_tile_size); + static std::string to_string(MMATileSize mma_tile_size); + +private: + MMATileSize m_mma_tile_size; + std::string m_name; +}; + +class ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm final + : public AlgoBase { +public: + using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize; + AlgoInt8NCHW4IMMAImplicitGemm(MMATileSize mma_tile_size) + : m_mma_tile_size{mma_tile_size}, + m_name{"INT8_NCHW4_IMMA_IMPLICIT_GEMM_" + + AlgoInt8CHWN4IMMAImplicitGemm::to_string( + m_mma_tile_size)} {} + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { + return m_name.c_str(); + } + bool is_reproducible() const override { return true; } +private: + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; + MMATileSize m_mma_tile_size; + std::string m_name; +}; + +class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter final + : public AlgoBase { +public: + using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize; + AlgoInt8CHWN4IMMAImplicitGemmReorderFilter(MMATileSize mma_tile_size) + : m_mma_tile_size{mma_tile_size}, + m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_" + + AlgoInt8CHWN4IMMAImplicitGemm::to_string( + m_mma_tile_size)} {} + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { return m_name.c_str(); } + bool is_reproducible() const override { return true; } + +private: + MMATileSize m_mma_tile_size; + std::string m_name; +}; + +class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth final + : public AlgoBase { +public: + using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize; + AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth(MMATileSize mma_tile_size) + : m_mma_tile_size{mma_tile_size}, + m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_" + + AlgoInt8CHWN4IMMAImplicitGemm::to_string( + m_mma_tile_size)} {} + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { return m_name.c_str(); } + bool is_reproducible() const override { return true; } + +private: + MMATileSize m_mma_tile_size; + std::string m_name; +}; +#endif + +class ConvBiasForwardImpl::AlgoPack { + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + + std::vector all_algos, + //! 
non-cudnn algos, used for heuristic if cudnn is not supported + non_cudnn_algos; + std::vector cudnn_conv_bias_activations; + std::vector cudnn_convs; + AlgoChanwise chanwise; + AlgoChanwiseSmall chanwise_small; + AlgoChanwise8x8x32 chanwise8x8x32; + AlgoInplaceMatmul inplace_matmul; + AlgoMatmul matmul; + AlgoMatmul8x8x32 matmul8x8x32; + AlgoBatchedMatmul batched_matmul; + Algo1x1 a1x1; + AlgoInt8NCHW4DotProdImplicitGemm int8_nchw4_dotprod; + AlgoInt8CHWN4DotProdImplicitGemm int8_chwn4_dotprod; +#if CUDA_VERSION >= 10000 + AlgoQUInt4x4x32WMMA wmma_quint4x4x32; + std::vector int8_chwn4_imma; + std::vector int8_nchw4_imma; + std::vector + int8_chwn4_imma_reorder_filter; + std::vector + int8_chwn4_imma_unroll_width; +#endif + std::vector> gconv_refhold; + std::unordered_map algo2gconv; + + AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo); + + AlgoBase* cudnn_conv_from_enum(cudnnConvolutionFwdAlgo_t algo); + +private: +#if CUDA_VERSION >= 10000 + void fill_imma_algos(); +#endif + void fill_cudnn_algos(); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/batched_matmul.cpp b/dnn/src/cuda/conv_bias/batched_matmul.cpp new file mode 100644 index 00000000..1b72a80b --- /dev/null +++ b/dnn/src/cuda/conv_bias/batched_matmul.cpp @@ -0,0 +1,120 @@ +/** + * \file dnn/src/cuda/conv_bias/batched_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/handle.h" +#include "src/cuda/utils.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +bool ConvBiasForwardImpl::AlgoBatchedMatmul::is_available( + const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; + + //! cudnn batched matmul with discontinuous stride has many bugs, so disable + //! here. 
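+    // Extract the implied batched GEMM layouts (A: filter, B: src viewed as
+    // {N, IC, H * W}, C: dst) and reject inputs whose B matrix is not
+    // contiguous, so availability checking falls back to other algorithms.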
+ TensorLayout A, B, C; + extract_matmul_layouts(args, A, B, C); + if (!B.is_contiguous()) { + return false; + } + auto&& fm = args.filter_meta; + return fm.format == Param::Format::NCHW && + (fm.dtype.enumv() == DTypeEnum::Float32 || + fm.dtype.enumv() == DTypeEnum::Float16) && + fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 && + fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 && + fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 && + fm.stride[1] == 1; +} + +void ConvBiasForwardImpl::AlgoBatchedMatmul::extract_matmul_layouts( + const SizeArgs& args, TensorLayout& A, TensorLayout& B, + TensorLayout& C) { + auto&& fm = args.filter_meta; + // A {N, OC, IC} + // B {N, IC, H * W} + // C {N, OC, H * W} + size_t batched = args.src_layout->shape[0]; + A = {{batched, fm.ocpg, fm.icpg}, fm.dtype}; + A.stride[0] = 0; + B.ndim = 3; + B.shape[1] = args.src_layout->shape[1]; + B.shape[2] = args.src_layout->shape[2] * args.src_layout->shape[3]; + B.shape[0] = batched; + B.stride[2] = 1; + B.stride[1] = args.src_layout->stride[1]; + B.stride[0] = args.src_layout->stride[0]; + B.dtype = args.src_layout->dtype; + C = {{args.dst_layout->shape[0], args.dst_layout->shape[1], B.shape[2]}, + args.dst_layout->dtype}; +} + +WorkspaceBundle ConvBiasForwardImpl::AlgoBatchedMatmul::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + SmallVector sizes; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + sizes.push_back(dst_layout.span().dist_byte()); + } + + SizeArgs conv_args = args; + conv_args.dst_layout = &dst_layout; + TensorLayout A, B, C; + extract_matmul_layouts(conv_args, A, B, C); + sizes.insert( + sizes.begin(), + args.handle->batched_matrix_mul()->get_workspace_in_bytes(A, B, C)); + return {ptr, std::move(sizes)}; +} + +size_t ConvBiasForwardImpl::AlgoBatchedMatmul::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void ConvBiasForwardImpl::AlgoBatchedMatmul::exec(const ExecArgs& args) const { + auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); + auto conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(1); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + + ExecArgs conv_args = args; + conv_args.dst_tensor = &conv_dst_tensor; + conv_args.dst_layout = &conv_dst_tensor.layout; + { + TensorND A, B, C; + extract_matmul_layouts(args, A.layout, B.layout, C.layout); + A.raw_ptr = args.filter_tensor->raw_ptr; + B.raw_ptr = args.src_tensor->raw_ptr; + C.raw_ptr = args.dst_tensor->raw_ptr; + auto mm = args.handle->batched_matrix_mul(); + mm->exec(A, B, C, bundle.get_workspace(0)); + } + handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise.cpp b/dnn/src/cuda/conv_bias/chanwise.cpp new file mode 100644 index 00000000..cdff851e --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise.cpp @@ -0,0 +1,101 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 
(the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/conv_bias/chanwise/kern.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +bool ConvBiasForwardImpl::AlgoChanwise::is_available( + const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; + + auto&& fm = args.filter_meta; + bool flag = args.filter_meta.format == Param::Format::NCHW && + args.src_layout->dtype.category() == DTypeCategory::FLOAT && + args.opr->param().compute_mode == Param::ComputeMode::DEFAULT && + fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 && + fm.dilation[1] == 1 && !fm.should_flip; + return flag; +} + +size_t ConvBiasForwardImpl::AlgoChanwise::get_workspace_in_bytes( + const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + return dst_layout.span().dist_byte(); + } + return 0; +} + +void ConvBiasForwardImpl::AlgoChanwise::exec(const ExecArgs& args) const { + WorkspaceBundle bundle{args.workspace.raw_ptr, + {get_workspace_in_bytes(args)}}; + auto conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(0); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + + { + auto kparam = chanwise::Param::from_fwd_args(args); + auto stream = cuda_stream(args.handle); + switch (args.src_layout->dtype.enumv()) { + case DTypeEnum::Float32: + chanwise::run_fwd(conv_dst_tensor.ptr(), + args.src_tensor->ptr(), + args.filter_tensor->ptr(), kparam, + stream); + break; + case DTypeEnum::Float16: +#if CUDA_VERSION >= 9000 + if (is_compute_capability_required(5, 3)) { + chanwise::run_fwd( + static_cast(conv_dst_tensor.raw_ptr), + static_cast(args.src_tensor->raw_ptr), + static_cast(args.filter_tensor->raw_ptr), + kparam, stream); + } else { + chanwise::run_fwd(conv_dst_tensor.ptr(), + args.src_tensor->ptr(), + args.filter_tensor->ptr(), + kparam, stream); + } +#else + chanwise::run_fwd(conv_dst_tensor.ptr(), + args.src_tensor->ptr(), + args.filter_tensor->ptr(), kparam, + stream); +#endif + break; + default: + megdnn_assert_internal(0); + } + } + + handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise/fwd.cu b/dnn/src/cuda/conv_bias/chanwise/fwd.cu new file mode 100644 index 00000000..83e71ee2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise/fwd.cu @@ -0,0 +1,367 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise/fwd.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "cuda.h" +#include "cuda_fp16.h" +#include "src/cuda/conv_bias/chanwise/kern.cuh" +#include "src/cuda/conv_bias/chanwise/kern_helper.cuh" +#include "src/cuda/fp16_help.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; +using namespace chanwise; + +namespace { + +// grid idx is (inp_chl, worker_index) +// each y-slice of a block works on an (N, CHL_MUL, OH, OW) spatial image at +// given inp_chl +template +__global__ void kern_fwd_float(T* dst, const T* src, const T* flt_tot, + Param param) { + extern __shared__ uint8_t flt_storage[]; + T* const flt = reinterpret_cast(flt_storage); + + const uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x, + IH = param.src_h, IW = param.src_w, + CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, + FH = FH_SET ? FH_SET : param.flt_h, + FW = FW_SET ? FW_SET : param.flt_w, FSIZE = FH * FW, + PH = param.pad_h, PW = param.pad_w, SH = param.stride_h, + SW = param.stride_w, OH = param.out_h, OW = param.out_w, + TOT_OUT = N * CHL_MUL * OH * OW; + + block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); + + uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x, + nr_out_per_launch = blockDim.x * gridDim.y; + for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { + uint32_t out_idx = out_idx_, n, chl_mul, oh, ow; + out_idx = div_mod(out_idx, OW, ow); + out_idx = div_mod(out_idx, OH, oh); + if (CHL_MUL_SET == 1) { + chl_mul = 0; + n = out_idx; + } else { + n = div_mod(out_idx, CHL_MUL, chl_mul); + } + + int ih = int(oh * SH) - int(PH), iw = int(ow * SW) - int(PW); + const T* flt_base = flt + chl_mul * FSIZE; + const T* src_base = src + int(((n * IC + ic) * IH + ih) * IW + iw); + + T sum(0); + + if (FH_SET && FW_SET) { +#pragma unroll + for (uint32_t fh = 0; fh < FH; ++fh) { + // fh + ih < 0 would overflow, so we do not need to check it + if (static_cast(fh + ih) < IH) { +#pragma unroll + for (uint32_t fw = 0; fw < FW; ++fw) { + if (static_cast(fw + iw) < IW) { + sum += flt_base[fh * FW + fw] * + src_base[fh * IW + fw]; + } + } + } + } + } else { + int fhmax = min(int(FH), int(IH - ih)), + fwmax = min(int(FW), int(IW - iw)); + for (int fh = max(0, -ih); fh < fhmax; ++fh) { + for (int fw = max(0, -iw); fw < fwmax; ++fw) { + sum += flt_base[fh * FW + fw] * src_base[fh * IW + fw]; + } + } + } + dst[(((n * IC + ic) * CHL_MUL + chl_mul) * OH + oh) * OW + ow] = sum; + } +} + +#if CUDA_VERSION >= 9000 +template +__global__ void kern_fwd_half(__half* dst, const __half* src, + const __half* flt_tot, Param param) { + extern __shared__ uint8_t flt_storage[]; + __half* const flt = reinterpret_cast<__half*>(flt_storage); + + const uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x, + IH = param.src_h, IW = param.src_w, + CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, + FH = FH_SET ? FH_SET : param.flt_h, + FW = FW_SET ? 
FW_SET : param.flt_w, FSIZE = FH * FW, + PH = param.pad_h, PW = param.pad_w, SH = param.stride_h, + SW = param.stride_w, OH = param.out_h, OW = param.out_w, + TOT_OUT = N * CHL_MUL * OH * OW; + + block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); + + uint32_t out_idx_ = (blockIdx.y * blockDim.x + threadIdx.x) * 2, + nr_out_per_launch = (blockDim.x * gridDim.y) * 2; + for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { + if (out_idx_ % OW < OW - 1) { + uint32_t out_idx = out_idx_, n, chl_mul, oh, ow; + out_idx = div_mod(out_idx, OW, ow); + out_idx = div_mod(out_idx, OH, oh); + if (CHL_MUL_SET == 1) { + chl_mul = 0; + n = out_idx; + } else { + n = div_mod(out_idx, CHL_MUL, chl_mul); + } + + int ih = int(oh * SH) - int(PH), iw = int(ow * SW) - int(PW); + const __half* flt_base = flt + chl_mul * FSIZE; + const __half* src_base = + src + int(((n * IC + ic) * IH + ih) * IW + iw); + + __half2 sum{0.0, 0.0}; + +#pragma unroll + for (uint32_t fh = 0; fh < FH; ++fh) { + // fh + ih < 0 would overflow, so we do not need to + // check it + if (static_cast(fh + ih) < IH) { + if (FH_SET == 3 && FW_SET == 3 && SW_SET == 1) { + __half2 fil0 = {flt_base[fh * FW], flt_base[fh * FW]}; + __half2 fil1 = {flt_base[fh * FW + 1], + flt_base[fh * FW + 1]}; + __half2 fil2 = {flt_base[fh * FW + 2], + flt_base[fh * FW + 2]}; + + __half2 src0 = {0.0, 0.0}; + if (static_cast(iw) < IW) + src0.x = src_base[fh * IW]; + if (static_cast(iw + 1) < IW) + src0.y = src_base[fh * IW + 1]; + sum = fma2(src0, fil0, sum); + + __half2 src2 = {0.0, 0.0}; + if (static_cast(iw + 2) < IW) + src2.x = src_base[fh * IW + 2]; + if (static_cast(iw + 3) < IW) + src2.y = src_base[fh * IW + 3]; + sum = fma2(src2, fil2, sum); + + __half2 src1 = {src0.y, src2.x}; + sum = fma2(src1, fil1, sum); + } else if (FH_SET == 5 && FW_SET == 5 && SW_SET == 1) { + __half2 fil0 = {flt_base[fh * FW], flt_base[fh * FW]}; + __half2 fil1 = {flt_base[fh * FW + 1], + flt_base[fh * FW + 1]}; + __half2 fil2 = {flt_base[fh * FW + 2], + flt_base[fh * FW + 2]}; + __half2 fil3 = {flt_base[fh * FW + 3], + flt_base[fh * FW + 3]}; + __half2 fil4 = {flt_base[fh * FW + 4], + flt_base[fh * FW + 4]}; + + __half2 src0 = {0.0, 0.0}; + if (static_cast(iw) < IW) + src0.x = src_base[fh * IW]; + if (static_cast(iw + 1) < IW) + src0.y = src_base[fh * IW + 1]; + sum = fma2(src0, fil0, sum); + + __half2 src2 = {0.0, 0.0}; + if (static_cast(iw + 2) < IW) + src2.x = src_base[fh * IW + 2]; + if (static_cast(iw + 3) < IW) + src2.y = src_base[fh * IW + 3]; + sum = fma2(src2, fil2, sum); + + __half2 src1 = {src0.y, src2.x}; + sum = fma2(src1, fil1, sum); + + __half2 src4 = {0.0, 0.0}; + if (static_cast(iw + 4) < IW) + src4.x = src_base[fh * IW + 4]; + if (static_cast(iw + 5) < IW) + src4.y = src_base[fh * IW + 5]; + sum = fma2(src4, fil4, sum); + + __half2 src3 = {src2.y, src4.x}; + sum = fma2(src3, fil3, sum); + + } else { +#pragma unroll + for (uint32_t fw = 0; fw < FW; ++fw) { + __half2 fil = {flt_base[fh * FW + fw], + flt_base[fh * FW + fw]}; + __half2 src = {0.0, 0.0}; + if (static_cast(static_cast(fw) + + iw) < IW) + src.x = src_base[fh * IW + fw]; + if (static_cast(static_cast(fw) + + iw + SW) < IW) + src.y = src_base[fh * IW + fw + SW]; + sum = fma2(src, fil, sum); + } + } + } + } + + dst[(((n * IC + ic) * CHL_MUL + chl_mul) * OH + oh) * OW + ow] = + sum.x; + dst[(((n * IC + ic) * CHL_MUL + chl_mul) * OH + oh) * OW + ow + 1] = + sum.y; + + continue; + } + // two discontinuous output + for (size_t offset = 0; offset < 2; ++offset) { + uint32_t out_idx = out_idx_ 
+ offset, n, chl_mul, oh, ow; + out_idx = div_mod(out_idx, OW, ow); + out_idx = div_mod(out_idx, OH, oh); + if (CHL_MUL_SET == 1) { + chl_mul = 0; + n = out_idx; + } else { + n = div_mod(out_idx, CHL_MUL, chl_mul); + } + + int ih = int(oh * SH) - int(PH), iw = int(ow * SW) - int(PW); + const __half* flt_base = flt + chl_mul * FSIZE; + const __half* src_base = + src + int(((n * IC + ic) * IH + ih) * IW + iw); + + __half sum(0); + + if (FH_SET && FW_SET) { +#pragma unroll + for (uint32_t fh = 0; fh < FH; ++fh) { + // fh + ih < 0 would overflow, so we do not need to + // check it + if (static_cast(fh + ih) < IH) { +#pragma unroll + for (uint32_t fw = 0; fw < FW; ++fw) { + if (static_cast(fw + iw) < IW) { + sum = fma(flt_base[fh * FW + fw], + src_base[fh * IW + fw], sum); + } + } + } + } + } else { + int fhmax = min(int(FH), int(IH - ih)), + fwmax = min(int(FW), int(IW - iw)); + for (int fh = max(0, -ih); fh < fhmax; ++fh) { + for (int fw = max(0, -iw); fw < fwmax; ++fw) { + sum = fma(flt_base[fh * FW + fw], + src_base[fh * IW + fw], sum); + } + } + } + dst[(((n * IC + ic) * CHL_MUL + chl_mul) * OH + oh) * OW + ow] = + sum; + + if (n == N - 1 && chl_mul == CHL_MUL - 1 && ow == OW - 1 && + oh == OH - 1) + break; + } + } +} +#endif + +#define SET_SW(func, type, sw) \ + if (param.flt_h == 2 && param.flt_w == 2) { \ + kern = func; \ + } else if (param.flt_h == 3 && param.flt_w == 3) { \ + kern = func; \ + } else if (param.flt_h == 5 && param.flt_w == 5) { \ + kern = func; \ + } else if (param.flt_h == 7 && param.flt_w == 7) { \ + kern = func; \ + } else { \ + kern = func; \ + } + +#define GET_KERN(func, type) \ + void (*kern)(type*, const type*, const type*, Param); \ + if (param.chl_mul == 1) { \ + if (param.stride_w == 1) { \ + SET_SW(func, type, 1) \ + } else { \ + SET_SW(func, type, 0) \ + } \ + } else { \ + kern = func; \ + } \ + return kern; + +template +void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); + +template <> +void (*get_kern(const Param& param))(float*, const float*, const float*, + const Param) { + GET_KERN(kern_fwd_float, float); +} + +#if CUDA_VERSION >= 9000 +template <> +void (*get_kern<__half>(const Param& param))(__half*, const __half*, + const __half*, const Param) { + GET_KERN(kern_fwd_half, __half); +} +#endif + +template <> +void (*get_kern(const Param& param))(dt_float16*, const dt_float16*, + const dt_float16*, + const Param) { + GET_KERN(kern_fwd_float, dt_float16); +} + +#undef SET_SW +#undef GET_KERN + +} // anonymous namespace + +namespace megdnn { +namespace cuda { +namespace conv_bias { +namespace chanwise { + +template +void run_fwd(T* dst, const T* src, const T* flt, const Param& param, + cudaStream_t stream) { + void (*kern)(T*, const T*, const T*, Param); + kern = get_kern(param); + + int nr_thread = query_blocksize_for_kernel(kern), + nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul; + dim3 nr_block(param.src_chl, + std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); + uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); + kern<<>>(dst, src, flt, param); + after_kernel_launch(); +} + +template void run_fwd(float*, const float*, const float*, const Param&, + cudaStream_t); + +#if CUDA_VERSION >= 9000 +template void run_fwd(__half*, const __half*, const __half*, const Param&, + cudaStream_t); +#endif + +template void run_fwd(dt_float16*, const dt_float16*, const dt_float16*, + const Param&, cudaStream_t); + +} // namespace chanwise +} // namespace conv_bias +} // namespace cuda +} // 
namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise/fwd_8x8x32.cu b/dnn/src/cuda/conv_bias/chanwise/fwd_8x8x32.cu new file mode 100644 index 00000000..3dbf9a52 --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise/fwd_8x8x32.cu @@ -0,0 +1,209 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise/fwd_8x8x32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/conv_bias/chanwise/kern.cuh" + +#include +#include + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; +using namespace chanwise; + +namespace { + +__host__ __device__ void get_receptive_field_size(uint32_t OH, uint32_t OW, + uint32_t FH, uint32_t FW, + uint32_t SH, uint32_t SW, + uint32_t DH, uint32_t DW, + uint32_t* RH, uint32_t* RW) { + // DFH = dilationd FH, DFW = dilationd FW + // RH = receptive field height, RW = receptive field width + uint32_t DFH = (FH - 1) * DH + 1, DFW = (FW - 1) * DW + 1; + *RH = ((OH - 1) * SH + 1) + DFH - 1; + *RW = ((OW - 1) * SW + 1) + DFW - 1; +} + +// 32x4x4 threads +// assume that C must be multiples of 4 +// F == 0: FH/FW should be retrieved from param +// F != 0: FH/FW should use F +template +__global__ void kern(int32_t* dst, const int8_t* src, const int8_t* flt, + Param param) { + // each block would process 128 channels at every 4x4 spatial area. + uint32_t C = param.src_chl, IH = param.src_h, IW = param.src_w, + OH = param.out_h, OW = param.out_w, FH = F == 0 ? param.flt_h : F, + FW = F == 0 ? param.flt_w : F, PH = param.pad_h, PW = param.pad_w, + SH = param.stride_h, SW = param.stride_w, DH = param.dilation_h, + DW = param.dilation_w; + + const uint32_t* src_32 = reinterpret_cast(src); + const uint32_t* flt_32 = reinterpret_cast(flt); + uint32_t bidx = blockIdx.x, bidy = blockIdx.y, bidz = blockIdx.z; + uint32_t c_beg = blockIdx.x * 128, c_end = min((blockIdx.x + 1) * 128, C), + c_cur = c_beg + threadIdx.x * 4; + uint32_t tidx = threadIdx.x, tidy = threadIdx.y, tidz = threadIdx.z, + tid = (tidx << 0) | (tidy << 5) | (tidz << 7), + tid_stride = 32 * 4 * 4, tidyz = (tidy << 0) | (tidz << 2), + tidyz_stride = 4 * 4; + uint32_t oh = bidz * 4 + tidz, ow = bidy * 4 + tidy; + uint32_t C_32 = C >> 2; + // calculate receptive field of 4x4 output pixels + uint32_t RH, RW; + get_receptive_field_size(4, 4, FH, FW, SH, SW, DH, DW, &RH, &RW); + + extern __shared__ int8_t shared[]; + + int8_t* flt_shared_tmp = static_cast(static_cast(shared)); + uint32_t* flt_shared_tmp_32 = reinterpret_cast(flt_shared_tmp); + + int8_t* flt_shared = static_cast( + static_cast(shared + 128 * FH * FW * sizeof(int8_t))); + uint32_t* flt_shared_32 = reinterpret_cast(flt_shared); + + int8_t* src_shared = static_cast( + static_cast(shared + 128 * FH * FW * sizeof(int8_t) + + 128 * FH * FW * sizeof(int8_t))); + uint32_t* src_shared_32 = reinterpret_cast(src_shared); + + int32_t* dst_shared = static_cast(static_cast( + shared + 128 * FH * FW * sizeof(int8_t) + + 128 * FH * FW * sizeof(int8_t) + 128 * RH * RW * sizeof(int8_t))); + + // read original filter to shared memory + // *_int8 vars must be multiples of 4 here. 
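+    // The loads below move four packed int8 filter values per uint32_t,
+    // which is why the offsets and element counts are shifted right by 2.
+    // The transpose that follows turns the channel-major (128, FH*FW)
+    // layout into (FH*FW, 128), so consecutive threads read consecutive
+    // packed channels in the inner accumulation loop.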
+ uint32_t flt_offset = c_beg * FH * FW; + uint32_t flt_offset_32 = flt_offset >> 2; + uint32_t flt_amount = (c_end - c_beg) * FH * FW; + uint32_t flt_amount_32 = flt_amount >> 2; + for (uint32_t id = tid; id < flt_amount_32; id += tid_stride) { + flt_shared_tmp_32[id] = flt_32[flt_offset_32 + id]; + } + __syncthreads(); + // transpose filter: (flt_amount, FH*FW) -> (FH*FW, 128) + // typical example: (128, 9) -> (9, 128) + for (uint32_t idyz = tidyz; idyz < FH * FW; idyz += tidyz_stride) + for (uint32_t idx = tidx; idx < 128; idx += 32) { + uint32_t from_idx = idx * FH * FW + idyz; + uint32_t to_idx = idx + idyz * 128; + if (from_idx < flt_amount) { + flt_shared[to_idx] = flt_shared_tmp[from_idx]; + } else { + flt_shared[to_idx] = 0; + } + } + // no need to sync here + // __syncthreads(); + // read (RH, RW, 128) src from global to shared + for (uint32_t rh = tidz; rh < RH; rh += 4) + for (uint32_t rw = tidy; rw < RW; rw += 4) { + uint32_t ih = bidz * 4 * SH + rh - PH; + uint32_t iw = bidy * 4 * SW + rw - PW; + uint32_t to_idx = (rh * RW + rw) * 32 + tidx; + uint32_t c_32 = bidx * 32 + tidx; + uint32_t from_idx = (ih * IW + iw) * C_32 + c_32; + if (ih < IH && iw < IW && c_32 < C_32) { + src_shared_32[to_idx] = src_32[from_idx]; + } else { + src_shared_32[to_idx] = 0; + } + } + __syncthreads(); + // do convolution + if (c_cur < c_end && oh < OH && ow < OW) { + int32_t dst0 = 0, dst1 = 0, dst2 = 0, dst3 = 0; +#pragma unroll + for (uint32_t fh = 0; fh < FH; ++fh) +#pragma unroll + for (uint32_t fw = 0; fw < FW; ++fw) { + uint32_t rh = tidz * SH + fh * DH, rw = tidy * SW + fw * DW; + uint32_t sval_32 = src_shared_32[(rh * RW + rw) * 32 + tidx]; + int32_t sval0 = int8_t((sval_32 >> 0) & 255), + sval1 = int8_t((sval_32 >> 8) & 255), + sval2 = int8_t((sval_32 >> 16) & 255), + sval3 = int8_t((sval_32 >> 24) & 255); + uint32_t fval_32 = flt_shared_32[(fh * FW + fw) * 32 + tidx]; + int32_t fval0 = int8_t((fval_32 >> 0) & 255), + fval1 = int8_t((fval_32 >> 8) & 255), + fval2 = int8_t((fval_32 >> 16) & 255), + fval3 = int8_t((fval_32 >> 24) & 255); + dst0 += sval0 * fval0; + dst1 += sval1 * fval1; + dst2 += sval2 * fval2; + dst3 += sval3 * fval3; + } + dst_shared[tidyz * 129 + tidx * 4 + 0] = dst0; + dst_shared[tidyz * 129 + tidx * 4 + 1] = dst1; + dst_shared[tidyz * 129 + tidx * 4 + 2] = dst2; + dst_shared[tidyz * 129 + tidx * 4 + 3] = dst3; + } + __syncthreads(); + if (oh < OH && ow < OW) { +#pragma unroll + for (uint32_t k = 0; k < 4; ++k) { + uint32_t c = c_beg + tidx + k * 32; + if (c < c_end) { + dst[(oh * OW + ow) * C + c] = + dst_shared[tidyz * 129 + tidx + k * 32]; + } + } + } +} + +} // anonymous namespace + +void megdnn::cuda::conv_bias::chanwise::run_fwd_8x8x32(int32_t* dst, + const int8_t* src, + const int8_t* flt, + const Param& param, + cudaStream_t stream) { + uint32_t N = param.batch, C = param.src_chl, IH = param.src_h, + IW = param.src_w, OH = param.out_h, OW = param.out_w, + FH = param.flt_h, FW = param.flt_w, SH = param.stride_h, + SW = param.stride_w, DH = param.dilation_h, DW = param.dilation_w; + + dim3 threads(32, 4, 4); + dim3 blocks(DIVUP(C, 128), DIVUP(OW, 4), DIVUP(OH, 4)); + + // shared mem size: filter*2 + src + dst + // filter + uint32_t filter_shared_mem_size = 128 * FH * FW * sizeof(int8_t); + // src + uint32_t RH, RW; + get_receptive_field_size(4, 4, FH, FW, SH, SW, DH, DW, &RH, &RW); + uint32_t src_shared_mem_size = 128 * RH * RW * sizeof(int8_t); + // dst + // use 129 instead of 128 to avoid shared memory bank conflict + uint32_t dst_shared_mem_size = 129 * 4 * 4 * 
sizeof(int32_t); + + uint32_t shared_mem_size = 2 * filter_shared_mem_size + + src_shared_mem_size + dst_shared_mem_size; + + void (*kptr)(int32_t*, const int8_t*, const int8_t*, Param) = kern<0>; + if (FH == 1 && FW == 1) + kptr = kern<1>; + if (FH == 3 && FW == 3) + kptr = kern<3>; + if (FH == 5 && FW == 5) + kptr = kern<5>; + + for (uint32_t n = 0; n < N; ++n) { + int32_t* dptr = dst + n * C * OH * OW; + const int8_t* sptr = src + n * C * IH * IW; + const int8_t* fptr = flt; + kptr<<>>(dptr, sptr, fptr, + param); + } + after_kernel_launch(); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise/fwd_small.cu b/dnn/src/cuda/conv_bias/chanwise/fwd_small.cu new file mode 100644 index 00000000..b2afe2a5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise/fwd_small.cu @@ -0,0 +1,294 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise/fwd_small.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "cuda.h" +#include "cuda_fp16.h" +#include "src/cuda/conv_bias/chanwise/kern.cuh" +#include "src/cuda/conv_bias/chanwise/kern_helper.cuh" +#include "src/cuda/conv_bias/chanwise/launch_config.cuh" +#include "src/cuda/fp16_help.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; +using namespace chanwise; + +namespace { + +enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD }; + +// CUDA kernel to compute the depthwise convolution forward pass in NCHW format, +// tailored for small images up to 32x32. Stride and depth multiplier must be 1. +// Padding must be 'SAME', which allows to reuse the index computation. Only +// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true. +// Tiles of the input and filter tensors are loaded into shared memory before +// performing the convolution. Each thread handles two elements per iteration, +// one each in the lower and upper half of a tile. +// Backprop input direction is the same as forward direction with the filter +// rotated by 180°. +template +__global__ void +#if __CUDA_ARCH__ >= 750 +__launch_bounds__(1024, 1) +#else +__launch_bounds__(1024, 2) +#endif + DepthwiseConv2dGPUKernelNCHWSmall(const Param param, const T* input, + const T* filter, T* output) { + // Holds block plus halo and filter data for blockDim.z depths. + extern __shared__ __align__(8) unsigned char shared_memory[]; + static_assert(sizeof(T) <= 8, "Insufficient alignment detected"); + T* const shared_data = reinterpret_cast(shared_memory); + + const int num_batches = static_cast(param.batch); + const int in_height = static_cast(param.src_h); + const int in_width = static_cast(param.src_w); + const int in_depth = static_cast(param.src_chl); + const int filter_height = kKnownFilterHeight < 0 + ? static_cast(param.flt_h) + : kKnownFilterHeight; + const int filter_width = kKnownFilterWidth < 0 + ? static_cast(param.flt_w) + : kKnownFilterWidth; + const int pad_height = static_cast(param.pad_h); + const int pad_width = static_cast(param.pad_w); + + // Fixed blockDim.z, tailored for maximum grid size for images of size + // 16x16. 
assert(blockDim.x == param.src_w); assert(blockDim.z == + // kBlockDepth); + const int block_height = blockDim.y; + + // These values are the same for all threads and could + // be precomputed on the CPU. + const int block_pixels = in_width * block_height; + const int block_size = block_pixels * kBlockDepth; + const int in_pixels = in_width * in_height; + const int in_increment = in_width - 1; + const int filter_pixels = filter_height * filter_width; + const int tile_width = in_width + filter_width - 1; + const int even_height = kKnownEvenHeight || (1 & ~in_height); + const int tile_height = in_height + filter_height - even_height; + const int tile_pixels = tile_width * tile_height; + const int tile_size = tile_pixels * kBlockDepth; + const int tile_offset = block_height * tile_width; + const int pad_offset = pad_height * tile_width + pad_width; + const int in_total_depth = in_depth * num_batches; + const int in_blocks = (in_total_depth + kBlockDepth - 1) / kBlockDepth; + + const int thread_col = threadIdx.x; + const int thread_row = threadIdx.y; + const int thread_depth = threadIdx.z; + + // Position in block. + const int thread_pix = thread_row * in_width + thread_col; + const int thread_idx = thread_depth * block_pixels + thread_pix; + + // Initialize tile, in particular the padding. + for (int i = thread_idx; i < tile_size; i += block_size) { + shared_data[i] = T(); + } + __syncthreads(); + + // Position in tensors. + const int tensor_idx = thread_depth * in_pixels + thread_pix; + + // Position in (padded) shared memory. + const int data_pix = thread_row * tile_width + thread_col; + const int data_idx = thread_depth * tile_pixels + data_pix; + + // Position in shared memory, offset by pad_height / pad_width. + const int tile_idx = data_idx + pad_offset; + + // Filter is always in HWCK format, irrespective of the input/output format. + const int filter_pix = thread_idx / kBlockDepth; + const int filter_channel = thread_idx % kBlockDepth; + + const int max_channel = in_total_depth - thread_depth; + const int filter_write_offset = + filter_pix < filter_pixels ? tile_size + thread_idx : 0; + const int filter_read_offset = + tile_size + thread_depth + + (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockDepth); + const bool skip_second = + !kKnownEvenHeight && thread_row + (in_height & 1) == block_height; + + for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { + const int channel = b * kBlockDepth; + + const int inout_offset = channel * in_pixels + tensor_idx; + const bool channel_in_range = channel < max_channel; + + if (channel_in_range) { + const T* const in_ptr = inout_offset + input; + T* const tile_ptr = tile_idx + shared_data; + tile_ptr[0] = *in_ptr; + if (!skip_second) { + tile_ptr[tile_offset] = *(block_pixels + in_ptr); + } + } + + if (filter_write_offset != 0) { + const int filter_offset = + (channel + filter_channel) % in_depth * filter_pixels + + filter_pix; + shared_data[filter_write_offset] = *(filter_offset + filter); + } + + // Note: the condition to reach this is uniform across the entire block. 
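+        // (The per-thread guards above only skip the loads; every thread in
+        // the block still reaches this barrier, because the loop bound
+        // in_blocks is uniform across the block.)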
+ __syncthreads(); + + if (channel_in_range) { + T2 sum = {0.0, 0.0}; + int shared_offset = data_idx; + const T* filter_ptr = filter_read_offset + shared_data; +#pragma unroll + for (int r = 0; r < filter_height; ++r) { +#pragma unroll + for (int c = 0; c < filter_width; ++c) { + if (kDirection == DIRECTION_BACKWARD) { + filter_ptr -= kBlockDepth; + } + const T2 filter_value = {*filter_ptr, *filter_ptr}; + const T* const tile_ptr = shared_offset + shared_data; + const T2 tile_value = {tile_ptr[0], tile_ptr[tile_offset]}; + sum = fma2(filter_value, tile_value, sum); + ++shared_offset; + if (kDirection == DIRECTION_FORWARD) { + filter_ptr += kBlockDepth; + } + } + shared_offset += in_increment; + } + T* const out_ptr = inout_offset + output; + out_ptr[0] = static_cast(sum.x); + if (!skip_second) { + out_ptr[block_pixels] = static_cast(sum.y); + } + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); + } +} + +template +void LaunchDepthwiseConv2dGPUSmall(const Param& param, const T* input, + const T* filter, T* output, + cudaStream_t stream) { + const int block_height = (param.src_h + 1) / 2; + dim3 block_dim; + int block_count; + void (*kernel)(const Param, const T*, const T*, T*); + block_dim = dim3(param.src_w, block_height, kBlockDepth); + block_count = + DIVUP(param.batch * param.src_chl * param.chl_mul, kBlockDepth) * + kBlockDepth; + kernel = DepthwiseConv2dGPUKernelNCHWSmall< + T, T2, kDirection, kKnownFilterWidth, kKnownFilterHeight, + kBlockDepth, kKnownEvenHeight>; + const int tile_width = param.src_w + param.flt_w - 1; + const int tile_height = block_height * 2 + param.flt_h - 1; + const int tile_pixels = tile_height * tile_width; + const int filter_pixels = param.flt_h * param.flt_w; + const int shared_memory_size = + kBlockDepth * (tile_pixels + filter_pixels) * sizeof(T); + const int num_outputs = param.out_h * param.out_w * block_count; + + block_count = GetFixedBlockSize(num_outputs, kernel, shared_memory_size, + block_dim.x * block_dim.y * block_dim.z); + kernel<<>>( + param, input, filter, output); + after_kernel_launch(); +} + +template +void LaunchDepthwiseConv2dGPUSmall(const Param& param, const T* input, + const T* filter, T* output, + cudaStream_t stream) { + if (param.src_h & 1) { + return LaunchDepthwiseConv2dGPUSmall< + T, T2, kDirection, kKnownFilterWidth, kKnownFilterHeight, + kBlockDepth, false>(param, input, filter, output, stream); + } else { + return LaunchDepthwiseConv2dGPUSmall< + T, T2, kDirection, kKnownFilterWidth, kKnownFilterHeight, + kBlockDepth, true>(param, input, filter, output, stream); + } +} + +template +void LaunchDepthwiseConv2dGPUSmall(const Param& param, const T* input, + const T* filter, T* output, + cudaStream_t stream) { + // Maximize (power of two) kBlockDepth while keeping a block within 1024 + // threads (2 pixels per thread). 
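+    // For example, a 30x30 input gives block_pixels = ((30 + 1) / 2) * 30 =
+    // 450 > 256, so kBlockDepth = 2 is selected and the launched block is
+    // 30 x 15 x 2 = 900 threads, within the 1024-thread limit.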
+ const int block_pixels = (param.src_h + 1) / 2 * param.src_w; + if (block_pixels > 256) { + LaunchDepthwiseConv2dGPUSmall( + param, input, filter, output, stream); + } else if (block_pixels > 128) { + LaunchDepthwiseConv2dGPUSmall( + param, input, filter, output, stream); + } else { + LaunchDepthwiseConv2dGPUSmall( + param, input, filter, output, stream); + } +} + +} // anonymous namespace + +namespace megdnn { +namespace cuda { +namespace conv_bias { +namespace chanwise { + +// =====================================fwd===================================== +#define LAUNCH(type, type2) \ + if (param.flt_h == 3 && param.flt_w == 3) { \ + LaunchDepthwiseConv2dGPUSmall< \ + type, type2, DepthwiseConv2dDirection::DIRECTION_FORWARD, 3, \ + 3>(param, src, flt, dst, stream); \ + } else { \ + LaunchDepthwiseConv2dGPUSmall< \ + type, type2, DepthwiseConv2dDirection::DIRECTION_FORWARD, -1, \ + -1>(param, src, flt, dst, stream); \ + } + +template <> +void run_fwd_small(float* dst, const float* src, const float* flt, + const Param& param, cudaStream_t stream) { + LAUNCH(float, float2); +} + +#if CUDA_VERSION >= 9000 +template <> +void run_fwd_small(__half* dst, const __half* src, const __half* flt, + const Param& param, cudaStream_t stream) { + LAUNCH(__half, __half2); +} +#endif +#undef LAUNCH + +} // namespace chanwise +} // namespace conv_bias +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise/kern.cuh b/dnn/src/cuda/conv_bias/chanwise/kern.cuh new file mode 100644 index 00000000..b8f878b4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise/kern.cuh @@ -0,0 +1,73 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "src/cuda/utils.cuh" + +#include +#include + +#if MEGDNN_CC_HOST +#include "src/cuda/conv_bias/helper.h" +#endif + +namespace megdnn { +namespace cuda { +namespace conv_bias { +namespace chanwise { + +struct Param { + uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w; +#if MEGDNN_CC_HOST + static Param from_fwd_args(const BiasForwardSizeArgs& args) { +#define U(v) static_cast(v) + auto&& src = args.src_layout->shape; + auto&& dst = args.dst_layout->shape; + auto&& fm = args.filter_meta; + size_t c_pos, hw_pos; + if (fm.format == param::Convolution::Format::NCHW) { + c_pos = 1; + hw_pos = 2; + } else { + c_pos = 3; + hw_pos = 1; + } + return { + U(src[0]), U(src[c_pos]), U(src[hw_pos]), + U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]), + U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]), + U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]), + U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]), + }; +#undef U + } +#endif +}; + +template +void run_fwd(T* dst, const T* src, const T* flt, const Param& param, + cudaStream_t stream); + +template +void run_fwd_small(T* dst, const T* src, const T* flt, const Param& param, + cudaStream_t stream); + +// implemented in fwd_8x8x32.cu +void run_fwd_8x8x32(int32_t* dst, const int8_t* src, const int8_t* flt, + const Param& param, cudaStream_t stream); + +} // namespace chanwise +} // namespace conv_bias +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise/kern_helper.cuh b/dnn/src/cuda/conv_bias/chanwise/kern_helper.cuh new file mode 100644 index 00000000..3d44e33a --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise/kern_helper.cuh @@ -0,0 +1,54 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise/kern_helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/dtype.h" +#include "src/cuda/query_blocksize.cuh" +#include "src/cuda/utils.cuh" + +#include +#include +#include + +namespace megdnn { +namespace cuda { +namespace conv_bias { +namespace chanwise { + +/*! + * \brief return a / b and set mod to a % b + */ +__device__ __forceinline__ uint32_t div_mod(uint32_t a, uint32_t b, + uint32_t& mod) { + uint32_t ret = a / b; + mod = a - ret * b; + return ret; +} + +/*! + * \brief copy a 2D matrix by all threads in a block + * \param rs row stride + */ +template +__device__ __forceinline__ void block_memcpy(T* dst, const T* src, + uint32_t size) { + for (uint32_t i = threadIdx.x; i < size; i += blockDim.x) { + dst[i] = src[i]; + } + __syncthreads(); +} + +} // namespace chanwise +} // namespace conv_bias +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise/launch_config.cpp b/dnn/src/cuda/conv_bias/chanwise/launch_config.cpp new file mode 100644 index 00000000..cacf081a --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise/launch_config.cpp @@ -0,0 +1,33 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise/launch_config.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/conv_bias/chanwise/launch_config.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +int chanwise::GetFixedBlockSize1(int work_element_count, const void* func, + int dynamic_shared_memory_size, + int fixed_block_size) { + int block_count = 0; + + cuda_check(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &block_count, func, fixed_block_size, dynamic_shared_memory_size)); + block_count = std::min( + block_count * cuda::current_device_prop().multiProcessorCount, + DIVUP(work_element_count, fixed_block_size)); + + return block_count; +} + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise/launch_config.cuh b/dnn/src/cuda/conv_bias/chanwise/launch_config.cuh new file mode 100644 index 00000000..dfdf494e --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise/launch_config.cuh @@ -0,0 +1,35 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise/launch_config.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_bias { +namespace chanwise { + +int GetFixedBlockSize1(int work_element_count, const void* func, + int dynamic_shared_memory_size, int fixed_block_size); + +template +int GetFixedBlockSize(int work_element_count, DeviceFunc func, + int dynamic_shared_memory_size, int fixed_block_size) { + return GetFixedBlockSize1(work_element_count, + reinterpret_cast(func), + dynamic_shared_memory_size, fixed_block_size); +} + +} // namespace chanwise +} // namespace conv_bias +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp b/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp new file mode 100644 index 00000000..4ed02ff1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp @@ -0,0 +1,74 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise_8x8x32.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./algo.h" + +#include "src/cuda/utils.h" +#include "src/cuda/conv_bias/chanwise/kern.cuh" +#include "src/common/conv_bias.h" +#include "src/common/elemwise/kern_defs.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +bool ConvBiasForwardImpl::AlgoChanwise8x8x32::is_available( + const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; + using NonlineMode = param::ConvBias::NonlineMode; + + auto&& fm = args.filter_meta; + return (args.nonlinear_mode == NonlineMode::IDENTITY || + args.nonlinear_mode == NonlineMode::RELU) && + args.filter_meta.format == Param::Format::NHWC && + args.src_layout->dtype == dtype::Int8() && + fm.dtype.enumv() == DTypeEnum::Int8 && fm.spatial_ndim == 2 && + fm.icpg == 1 && fm.ocpg == 1 && fm.group % 4 == 0; +} + +size_t ConvBiasForwardImpl::AlgoChanwise8x8x32::get_workspace_in_bytes( + const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + return dst_layout.span().dist_byte(); + } + return 0; +} + +void ConvBiasForwardImpl::AlgoChanwise8x8x32::exec(const ExecArgs& args) const { + WorkspaceBundle bundle{args.workspace.raw_ptr, + {get_workspace_in_bytes(args)}}; + auto conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(0); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + { + auto kparam = chanwise::Param::from_fwd_args(args); + auto stream = cuda_stream(args.handle); + chanwise::run_fwd_8x8x32(conv_dst_tensor.ptr(), + args.src_tensor->ptr(), + args.filter_tensor->ptr(), kparam, + stream); + } + handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise_small.cpp b/dnn/src/cuda/conv_bias/chanwise_small.cpp new file mode 100644 index 00000000..3c8d4dca --- /dev/null +++ b/dnn/src/cuda/conv_bias/chanwise_small.cpp @@ -0,0 +1,100 @@ +/** + * \file dnn/src/cuda/conv_bias/chanwise_small.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/conv_bias/chanwise/kern.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +namespace { +inline bool is_available_small(const chanwise::Param& param) { + return param.chl_mul == 1 && param.stride_h == 1 && param.stride_w == 1 && + param.src_h <= 32 && param.src_w <= 32 && + param.src_h == param.out_h && param.src_w == param.out_w && + param.pad_h < param.flt_h && param.pad_w < param.flt_w && + param.flt_h * param.flt_w <= (param.src_h + 1) / 2 * param.src_w; +} +} // anonymous namespace + +bool ConvBiasForwardImpl::AlgoChanwiseSmall::is_available( + const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; +#if CUDA_VERSION < 9000 + if (args.src_layout->dtype.enumv() == DTypeEnum::Float16) + return false; +#endif + auto param = chanwise::Param::from_fwd_args(args); + auto&& fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCHW && + args.src_layout->dtype.category() == DTypeCategory::FLOAT && + args.opr->param().compute_mode == Param::ComputeMode::DEFAULT && + fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 && + fm.dilation[1] == 1 && !fm.should_flip && is_available_small(param); +} + +size_t ConvBiasForwardImpl::AlgoChanwiseSmall::get_workspace_in_bytes( + const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + return dst_layout.span().dist_byte(); + } + return 0; +} + +void ConvBiasForwardImpl::AlgoChanwiseSmall::exec(const ExecArgs& args) const { + WorkspaceBundle bundle{args.workspace.raw_ptr, + {get_workspace_in_bytes(args)}}; + auto conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(0); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + { + auto kparam = chanwise::Param::from_fwd_args(args); + auto stream = cuda_stream(args.handle); + switch (args.src_layout->dtype.enumv()) { + case DTypeEnum::Float32: + chanwise::run_fwd_small(conv_dst_tensor.ptr(), + args.src_tensor->ptr(), + args.filter_tensor->ptr(), + kparam, stream); + break; +#if CUDA_VERSION >= 9000 + case DTypeEnum::Float16: + chanwise::run_fwd_small( + static_cast(conv_dst_tensor.raw_ptr), + static_cast(args.src_tensor->raw_ptr), + static_cast(args.filter_tensor->raw_ptr), kparam, + stream); + break; +#endif + default: + megdnn_assert_internal(0); + } + } + handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/conv_bias_int8.cuh b/dnn/src/cuda/conv_bias/conv_bias_int8.cuh new file mode 100644 index 00000000..e9cc68eb --- /dev/null +++ b/dnn/src/cuda/conv_bias/conv_bias_int8.cuh @@ -0,0 +1,145 @@ +/** + * \file dnn/src/cuda/conv_bias/conv_bias_int8.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_bias_int8 { + +struct LaunchConfig { + int nr_threads_x; + int nr_threads_y; + int nr_threads_z; + int nr_blocks_x; + int nr_blocks_y; + int nr_blocks_z; + int smem_size_in_bytes; + LaunchConfig() + : nr_threads_x{1}, + nr_threads_y{1}, + nr_threads_z{1}, + nr_blocks_x{1}, + nr_blocks_y{1}, + nr_blocks_z{1}, + smem_size_in_bytes{1} {} +}; + +template +void do_conv_bias_int8_implicit_gemm_cdiv4hwn4( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void 
do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +template +void do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const convolution::ConvParam& param, float alpha, + float beta, cudaStream_t stream); + +} // namespace conv_bias_int8 +} // namespace cuda +} // namespace megdnn + +#define MARK_USED_VAR \ + MEGDNN_MARK_USED_VAR(n + ci + hi + wi + co + fh + fw + ho + wo + ph + pw + \ + sh + sw + dh + dw); + +#define UNPACK_CONV_PARAMETER(_filter_meta, _param) \ + size_t ph = _param.pad_h, pw = _param.pad_w; \ + size_t sh = _param.stride_h, sw = _param.stride_w; \ + size_t dh = _param.dilate_h, dw = _param.dilate_w; \ + size_t fh = _filter_meta.spatial[0], fw = _filter_meta.spatial[1]; + +#define UNPACK_CONV_BIAS_NCHW4_PARAM(_src, _filter_meta, _dst, _param) \ + using Format = param::ConvBias::Format; \ + megdnn_assert(_param.format == Format::NCHW4); \ + size_t n = (_src)[0], ci = (_src)[1] * 4, hi = (_src)[2], wi = (_src)[3]; \ + size_t co = (_dst)[1] * 4, ho = (_dst)[2], wo = (_dst)[3]; \ + UNPACK_CONV_PARAMETER(_filter_meta, _param); \ + MARK_USED_VAR + +#define UNPACK_CONV_BIAS_CHWN4_PARAM(_src, _filter_meta, _dst, _param) \ + using Format = param::ConvBias::Format; \ + megdnn_assert(_param.format == Format::CHWN4); \ + size_t ci = (_src)[0] * 4, hi = (_src)[1], wi = (_src)[2], n = (_src)[3]; \ + size_t co = (_dst)[0] * 4, ho = (_dst)[1], wo = (_dst)[2]; \ + UNPACK_CONV_PARAMETER(_filter_meta, _param); \ + MARK_USED_VAR + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/cudnn_conv.cpp b/dnn/src/cuda/conv_bias/cudnn_conv.cpp new file mode 100644 index 00000000..3e2f1388 --- /dev/null +++ b/dnn/src/cuda/conv_bias/cudnn_conv.cpp @@ -0,0 +1,120 @@ +/** + * \file dnn/src/cuda/conv_bias/cudnn_conv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/utils.h" +#include "src/common/conv_bias.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +bool ConvBiasForwardImpl::AlgoCUDNNConv::is_available( + const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; + + auto dst_layout = *args.dst_layout; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + } + SizeArgs conv_args = args; + conv_args.dst_layout = &dst_layout; + + if (!is_cudnn_supported(conv_args)) + return false; + CUDNNForwardDescs D; + conv_args.init_conv_desc(D); + + size_t workspace_size; + auto status = cudnnGetConvolutionForwardWorkspaceSize( + conv_args.handle->cudnn_handle(), D.src_desc.desc, + D.filter_desc.desc, D.conv_desc.conv_desc, D.dst_desc.desc, + m_cudnn_enum, &workspace_size); + return status == CUDNN_STATUS_SUCCESS; +} + +WorkspaceBundle ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + SmallVector sizes; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + sizes.push_back(dst_layout.span().dist_byte()); + } + + SizeArgs conv_args = args; + conv_args.dst_layout = &dst_layout; + + CUDNNForwardDescs D; + conv_args.init_conv_desc(D); + + size_t conv_workspace_size; + auto status = cudnnGetConvolutionForwardWorkspaceSize( + conv_args.handle->cudnn_handle(), D.src_desc.desc, + D.filter_desc.desc, D.conv_desc.conv_desc, D.dst_desc.desc, + m_cudnn_enum, &conv_workspace_size); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv fwd get workspace failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); + sizes.insert(sizes.begin(), conv_workspace_size); + return {ptr, std::move(sizes)}; +} + +size_t ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void ConvBiasForwardImpl::AlgoCUDNNConv::exec(const ExecArgs& args) const { + auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); + auto conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(1); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + + ExecArgs conv_args = args; + conv_args.dst_tensor = &conv_dst_tensor; + conv_args.dst_layout = &conv_dst_tensor.layout; + + { + CUDNNForwardDescs D; + conv_args.init_conv_desc(D); + auto conv_workspace = bundle.get_workspace(0); + float alpha = 1.0f, beta = 0.0f; + auto status = cudnnConvolutionForward( + conv_args.handle->cudnn_handle(), &alpha, D.src_desc.desc, + conv_args.src_tensor->raw_ptr, D.filter_desc.desc, + conv_args.filter_tensor->raw_ptr, D.conv_desc.conv_desc, + m_cudnn_enum, conv_workspace.raw_ptr, conv_workspace.size, + &beta, D.dst_desc.desc, conv_args.dst_tensor->raw_ptr); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv fwd failed: %s; info: %s", cudnnGetErrorString(status), + conv_args.to_string().c_str()); + } + + 
handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp new file mode 100644 index 00000000..09efb160 --- /dev/null +++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp @@ -0,0 +1,231 @@ +/** + * \file dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/oprs/general.h" + +#include "./algo.h" + +#include "src/cuda/conv_bias/helper.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( + const SizeArgs& args) const { + if (args.bias_layout->ndim == 0 || + args.bias_layout->eq_shape(*args.dst_layout)) + return false; + auto&& param = args.opr->param(); + if (param.format == param::ConvBias::Format::NCHW && + (param.dilate_h != 1 || param.dilate_w != 1) && + m_cudnn_enum == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { + auto&& device_prop = current_device_prop(); + // Dilated convbias in NCHW format produces wrong result on Pascal + // Architecture, so we disable the algo here. + if (device_prop.major == 6) { + return false; + } + } + + if (param.format == param::ConvBias::Format::NCHW8 || + param.format == param::ConvBias::Format::CHWN4) + return false; + if (param.format == param::ConvBias::Format::NCHW32) { + auto&& filter_meta = args.filter_meta; + // NCHW32 layout only support group = 1 + if (filter_meta.group != 1) + return false; + // The data type (CUDNN_DATA_INT8x32) can only be used with algo + // "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM", for details, see + // https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html + if (m_cudnn_enum != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) + return false; + // check cudnn version + if (CUDNN_VERSION < 7500) + return false; + // sm version + auto&& device_prop = current_device_prop(); + if (device_prop.major < 7 || + (device_prop.major == 7 && device_prop.minor < 5)) + return false; + } + + CUDNNForwardDescs D; + + if (CUDNN_VERSION < 7401) + return false; + + args.init_conv_bias_desc(D); + switch (args.nonlinear_mode) { + case param::ConvBias::NonlineMode::RELU: + break; + case param::ConvBias::NonlineMode::SIGMOID: + // forbits sigmoid for quantized + if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED) + return false; + MEGDNN_FALLTHRU // XXX: why? 
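+            // Likely because SIGMOID is executed as an IDENTITY conv-bias
+            // followed by a separate elemwise sigmoid (see exec() below),
+            // so the same algo restriction applies to it.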
+ case param::ConvBias::NonlineMode::IDENTITY + : if (m_cudnn_enum != + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { + // cudnn require algo to + // CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM + // when activation if IDENTITY + return false; + } + break; + case param::ConvBias::NonlineMode::H_SWISH: + return false; + default: + megdnn_throw(megdnn_mangle("unsupported NonlineMode")); + } + size_t workspace_size; + auto status = cudnnGetConvolutionForwardWorkspaceSize( + args.handle->cudnn_handle(), D.src_desc.desc, D.filter_desc.desc, + D.conv_desc.conv_desc, D.dst_desc.desc, m_cudnn_enum, + &workspace_size); + return status == CUDNN_STATUS_SUCCESS; +} + +size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::get_workspace_in_bytes( + const SizeArgs& args) const { + CUDNNForwardDescs D; + + args.init_conv_bias_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionForwardWorkspaceSize( + args.handle->cudnn_handle(), D.src_desc.desc, D.filter_desc.desc, + D.conv_desc.conv_desc, D.dst_desc.desc, m_cudnn_enum, + &workspace_size); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv fwd get workspace failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); + if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() && + args.src_layout->dtype.category() != DTypeCategory::FLOAT) { + // cudnn require bias to be float when executing CONFIG_INT + // convert bias to float if bias is not float at first + workspace_size += sizeof(float) * args.bias_layout->span().dist_elem(); + } + return workspace_size; +} + +void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec( + const ExecArgs& args) const { +#if CUDNN_MAJOR < 7 + megdnn_throw(megdnn_mangle("ConvBias require cudnn 7.0 or higher")); +#else + megdnn_assert(cudnnGetVersion() >= 7401); + CUDNNForwardDescs D; + args.init_conv_bias_desc(D); + float alpha = 1.0f, beta = 0.0f; + if (args.z_layout->ndim > 0) + beta = 1.0f; + + auto get_scale = [](const DType& dtype) -> float { + megdnn_assert(dtype.category() == DTypeCategory::QUANTIZED); + switch (dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return dtype.param<_dt>().scale; + MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) +#undef cb + default: + megdnn_assert_internal(0); + } + }; + + megdnn_assert(args.src_layout->dtype.category() == + args.dst_layout->dtype.category() && + args.src_tensor->layout.dtype.category() == + args.filter_layout->dtype.category()); + + if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED) { + auto expected_bias_scale = get_scale(args.src_layout->dtype) * + get_scale(args.filter_layout->dtype); + alpha = expected_bias_scale / get_scale(args.dst_layout->dtype); + if (args.z_layout->ndim > 0) { + beta = get_scale(args.z_layout->dtype) / + get_scale(args.dst_layout->dtype); + } + if (args.bias_layout->dtype.category() == DTypeCategory::QUANTIZED) { + megdnn_assert(fabs(expected_bias_scale - + get_scale(args.bias_layout->dtype)) < 1e-4); + } + } + + auto workspace_ptr = args.workspace.raw_ptr; + auto workspace_size = args.workspace.size; + auto bias_ptr = args.bias_tensor->raw_ptr; + if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() && + args.src_layout->dtype.category() != DTypeCategory::FLOAT) { + auto cvt = args.handle->create_operator(); + auto float_bias_layout = *args.bias_layout; + auto converted_bias_layout = *args.bias_layout; + converted_bias_layout.dtype = dtype::QuantizedS32(alpha); + float_bias_layout.dtype = dtype::Float32(); + auto bias_size_in_bytes = 
float_bias_layout.span().dist_byte(); + megdnn_assert(args.workspace.size >= bias_size_in_bytes); + cvt->exec({args.bias_tensor->raw_ptr, converted_bias_layout}, + TensorND{workspace_ptr, float_bias_layout}); + + bias_ptr = workspace_ptr; + workspace_ptr += bias_size_in_bytes; + workspace_size -= bias_size_in_bytes; + } + + cudnnStatus_t status; + if (args.z_layout->ndim == 0) { + status = cudnnConvolutionBiasActivationForward( + args.handle->cudnn_handle(), &alpha, D.src_desc.desc, + args.src_tensor->raw_ptr, D.filter_desc.desc, + args.filter_tensor->raw_ptr, D.conv_desc.conv_desc, + m_cudnn_enum, workspace_ptr, workspace_size, &beta, + D.dst_desc.desc, args.dst_tensor->raw_ptr, D.bias_desc.desc, + bias_ptr, D.conv_desc.act_desc, D.dst_desc.desc, + args.dst_tensor->raw_ptr); + } else { + status = cudnnConvolutionBiasActivationForward( + args.handle->cudnn_handle(), &alpha, D.src_desc.desc, + args.src_tensor->raw_ptr, D.filter_desc.desc, + args.filter_tensor->raw_ptr, D.conv_desc.conv_desc, + m_cudnn_enum, workspace_ptr, workspace_size, &beta, + D.z_desc.desc, args.z_tensor->raw_ptr, D.bias_desc.desc, + bias_ptr, D.conv_desc.act_desc, D.dst_desc.desc, + args.dst_tensor->raw_ptr); + } + + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv fwd failed: %s; info: %s, algo %s", + cudnnGetErrorString(status), args.to_string().c_str(), + name()); + // Noline + switch (args.nonlinear_mode) { + case param::ConvBias::NonlineMode::RELU: + break; + case param::ConvBias::NonlineMode::SIGMOID: { + megdnn_assert(args.dst_layout->dtype.category() != + DTypeCategory::QUANTIZED); + auto&& elem_opr = args.handle->create_operator(); + elem_opr->param().mode = Elemwise::Param::Mode::SIGMOID; + elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor)); + break; + } + case param::ConvBias::NonlineMode::IDENTITY: + break; + default: + megdnn_throw(megdnn_mangle("unsupported NonlineMode")); + } +#endif +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/group_conv.cpp b/dnn/src/cuda/conv_bias/group_conv.cpp new file mode 100644 index 00000000..cfcf60a3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/group_conv.cpp @@ -0,0 +1,165 @@ +/** + * \file dnn/src/cuda/conv_bias/group_conv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +void ConvBiasForwardImpl::AlgoGroupConvGeneral::modify_size_args( + ConvBiasForwardImpl::AlgoBase::SizeArgs& args, TensorLayout& src_pg, + TensorLayout& dst_pg, TensorLayout& bias_pg) { + src_pg = *args.src_layout; + dst_pg = *args.dst_layout; + bias_pg = *args.bias_layout; + auto nr_grp = args.filter_meta.group; + args.filter_meta.group = 1; + size_t c_pos; + if (args.filter_meta.format == Param::Format::NCHW || + args.filter_meta.format == Param::Format::NCHW4) { + c_pos = 1; + } else { + megdnn_assert(args.filter_meta.format == Param::Format::NHWC, + "invalid conv format"); + c_pos = 3; + } + src_pg.shape[c_pos] /= nr_grp; + dst_pg.shape[c_pos] /= nr_grp; + bias_pg.ndim = 0; + args.src_layout = &src_pg; + args.dst_layout = &dst_pg; + args.bias_layout = &bias_pg; + args.nonlinear_mode = Param::NonlineMode::IDENTITY; +} + +ConvBiasForwardImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral(AlgoBase* impl) + : m_impl{impl} { + m_name = ConvBiasForward::algo_name( + ssprintf("%s:%s", "CUDA:GROUP_CONV", impl->name()), {}); +} + +bool ConvBiasForwardImpl::AlgoGroupConvGeneral::is_available( + const SizeArgs& args) const { + if (args.z_layout->ndim > 0 || args.filter_meta.group <= 1) + return false; + auto&& param = args.opr->param(); + if (param.format == param::ConvBias::Format::NCHW8 || + param.format == param::ConvBias::Format::CHWN4 || + param.format == param::ConvBias::Format::NCHW32) + return false; + + auto sub_args = args; + TensorLayout src_pg, dst_pg, bias_pg; + modify_size_args(sub_args, src_pg, dst_pg, bias_pg); + return m_impl->is_available(sub_args); +} + +WorkspaceBundle ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + SmallVector sizes; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + sizes.push_back(dst_layout.span().dist_byte()); + } + + auto sub_args = args; + sub_args.dst_layout = &dst_layout; + TensorLayout src_pg, dst_pg, bias_pg; + modify_size_args(sub_args, src_pg, dst_pg, bias_pg); + sizes.insert(sizes.begin(), + m_impl->get_workspace_in_bytes(sub_args)); + return {ptr, std::move(sizes)}; +} + +size_t ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec( + const ExecArgs& args) const { + auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); + auto conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + { + auto sub_args = args; + sub_args.dst_tensor = &conv_dst_tensor; + sub_args.dst_layout = &conv_dst_tensor.layout; + TensorND tsrc{*args.src_tensor}, tdst{conv_dst_tensor}, tbias{*args.bias_tensor}; + SmallVector flt_shape(0); + std::vector flt_stride(0); + size_t idx = 0; + // check if the first dim is group + if (args.filter_tensor->layout.ndim > args.src_layout->ndim) + ++idx; + for 
(; idx < args.filter_tensor->layout.ndim; ++idx) { + flt_shape.push_back(args.filter_tensor->layout[idx]); + flt_stride.push_back(args.filter_tensor->layout.stride[idx]); + } + TensorND tflt{args.filter_tensor->raw_ptr, + TensorLayout{flt_shape, flt_stride, + args.filter_tensor->layout.dtype, + args.filter_tensor->layout.format}}; + + modify_size_args(sub_args, tsrc.layout, tdst.layout, tbias.layout); + sub_args.src_tensor = &tsrc; + sub_args.dst_tensor = &tdst; + sub_args.filter_tensor = &tflt; + sub_args.bias_tensor = &tbias; + + size_t c_pos; + if (args.filter_meta.format == Param::Format::NCHW || + args.filter_meta.format == Param::Format::NCHW4) { + c_pos = 1; + } else { + megdnn_assert(args.filter_meta.format == Param::Format::NHWC, + "invalid conv format"); + c_pos = 3; + } + + auto grp = args.filter_meta.group; + + auto&& fm = args.filter_meta; + auto strd_src = tsrc.layout.stride[c_pos] * fm.icpg * + tsrc.layout.dtype.size(), + strd_dst = tdst.layout.stride[c_pos] * fm.ocpg * + tdst.layout.dtype.size(), + strd_flt = fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] * + tflt.layout.dtype.size(); + if (args.filter_meta.format == Param::Format::NCHW4) { + strd_src >>= 2; + strd_dst >>= 2; + } + for (uint32_t g = 0; g < grp; ++g) { + m_impl->exec(sub_args); + incr_voidp(tsrc.raw_ptr, strd_src); + incr_voidp(tdst.raw_ptr, strd_dst); + incr_voidp(tflt.raw_ptr, strd_flt); + } + } + handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/helper.cpp b/dnn/src/cuda/conv_bias/helper.cpp new file mode 100644 index 00000000..e36eb88a --- /dev/null +++ b/dnn/src/cuda/conv_bias/helper.cpp @@ -0,0 +1,227 @@ +/** + * \file dnn/src/cuda/conv_bias/helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/conv_bias/helper.h" + +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +ConvBiasDesc::ConvBiasDesc() { + cudnn_check(cudnnCreateActivationDescriptor(&act_desc)); + cudnn_check(cudnnCreateConvolutionDescriptor(&conv_desc)); +#if CUDNN_VERSION >= 7000 + cudnn_check(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH)); +#endif +} + +ConvBiasDesc::~ConvBiasDesc() { + cudnn_check(cudnnDestroyConvolutionDescriptor(conv_desc)); + cudnn_check(cudnnDestroyActivationDescriptor(act_desc)); +} + +void ConvBiasDesc::set_conv_bias(DType data_type, const param::ConvBias& param, + size_t nr_group) { +#if CUDNN_VERSION < 7100 + megdnn_throw(megdnn_mangle( + "ConvBias(CUDNN_ACTIVATION_IDENTITY) require cudnn 7.1 or higher")); +#else + cudnnConvolutionMode_t mode; + using Param = param::ConvBias; + switch (param.mode) { + case Param::Mode::CROSS_CORRELATION: + mode = CUDNN_CROSS_CORRELATION; + break; + case Param::Mode::CONVOLUTION: + mode = CUDNN_CONVOLUTION; + break; + default: + megdnn_throw(megdnn_mangle("conv mode must be conv or xcorr.")); + } + cudnn_check(cudnnSetConvolutionGroupCount(conv_desc, nr_group)); + cudnnDataType_t compute_type; + switch (data_type.category()) { + case DTypeCategory::FLOAT: + compute_type = CUDNN_DATA_FLOAT; + break; + case DTypeCategory::INT: + case DTypeCategory::QUANTIZED: + compute_type = CUDNN_DATA_INT32; + break; + default: + megdnn_throw(megdnn_mangle("unspport data type for conv bias")); + } + if (data_type.enumv() == DTypeEnum::Float16) { + auto comp_mode = param.compute_mode; + compute_type = get_compute_type_fp16(comp_mode); + } + cudnn_check(cudnnSetConvolution2dDescriptor( + conv_desc, param.pad_h, param.pad_w, param.stride_h, param.stride_w, + param.dilate_h, param.dilate_w, mode, compute_type)); + + switch (param.nonlineMode) { + case Param::NonlineMode::IDENTITY: + case Param::NonlineMode::SIGMOID: + case Param::NonlineMode::H_SWISH: + cudnn_check(cudnnSetActivationDescriptor( + act_desc, CUDNN_ACTIVATION_IDENTITY, + CUDNN_NOT_PROPAGATE_NAN, 0)); + break; + case Param::NonlineMode::RELU: + cudnn_check(cudnnSetActivationDescriptor( + act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, + 0)); + break; + default: + megdnn_throw(megdnn_mangle("unsupported non linear mode")); + } +#endif +} + +void ConvBiasDesc::set_conv(DType data_type, const param::ConvBias& param, + const size_t nr_group) { + using Param = param::ConvBias; + cudnnConvolutionMode_t mode; + switch (param.mode) { + case Param::Mode::CROSS_CORRELATION: + mode = CUDNN_CROSS_CORRELATION; + break; + case Param::Mode::CONVOLUTION: + mode = CUDNN_CONVOLUTION; + break; + default: + megdnn_throw(megdnn_mangle("conv mode must be conv or xcorr.")); + } + cudnnDataType_t compute_type; + MEGDNN_MARK_USED_VAR(compute_type); + if (data_type.enumv() == DTypeEnum::Float32) { + // FLOAT_CONFIG + compute_type = CUDNN_DATA_FLOAT; + } else if (data_type.enumv() == DTypeEnum::Float16) { + auto comp_mode = param.compute_mode; + compute_type = get_compute_type_fp16(comp_mode); +#if CUDNN_MAJOR >= 7 + } else if (data_type.category() == DTypeCategory::INT || + data_type.category() == DTypeCategory::QUANTIZED) { + compute_type = CUDNN_DATA_INT32; +#endif + } else { + megdnn_throw(megdnn_mangle("unspport data type for conv bias")); + } +#if CUDNN_MAJOR >= 7 + cudnn_check(cudnnSetConvolutionGroupCount(conv_desc, nr_group)); +#else + megdnn_assert(nr_group == 1); +#endif + +#if CUDNN_MAJOR >= 6 + cudnn_check(cudnnSetConvolution2dDescriptor( + conv_desc, 
param.pad_h, param.pad_w, param.stride_h, param.stride_w, + param.dilate_h, param.dilate_w, mode, compute_type)); +#else + cudnn_check(cudnnSetConvolution2dDescriptor( + conv_desc, param.pad_h, param.pad_w, param.stride_h, param.stride_w, + param.dilate_h, param.dilate_w, mode)); +#endif +} + +namespace conv_bias { + +bool is_cudnn_supported(const BiasForwardSizeArgs& args) { + // CUDNN_STATUS_EXECUTION_FAILED on Tegra K1, so disable CUDNN + // on Tegra K1. + if (args.handle->is_tegra_k1()) + return false; + + // TODO: We only support NCHW format now. It seems cuDNN provides support + // for NHWC as well. + if (args.filter_meta.format == param::Convolution::Format::NCHW4) { + if (args.dst_layout->dtype.enumv() != DTypeEnum::Int8 && + args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) { + return false; + } + } else if (args.filter_meta.format != param::Convolution::Format::NCHW) { + return false; + } + auto& fm = args.filter_meta; + bool supported = true; + supported &= (fm.spatial_ndim == 2); +#if CUDNN_VERSION < 7000 + supported &= (fm.group == 1); +#endif +#if CUDNN_VERSION < 7500 + supported &= (fm.dilation[0] == 1 && fm.dilation[1] == 1); +#endif + return supported; +} + +bool check_bias_share_in_channel(const TensorLayout& bias, + const param::ConvBias::Format format) { + bool share_in_channel = false; + if (format == param::ConvBias::Format::NCHW) { + share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 && + bias[3] == 1); + } else if (format == param::ConvBias::Format::NHWC) { + share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 && + bias[2] == 1); + } else if (format == param::ConvBias::Format::NCHW4 || + format == param::ConvBias::Format::NCHW8 || + format == param::ConvBias::Format::NCHW32) { + share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 && + bias[3] == 1); + } else if (format == param::ConvBias::Format::NHWCD4) { + share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 && + bias[3] == 1); + } else { + megdnn_assert(format == param::ConvBias::Format::CHWN4); + share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 && + bias[3] == 1); + } + return share_in_channel; +} + +WorkspaceBundle matmul_get_workspace_bundle(const BiasForwardSizeArgs& args) { + auto dtype = args.src_layout->dtype; + auto&& fm = args.filter_meta; + megdnn_assert(fm.group == 1); + auto N = args.src_layout->shape[0]; + auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; + auto OH = args.dst_layout->shape[2], OW = args.dst_layout->shape[3]; + SmallVector sizes{dtype.size() * args.dst_layout->total_nr_elems(), + dtype.size() * IC * FH * FW * OH * OW * N}; + if (args.filter_meta.should_flip) { + sizes.push_back(dtype.size() * OC * IC * FH * FW); + } + return {nullptr, std::move(sizes)}; +} + +void flip_filter(const BiasForwardSizeArgs& args, const Workspace& workspace, + void*& raw_ptr) { + auto&& fm = args.filter_meta; + megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2); + auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; + auto dtype = fm.dtype; + megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW); + + TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}}, + dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout}; + dst.layout.stride[2] = -dst.layout.stride[2]; + dst.layout.stride[3] = -dst.layout.stride[3]; + args.handle->relayout_opr()->exec(src, dst); + raw_ptr = workspace.raw_ptr; +} + +} // conv_bias + +} // cuda +} // megdnn + +// vim: syntax=cpp.doxygen 
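The flip_filter() helper above flips a dense (OC, IC, FH, FW) filter without a dedicated flip kernel: it offsets the destination pointer by (FH*FW - 1) elements, negates the two spatial strides, and lets the relayout operator copy through that view. The following standalone host-side sketch (plain C++ with explicit loops, not the MegDNN API; the toy shapes are illustrative only) shows why writing through such a negative-stride view is equivalent to an explicit spatial flip of every (FH, FW) plane.

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    const std::ptrdiff_t OC = 2, IC = 3, FH = 3, FW = 3;
    std::vector<float> src(OC * IC * FH * FW), dst(src.size());
    for (std::size_t i = 0; i < src.size(); ++i)
        src[i] = static_cast<float>(i);

    // Contiguous strides of the (OC, IC, FH, FW) layout.
    const std::ptrdiff_t s_oc = IC * FH * FW, s_ic = FH * FW, s_fh = FW, s_fw = 1;

    // Destination "view": base pointer advanced by (FH*FW - 1) elements and
    // spatial strides negated -- the same layout flip_filter() hands to the
    // relayout operator.
    float* dst_base = dst.data() + (FH * FW - 1);

    // Copy src into the negative-stride view.
    for (std::ptrdiff_t oc = 0; oc < OC; ++oc)
        for (std::ptrdiff_t ic = 0; ic < IC; ++ic)
            for (std::ptrdiff_t fh = 0; fh < FH; ++fh)
                for (std::ptrdiff_t fw = 0; fw < FW; ++fw)
                    dst_base[oc * s_oc + ic * s_ic - fh * s_fh - fw * s_fw] =
                            src[oc * s_oc + ic * s_ic + fh * s_fh + fw * s_fw];

    // The result equals an explicit flip of every (FH, FW) plane.
    for (std::ptrdiff_t oc = 0; oc < OC; ++oc)
        for (std::ptrdiff_t ic = 0; ic < IC; ++ic)
            for (std::ptrdiff_t fh = 0; fh < FH; ++fh)
                for (std::ptrdiff_t fw = 0; fw < FW; ++fw)
                    assert(dst[oc * s_oc + ic * s_ic + fh * s_fh + fw * s_fw] ==
                           src[oc * s_oc + ic * s_ic + (FH - 1 - fh) * s_fh +
                               (FW - 1 - fw) * s_fw]);
    return 0;
}

On the device the copy itself is performed by relayout_opr() rather than by loops, so the only cost is one OC*IC*FH*FW-sized workspace buffer, which is exactly what matmul_get_workspace_bundle() reserves when should_flip is set.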
diff --git a/dnn/src/cuda/conv_bias/helper.h b/dnn/src/cuda/conv_bias/helper.h new file mode 100644 index 00000000..7be32e6d --- /dev/null +++ b/dnn/src/cuda/conv_bias/helper.h @@ -0,0 +1,116 @@ +/** + * \file dnn/src/cuda/conv_bias/helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "./opr_impl.h" +#include "src/cuda/handle.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/common/utils.h" +#include "src/common/algo_chooser.h" + +namespace megdnn { +namespace cuda { + +class ConvBiasDesc { +public: + ConvBiasDesc(); + void set_conv_bias(DType data_type, const param::ConvBias& param, + const size_t nr_group); + void set_conv(DType data_type, const param::ConvBias& param, + const size_t nr_group); + ~ConvBiasDesc(); + cudnnConvolutionDescriptor_t conv_desc; + cudnnActivationDescriptor_t act_desc; +}; + +namespace conv_bias { + using CanonizedFilterMeta = ConvBiasForward::CanonizedFilterMeta; + + //! conv size descriptor in the forward view + struct BiasForwardSizeArgs { + HandleImpl *handle; + const TensorLayout *src_layout; + const TensorLayout *filter_layout; + const TensorLayout *bias_layout; + const TensorLayout *z_layout; + CanonizedFilterMeta filter_meta; + const TensorLayout *dst_layout; + param::ConvBias::NonlineMode nonlinear_mode; + }; + + //! whether cudnn is supported for a filter meta + bool is_cudnn_supported(const BiasForwardSizeArgs& args); + + //! get workspace bundle for matmul algo + WorkspaceBundle matmul_get_workspace_bundle( + const BiasForwardSizeArgs& args); + + /*! + * \brief flip conv filter + * + * Flip conv filter pointed by \p raw_ptr, store result in workspace, and + * change \p raw_ptr to workspace. + */ + void flip_filter(const BiasForwardSizeArgs& args, + const Workspace& workspace, void*& raw_ptr); + + struct CUDNNForwardDescs { + TensorDesc src_desc, dst_desc, bias_desc, z_desc; + FilterDesc filter_desc; + ConvBiasDesc conv_desc; + + void set_conv_bias(const TensorLayout& src, + const CanonizedFilterMeta& filter, + const TensorLayout& dst, const TensorLayout& bias, + const TensorLayout& z, + const param::ConvBias& param) { + src_desc.set(src, param.format); + filter_desc.set(filter); + if (z.ndim > 0) { + z_desc.set(z, param.format); + } + dst_desc.set(dst, param.format); + conv_desc.set_conv_bias(src.dtype, param, filter.group); + + // cudnn requires the bias to be float tensor. + auto float_bias_layout = bias; + float_bias_layout.dtype = dtype::Float32(); + if (param.format == param::ConvBias::Format::NCHW4 || + param.format == param::ConvBias::Format::NCHW32) { + // cudnn require bias to be NCHW, not NCHW4. 
+ float_bias_layout = float_bias_layout.reshape( + {float_bias_layout[0], + float_bias_layout[1] * float_bias_layout[4], + float_bias_layout[2], float_bias_layout[3]}); + bias_desc.set(float_bias_layout); + } else { + bias_desc.set(float_bias_layout, param.format); + } + } + + void set_conv(const TensorLayout& src, + const CanonizedFilterMeta& filter, + const TensorLayout& dst, const param::ConvBias& param) { + src_desc.set(src, param.format); + filter_desc.set(filter); + dst_desc.set(dst, param.format); + conv_desc.set_conv(src.dtype, param, filter.group); + } + }; + + bool check_bias_share_in_channel(const TensorLayout& bias, + const param::ConvBias::Format format); + +} // namespace conv_bias +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_dp4a.cpp new file mode 100644 index 00000000..1506ff6e --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_dp4a.cpp @@ -0,0 +1,209 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_dp4a.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" +#include "src/cuda/convolution_helper/layout.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void dispatch_kernel(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, Epilogue epilogue, + const ConvParam& param, float alpha, float beta, + cudaStream_t stream) { + void (*kern_wrapper)(const int8_t*, const int8_t*, BiasVisitor, Epilogue, + const ConvParam&, float, float, cudaStream_t); + using namespace conv_bias_int8; + // for turing + if (is_compute_capability_required(7, 5)) { + bool use_ld_64bit = param.n % 2 == 0; + bool use_unroll_width = + param.n < 128 && (param.wo % 2 == 0 || param.wo % 3 == 0); + if (use_ld_64bit) { + if (use_unroll_width) { + kern_wrapper = + do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width< + BiasVisitor, Epilogue>; + } else { + kern_wrapper = + do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit< + BiasVisitor, Epilogue>; + } + } else { + if (use_unroll_width) { + kern_wrapper = + do_conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width< + BiasVisitor, Epilogue>; + } else { + kern_wrapper = + do_conv_bias_int8_implicit_gemm_cdiv4hwn4; + } + } + } else { // volta or lower + if (param.n % 2 == 0) { + kern_wrapper = do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit< + BiasVisitor, Epilogue>; + } else { + kern_wrapper = + do_conv_bias_int8_implicit_gemm_cdiv4hwn4; + } + } + megdnn_assert(kern_wrapper != nullptr); + return kern_wrapper(d_src, d_filter, bias_visitor, epilogue, param, alpha, + beta, stream); +} +} // namespace + +bool ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm::is_available( + const SizeArgs& args) const { + if (args.bias_layout->ndim <= 0) + return false; + + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = 
Param::Mode; + bool available = true; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), + param.format)) + return false; + if (param.format != Format::CHWN4) + return false; + UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + // TODO support group conv + available &= param.sparse == Sparse::DENSE; + // mode must be cross correlation + available &= param.mode == Mode::CROSS_CORRELATION; + // check data type + auto src_dtype = args.src_layout->dtype, + filter_dtype = args.filter_layout->dtype, + bias_dtype = args.bias_layout->dtype, + dst_dtype = args.dst_layout->dtype; + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // TODO: support dialtion + available &= dh == 1 && dw == 1; + // only support sm_61 or later, platform should have fast native int8 + // support + available &= is_compute_capability_required(6, 1); + return available; +} + +size_t +ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm::get_workspace_in_bytes( + const SizeArgs& /* args */) const { + return 0; +} + +void ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm::exec( + const ExecArgs& args) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + auto&& stream = cuda_stream(args.opr->handle()); + + ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = + args.bias_layout->dtype.param().scale, + dst_scale = args.dst_layout->dtype.param().scale; + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale; + int8_t* z_dev_ptr = nullptr; + float gamma = 1.f; + if (args.z_layout->ndim > 0) { + z_dev_ptr = args.z_tensor->compatible_ptr(); + float z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } + PerChannelBiasVisitor bias_visitor; + bias_visitor.bias = args.bias_tensor->compatible_ptr(); + dispatch_nonlinear_mode( + args.src_tensor->compatible_ptr(), + args.filter_tensor->compatible_ptr(), bias_visitor, + z_dev_ptr, args.dst_tensor->compatible_ptr(), kern_param, + alpha, beta, gamma, dst_scale, stream, param.nonlineMode); +} + +template +void ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm:: + dispatch_nonlinear_mode(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, const int8_t* d_z, + int8_t* d_dst, const ConvParam& param, + float alpha, float beta, float gamma, + float scale, cudaStream_t stream, + param::ConvBias::NonlineMode nonlinear_mode) { + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + Layout layout; + layout.init(param.n, param.co, param.ho, param.wo); +#define DISPATCH_CONV_INT8_EPILOGUE(_act_op) \ + do { \ + IConvEpilogue<_act_op> epilogue{d_dst, \ + d_z, \ + layout.batch_stride, \ + layout.channel_stride / 4, \ + layout.height_stride, \ + layout.width_stride, \ + gamma, \ + _act_op{scale, 1.f / scale}}; \ + 
dispatch_kernel>( \ + d_src, d_filter, bias_visitor, epilogue, param, alpha, beta, \ + stream); \ + return; \ + } while (0) +#define cb(_nonline_mode) \ + if (static_cast(nonlinear_mode) == NonlineMode::_nonline_mode) { \ + DISPATCH_CONV_INT8_EPILOGUE(Activation); \ + } + MEGDNN_FOREACH_NONLINE_MODE(cb); + megdnn_assert(false, "unsupported nonlinear mode for conv bias operator"); +#undef cb +#undef DISPATCH_CONV_INT8_EPILOGUE +} + +#define INST(_visitor) \ + template void ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm:: \ + dispatch_nonlinear_mode<_visitor>( \ + const int8_t* d_src, const int8_t* d_filter, \ + _visitor bias_visitor, const int8_t* d_z, int8_t* d_dst, \ + const ConvParam& param, float alpha, float beta, \ + float gamma, float scale, cudaStream_t stream, \ + param::ConvBias::NonlineMode nonlinear_mode); + +INST(PerChannelBiasVisitor); + +#undef INST + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma.cpp new file mode 100644 index 00000000..cdfc7e5d --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma.cpp @@ -0,0 +1,217 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" +#include "src/cuda/convolution_helper/layout.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +#if CUDA_VERSION >= 10000 +namespace { +using MMATileSize = + ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize; + +template +void dispatch_kernel(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, Epilogue epilogue, + const ConvParam& param, float alpha, float beta, + cudaStream_t stream, MMATileSize mma_tile_size) { + void (*kern_wrapper)(const int8_t*, const int8_t*, BiasVisitor, Epilogue, + const ConvParam& param, float alpha, float beta, + cudaStream_t stream); + using namespace conv_bias_int8; + // for turing + switch (mma_tile_size) { + case MMATileSize::IMMA8x32x16: + kern_wrapper = + do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4< + BiasVisitor, Epilogue>; + break; + case MMATileSize::IMMA32x8x16: + kern_wrapper = + do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4< + BiasVisitor, Epilogue>; + break; + case MMATileSize::IMMA16x16x16: + kern_wrapper = + do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4< + BiasVisitor, Epilogue>; + break; + default: + megdnn_assert(false, "invalid mma tile size"); + } + return kern_wrapper(d_src, d_filter, bias_visitor, epilogue, param, alpha, + beta, stream); +} +}; // namespace + +bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::is_available( + const SizeArgs& args) const { + if (args.bias_layout->ndim <= 0) + return false; + + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + bool available = true; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + if 
(!conv_bias::check_bias_share_in_channel(*(args.bias_layout), + param.format)) + return false; + if (param.format != Format::CHWN4) + return false; + UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + // TODO support group conv + available &= param.sparse == Sparse::DENSE; + // mode must be cross correlation + available &= param.mode == Mode::CROSS_CORRELATION; + // check data type + auto src_dtype = args.src_layout->dtype, + filter_dtype = args.filter_layout->dtype, + bias_dtype = args.bias_layout->dtype, + dst_dtype = args.dst_layout->dtype; + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // check layout + available &= (ci % 16 == 0); + // TODO: support dialtion + available &= dh == 1 && dw == 1; + // only support sm_75 or later, platform should have tensorcore int8 + // support + available &= is_compute_capability_required(7, 5); + return available; +} + +size_t +ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::get_workspace_in_bytes( + const SizeArgs& /* args */) const { + return 0; +} + +void ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::exec( + const ExecArgs& args) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + auto&& stream = cuda_stream(args.opr->handle()); + + ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = + args.bias_layout->dtype.param().scale, + dst_scale = args.dst_layout->dtype.param().scale; + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale; + int8_t* z_dev_ptr = nullptr; + float gamma = 1.f; + if (args.z_layout->ndim > 0) { + z_dev_ptr = args.z_tensor->compatible_ptr(); + float z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } + PerChannelBiasVisitor bias_visitor; + bias_visitor.bias = args.bias_tensor->compatible_ptr(); + dispatch_nonlinear_mode( + args.src_tensor->compatible_ptr(), + args.filter_tensor->compatible_ptr(), bias_visitor, + z_dev_ptr, args.dst_tensor->compatible_ptr(), kern_param, + alpha, beta, gamma, dst_scale, stream, param.nonlineMode, + m_mma_tile_size); +} + +template +void ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm:: + dispatch_nonlinear_mode(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, int8_t* d_z, + int8_t* d_dst, const ConvParam& param, + float alpha, float beta, float gamma, + float scale, cudaStream_t stream, + param::ConvBias::NonlineMode nonlinear_mode, + MMATileSize mma_tile_size) { + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + Layout layout; + layout.init(param.n, param.co, param.ho, param.wo); +#define DISPATCH_CONV_IMMA_EPILOGUE(_act_op) \ + do { \ + IConvEpilogue<_act_op> epilogue{d_dst, \ + d_z, \ + layout.batch_stride, \ + layout.channel_stride / 4, \ + layout.height_stride, \ + layout.width_stride, \ + gamma, \ + _act_op{scale, 1.f / scale}}; \ + dispatch_kernel>( \ + d_src, d_filter, 
bias_visitor, epilogue, param, alpha, beta, \ + stream, mma_tile_size); \ + return; \ + } while (0) +#define cb(_nonline_mode) \ + if (static_cast(nonlinear_mode) == NonlineMode::_nonline_mode) { \ + DISPATCH_CONV_IMMA_EPILOGUE(Activation); \ + } + MEGDNN_FOREACH_NONLINE_MODE(cb); + megdnn_assert(false, "unsupported nonlinear mode for conv bias operator"); +#undef cb +#undef DISPATCH_CONV_IMMA_EPILOGUE +} + +#define INST(_visitor) \ + template void ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm:: \ + dispatch_nonlinear_mode<_visitor>( \ + const int8_t* d_src, const int8_t* d_filter, \ + _visitor bias_visitor, int8_t* d_z, int8_t* d_dst, \ + const ConvParam& param, float alpha, float beta, \ + float gamma, float scale, cudaStream_t stream, \ + param::ConvBias::NonlineMode nonlinear_mode, \ + MMATileSize mma_tile_size); + +INST(PerChannelBiasVisitor); + +std::string ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::to_string( + MMATileSize mma_tile_size) { + switch (mma_tile_size) { + case MMATileSize::IMMA8x32x16: + return "mma8x32x16"; + case MMATileSize::IMMA32x8x16: + return "mma32x8x16"; + case MMATileSize::IMMA16x16x16: + return "mma16x16x16"; + default: + megdnn_assert_internal(false); + } +} + +#undef INST +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp new file mode 100644 index 00000000..3cc00de0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp @@ -0,0 +1,218 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_reorder_filter.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" +#include "src/cuda/convolution_helper/layout.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +#if CUDA_VERSION >= 10000 +namespace { +using MMATileSize = + ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize; + +template +void dispatch_kernel(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, Epilogue epilogue, + const ConvParam& param, float alpha, float beta, + cudaStream_t stream, MMATileSize mma_tile_size) { + void (*kern_wrapper)(const int8_t*, const int8_t*, BiasVisitor, Epilogue, + const ConvParam& param, float alpha, float beta, + cudaStream_t stream); + using namespace conv_bias_int8; + switch (mma_tile_size) { + case MMATileSize::IMMA8x32x16: + kern_wrapper = + do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter< + BiasVisitor, Epilogue>; + break; + case MMATileSize::IMMA32x8x16: + kern_wrapper = + do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter< + BiasVisitor, Epilogue>; + break; + case MMATileSize::IMMA16x16x16: + kern_wrapper = + do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter< + BiasVisitor, Epilogue>; + break; + default: + megdnn_assert(false, "invalid mma tile size"); + } + return kern_wrapper(d_src, d_filter, bias_visitor, epilogue, param, alpha, + beta, stream); +} + +template +void dispatch_nonlinear_mode(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, int8_t* d_z, + int8_t* d_dst, const ConvParam& param, float alpha, + float beta, float gamma, float scale, + cudaStream_t stream, + param::ConvBias::NonlineMode nonlinear_mode, + MMATileSize mma_tile_size) { + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + Layout layout; + layout.init(param.n, param.co, param.ho, param.wo); +#define DISPATCH_CONV_IMMA_EPILOGUE(_act_op) \ + do { \ + IConvEpilogue<_act_op> epilogue{d_dst, \ + d_z, \ + layout.batch_stride, \ + layout.channel_stride / 4, \ + layout.height_stride, \ + layout.width_stride, \ + gamma, \ + _act_op{scale, 1.f / scale}}; \ + dispatch_kernel>( \ + d_src, d_filter, bias_visitor, epilogue, param, alpha, beta, \ + stream, mma_tile_size); \ + return; \ + } while (0) +#define cb(_nonline_mode) \ + if (static_cast(nonlinear_mode) == NonlineMode::_nonline_mode) { \ + DISPATCH_CONV_IMMA_EPILOGUE(Activation); \ + } + MEGDNN_FOREACH_NONLINE_MODE(cb); + megdnn_assert(false, "unsupported nonlinear mode for conv bias operator"); +#undef cb +#undef DISPATCH_CONV_IMMA_EPILOGUE +} + +#define INST(_visitor) \ + template void dispatch_nonlinear_mode<_visitor>( \ + const int8_t* d_src, const int8_t* d_filter, \ + _visitor bias_visitor, int8_t* d_z, int8_t* d_dst, \ + const ConvParam& param, float alpha, float beta, float gamma, \ + float scale, cudaStream_t stream, \ + param::ConvBias::NonlineMode nonlinear_mode, \ + MMATileSize mma_tile_size); + +INST(PerChannelBiasVisitor); + +}; // namespace + +bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter:: + is_available(const SizeArgs& args) const { + if (args.bias_layout->ndim <= 0) + return false; + + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + bool available = true; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + if 
(!conv_bias::check_bias_share_in_channel(*(args.bias_layout), + param.format)) + return false; + if (param.format != Format::CHWN4) + return false; + UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + // TODO support group conv + available &= param.sparse == Sparse::DENSE; + // mode must be cross correlation + available &= param.mode == Mode::CROSS_CORRELATION; + // check data type + auto src_dtype = args.src_layout->dtype, + filter_dtype = args.filter_layout->dtype, + bias_dtype = args.bias_layout->dtype, + dst_dtype = args.dst_layout->dtype; + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // check layout + available &= (ci % 16 == 0); + // TODO: support dialtion + available &= dh == 1 && dw == 1; + // only support sm_75 or later, platform should have tensorcore int8 + // support + available &= is_compute_capability_required(7, 5); + return available; +} + +size_t ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter:: + get_workspace_in_bytes(const SizeArgs& args) const { + return args.filter_layout->span().dist_byte(); +} + +void ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter::exec( + const ExecArgs& args) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + // reorder filter + { + TensorLayout in = *(args.filter_layout); + TensorLayout out = {{ci / 16, 4, fh, fw, co, 4}, in.dtype}; + out.stride[0] = 16 * co * fh * fw; + out.stride[1] = 4; + out.stride[2] = fw * co * 16; + out.stride[3] = co * 16; + out.stride[4] = 16; + out.stride[5] = 1; + TensorND ts_in, ts_out; + ts_in.layout = in, ts_out.layout = out; + ts_in.raw_ptr = args.filter_tensor->raw_ptr, + ts_out.raw_ptr = args.workspace.raw_ptr; + args.opr->handle()->create_operator()->exec(ts_in, + ts_out); + } + + auto&& stream = cuda_stream(args.opr->handle()); + + ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = + args.bias_layout->dtype.param().scale, + dst_scale = args.dst_layout->dtype.param().scale; + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale; + int8_t* z_dev_ptr = nullptr; + float gamma = 1.f; + if (args.z_layout->ndim > 0) { + z_dev_ptr = args.z_tensor->compatible_ptr(); + float z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } + PerChannelBiasVisitor bias_visitor; + bias_visitor.bias = args.bias_tensor->compatible_ptr(); + dispatch_nonlinear_mode( + args.src_tensor->compatible_ptr(), + reinterpret_cast(args.workspace.raw_ptr), bias_visitor, + z_dev_ptr, args.dst_tensor->compatible_ptr(), kern_param, + alpha, beta, gamma, dst_scale, stream, param.nonlineMode, + m_mma_tile_size); +} + +#undef INST +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp new file mode 100644 index 
00000000..93fd5cd5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp @@ -0,0 +1,220 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_chwn4_imma_unroll_width.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" +#include "src/cuda/convolution_helper/layout.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +#if CUDA_VERSION >= 10000 +namespace { +using MMATileSize = + ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize; + +template +void dispatch_kernel(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, Epilogue epilogue, + const ConvParam& param, float alpha, float beta, + cudaStream_t stream, MMATileSize mma_tile_size) { + void (*kern_wrapper)(const int8_t*, const int8_t*, BiasVisitor, Epilogue, + const ConvParam& param, float alpha, float beta, + cudaStream_t stream); + using namespace conv_bias_int8; + switch (mma_tile_size) { + case MMATileSize::IMMA8x32x16: + kern_wrapper = + do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width< + BiasVisitor, Epilogue>; + break; + case MMATileSize::IMMA32x8x16: + kern_wrapper = + do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width< + BiasVisitor, Epilogue>; + break; + case MMATileSize::IMMA16x16x16: + kern_wrapper = + do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width< + BiasVisitor, Epilogue>; + break; + default: + megdnn_assert(false, "invalid mma tile size"); + } + return kern_wrapper(d_src, d_filter, bias_visitor, epilogue, param, alpha, + beta, stream); +} + +template +void dispatch_nonlinear_mode(const int8_t* d_src, const int8_t* d_filter, + BiasVisitor bias_visitor, int8_t* d_z, + int8_t* d_dst, const ConvParam& param, float alpha, + float beta, float gamma, float scale, + cudaStream_t stream, + param::ConvBias::NonlineMode nonlinear_mode, + MMATileSize mma_tile_size) { + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + Layout layout; + layout.init(param.n, param.co, param.ho, param.wo); +#define DISPATCH_CONV_IMMA_EPILOGUE(_act_op) \ + do { \ + IConvEpilogue<_act_op> epilogue{d_dst, \ + d_z, \ + layout.batch_stride, \ + layout.channel_stride / 4, \ + layout.height_stride, \ + layout.width_stride, \ + gamma, \ + _act_op{scale, 1.f / scale}}; \ + dispatch_kernel>( \ + d_src, d_filter, bias_visitor, epilogue, param, alpha, beta, \ + stream, mma_tile_size); \ + return; \ + } while (0) +#define cb(_nonline_mode) \ + if (static_cast(nonlinear_mode) == NonlineMode::_nonline_mode) { \ + DISPATCH_CONV_IMMA_EPILOGUE(Activation); \ + } + MEGDNN_FOREACH_NONLINE_MODE(cb); + megdnn_assert(false, "unsupported nonlinear mode for conv bias operator"); +#undef cb +#undef DISPATCH_CONV_IMMA_EPILOGUE +} + +#define INST(_visitor) \ + template void dispatch_nonlinear_mode<_visitor>( \ + const int8_t* d_src, const int8_t* d_filter, \ + _visitor bias_visitor, int8_t* d_z, int8_t* d_dst, \ + const ConvParam& param, float alpha, float beta, 
float gamma, \ + float scale, cudaStream_t stream, \ + param::ConvBias::NonlineMode nonlinear_mode, \ + MMATileSize mma_tile_size); + +INST(PerChannelBiasVisitor); + +}; // namespace + +bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth:: + is_available(const SizeArgs& args) const { + if (args.bias_layout->ndim <= 0) + return false; + + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + bool available = true; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), + param.format)) + return false; + if (param.format != Format::CHWN4) + return false; + UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + // TODO support group conv + available &= param.sparse == Sparse::DENSE; + // mode must be cross correlation + available &= param.mode == Mode::CROSS_CORRELATION; + // check data type + auto src_dtype = args.src_layout->dtype, + filter_dtype = args.filter_layout->dtype, + bias_dtype = args.bias_layout->dtype, + dst_dtype = args.dst_layout->dtype; + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // check batch size + available &= (n % 4 == 0); + // check layout + available &= (ci % 16 == 0); + // TODO: support dialtion + available &= dh == 1 && dw == 1; + // only support sm_75 or later, platform should have tensorcore int8 + // support + available &= is_compute_capability_required(7, 5); + return available; +} + +size_t ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth:: + get_workspace_in_bytes(const SizeArgs& args) const { + return args.filter_layout->span().dist_byte(); +} + +void ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth::exec( + const ExecArgs& args) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + UNPACK_CONV_BIAS_CHWN4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + // reorder filter + { + TensorLayout in = *(args.filter_layout); + TensorLayout out = {{ci / 16, 4, fh, fw, co, 4}, in.dtype}; + out.stride[0] = 16 * co * fh * fw; + out.stride[1] = 4; + out.stride[2] = fw * co * 16; + out.stride[3] = co * 16; + out.stride[4] = 16; + out.stride[5] = 1; + TensorND ts_in, ts_out; + ts_in.layout = in, ts_out.layout = out; + ts_in.raw_ptr = args.filter_tensor->raw_ptr, + ts_out.raw_ptr = args.workspace.raw_ptr; + args.opr->handle()->create_operator()->exec(ts_in, + ts_out); + } + + auto&& stream = cuda_stream(args.opr->handle()); + + ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = + args.bias_layout->dtype.param().scale, + dst_scale = args.dst_layout->dtype.param().scale; + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale; + int8_t* z_dev_ptr = nullptr; + float gamma = 1.f; + if (args.z_layout->ndim > 0) { + z_dev_ptr = args.z_tensor->compatible_ptr(); + float z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / 
dst_scale; + } + PerChannelBiasVisitor bias_visitor; + bias_visitor.bias = args.bias_tensor->compatible_ptr(); + dispatch_nonlinear_mode( + args.src_tensor->compatible_ptr(), + reinterpret_cast(args.workspace.raw_ptr), bias_visitor, + z_dev_ptr, args.dst_tensor->compatible_ptr(), kern_param, + alpha, beta, gamma, dst_scale, stream, param.nonlineMode, + m_mma_tile_size); +} + +#undef INST +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp new file mode 100644 index 00000000..d3c414b4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -0,0 +1,189 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution_helper/bias_visitor.cuh" + +using namespace megdnn; +using namespace cuda; + +bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( + const SizeArgs& args) const { + if (args.bias_layout->ndim <= 0) + return false; + + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + bool available = true; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), + param.format)) + return false; + if (param.format != Format::NCHW4) + return false; + UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + // TODO support group conv + available &= param.sparse == Sparse::DENSE; + // mode must be cross correlation + available &= param.mode == Mode::CROSS_CORRELATION; + // check data type + auto src_dtype = args.src_layout->dtype, + filter_dtype = args.filter_layout->dtype, + bias_dtype = args.bias_layout->dtype, + dst_dtype = args.dst_layout->dtype; + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // TODO: support dialtion + available &= dh == 1 && dw == 1; + // only support sm_61 or later, platform should have fast native int8 + // support + available &= is_compute_capability_required(6, 1); + return available; +} + +WorkspaceBundle +ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_workspace_bundle( + dt_byte* raw_ptr, const SizeArgs& args) const { + size_t ws_size_src = args.src_layout->span().dist_byte(); + size_t ws_size_filter = args.filter_layout->span().dist_byte(); + size_t ws_size_dst = args.dst_layout->span().dist_byte(); + if (args.z_layout->ndim > 0) { + size_t ws_size_z = args.z_layout->span().dist_byte(); + return WorkspaceBundle{ + raw_ptr, {ws_size_src, ws_size_filter, ws_size_dst, ws_size_z}}; + } + return WorkspaceBundle{raw_ptr, {ws_size_src, ws_size_filter, ws_size_dst}}; +} + +size_t +ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void 
ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( + const ExecArgs& args) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + auto ws = get_workspace_bundle(args.workspace.raw_ptr, args); + auto ws_src = ws.get(0); + auto ws_filter = ws.get(1); + auto ws_dst = ws.get(2); + auto&& stream = cuda_stream(args.opr->handle()); + + // reformat src from nchw4 to chwn4 + { + TensorLayout src{{n, ci / 4 * hi * wi}, dtype::Int32()}; + src.init_contiguous_stride(); + TensorLayout dst = src; + dst.stride[0] = 1, dst.stride[1] = dst[0]; + TensorND ts_src, ts_dst; + ts_src.raw_ptr = args.src_tensor->raw_ptr; + ts_src.layout = src; + ts_dst.raw_ptr = ws_src; + ts_dst.layout = dst; + auto&& transpose = + args.opr->handle()->create_operator(); + transpose->exec(ts_src, ts_dst); + } + + // reformat filter from nchw4 to chwn4 + { + TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()}; + src.init_contiguous_stride(); + TensorLayout dst = src; + dst.stride[0] = 1, dst.stride[1] = dst[0]; + TensorND ts_src, ts_dst; + ts_src.raw_ptr = args.filter_tensor->raw_ptr; + ts_src.layout = src; + ts_dst.raw_ptr = ws_filter; + ts_dst.layout = dst; + auto&& transpose = + args.opr->handle()->create_operator(); + transpose->exec(ts_src, ts_dst); + } + + convolution::ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = + args.bias_layout->dtype.param().scale, + dst_scale = args.dst_layout->dtype.param().scale; + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale; + + // process z + int8_t* z_dev_ptr = nullptr; + float gamma = 1.f; + if (args.z_layout->ndim > 0) { + auto ws_z = ws.get(3); + + TensorLayout src{{n, co / 4 * ho * wo}, dtype::Int32()}; + src.init_contiguous_stride(); + TensorLayout dst = src; + dst.stride[0] = 1, dst.stride[1] = dst[0]; + TensorND ts_src, ts_dst; + ts_src.raw_ptr = args.z_tensor->raw_ptr; + ts_src.layout = src; + ts_dst.raw_ptr = ws_z; + ts_dst.layout = dst; + auto&& transpose = + args.opr->handle()->create_operator(); + transpose->exec(ts_src, ts_dst); + z_dev_ptr = reinterpret_cast(ws_z); + float z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } + + convolution::PerChannelBiasVisitor bias_visitor; + bias_visitor.bias = args.bias_tensor->compatible_ptr(); + ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm:: + dispatch_nonlinear_mode( + reinterpret_cast(ws_src), + reinterpret_cast(ws_filter), bias_visitor, + z_dev_ptr, reinterpret_cast(ws_dst), kern_param, + alpha, beta, gamma, dst_scale, stream, param.nonlineMode); + + // reformat chwn4 to nchw4 + { + TensorLayout src{{co / 4 * ho * wo, n}, dtype::Int32()}; + src.init_contiguous_stride(); + TensorLayout dst = src; + dst.stride[0] = 1, dst.stride[1] = dst[0]; + TensorND ts_src, ts_dst; + ts_src.raw_ptr = ws_dst; + ts_src.layout = src; + ts_dst.raw_ptr = args.dst_tensor->raw_ptr; + ts_dst.layout = dst; + auto&& transpose = + args.opr->handle()->create_operator(); + transpose->exec(ts_src, ts_dst); + } +} + +// vim: syntax=cpp.doxygen diff --git 
a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp new file mode 100644 index 00000000..4fa50b39 --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp @@ -0,0 +1,193 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_imma.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution_helper/bias_visitor.cuh" + +using namespace megdnn; +using namespace cuda; + +#if CUDA_VERSION >= 10000 +bool ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::is_available( + const SizeArgs& args) const { + if (args.bias_layout->ndim <= 0) + return false; + + using Param = param::ConvBias; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + bool available = true; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), + param.format)) + return false; + if (param.format != Format::NCHW4) + return false; + UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + // TODO support group conv + available &= param.sparse == Sparse::DENSE; + // mode must be cross correlation + available &= param.mode == Mode::CROSS_CORRELATION; + // check data type + auto src_dtype = args.src_layout->dtype, + filter_dtype = args.filter_layout->dtype, + bias_dtype = args.bias_layout->dtype, + dst_dtype = args.dst_layout->dtype; + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + bias_dtype.enumv() == DTypeEnum::QuantizedS32 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // check layout + available &= (ci % 16 == 0); + // TODO: support dialtion + available &= dh == 1 && dw == 1; + // only support sm_75 or later, platform should have tensorcore int8 + // support + available &= is_compute_capability_required(7, 5); + return available; +} + +WorkspaceBundle +ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::get_workspace_bundle( + dt_byte* raw_ptr, const SizeArgs& args) const { + size_t ws_size_src = args.src_layout->span().dist_byte(); + size_t ws_size_filter = args.filter_layout->span().dist_byte(); + size_t ws_size_dst = args.dst_layout->span().dist_byte(); + if (args.z_layout->ndim > 0) { + size_t ws_size_z = args.z_layout->span().dist_byte(); + return WorkspaceBundle{ + raw_ptr, {ws_size_src, ws_size_filter, ws_size_dst, ws_size_z}}; + } + return WorkspaceBundle{raw_ptr, {ws_size_src, ws_size_filter, ws_size_dst}}; +} + +size_t +ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::exec( + const ExecArgs& args) const { + using Format = Param::Format; + auto&& param = args.opr->param(); + auto&& fm = args.filter_meta; + UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout), + param); + auto ws = get_workspace_bundle(args.workspace.raw_ptr, args); + auto ws_src = ws.get(0); + auto ws_filter = ws.get(1); + auto ws_dst = 
ws.get(2); + auto&& stream = cuda_stream(args.opr->handle()); + + // reformat src from nchw4 to chwn4 + { + TensorLayout src{{n, ci / 4 * hi * wi}, dtype::Int32()}; + src.init_contiguous_stride(); + TensorLayout dst = src; + dst.stride[0] = 1, dst.stride[1] = dst[0]; + TensorND ts_src, ts_dst; + ts_src.raw_ptr = args.src_tensor->raw_ptr; + ts_src.layout = src; + ts_dst.raw_ptr = ws_src; + ts_dst.layout = dst; + auto&& transpose = + args.opr->handle()->create_operator(); + transpose->exec(ts_src, ts_dst); + } + + // reformat filter from nchw4 to chwn4 + { + TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()}; + src.init_contiguous_stride(); + TensorLayout dst = src; + dst.stride[0] = 1, dst.stride[1] = dst[0]; + TensorND ts_src, ts_dst; + ts_src.raw_ptr = args.filter_tensor->raw_ptr; + ts_src.layout = src; + ts_dst.raw_ptr = ws_filter; + ts_dst.layout = dst; + auto&& transpose = + args.opr->handle()->create_operator(); + transpose->exec(ts_src, ts_dst); + } + + convolution::ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + float src_scale = args.src_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + bias_scale = + args.bias_layout->dtype.param().scale, + dst_scale = args.dst_layout->dtype.param().scale; + float alpha = src_scale * filter_scale / dst_scale, + beta = bias_scale / dst_scale; + + // process z + int8_t* z_dev_ptr = nullptr; + float gamma = 1.f; + if (args.z_layout->ndim > 0) { + auto ws_z = ws.get(3); + + TensorLayout src{{n, co / 4 * ho * wo}, dtype::Int32()}; + src.init_contiguous_stride(); + TensorLayout dst = src; + dst.stride[0] = 1, dst.stride[1] = dst[0]; + TensorND ts_src, ts_dst; + ts_src.raw_ptr = args.z_tensor->raw_ptr; + ts_src.layout = src; + ts_dst.raw_ptr = ws_z; + ts_dst.layout = dst; + auto&& transpose = + args.opr->handle()->create_operator(); + transpose->exec(ts_src, ts_dst); + z_dev_ptr = reinterpret_cast(ws_z); + float z_scale = args.z_layout->dtype.param().scale; + gamma = z_scale / dst_scale; + } + + convolution::PerChannelBiasVisitor bias_visitor; + bias_visitor.bias = args.bias_tensor->compatible_ptr(); + ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::dispatch_nonlinear_mode< + convolution::PerChannelBiasVisitor>( + reinterpret_cast(ws_src), + reinterpret_cast(ws_filter), bias_visitor, z_dev_ptr, + reinterpret_cast(ws_dst), kern_param, alpha, beta, gamma, + dst_scale, stream, param.nonlineMode, m_mma_tile_size); + + // reformat chwn4 to nchw4 + { + TensorLayout src{{co / 4 * ho * wo, n}, dtype::Int32()}; + src.init_contiguous_stride(); + TensorLayout dst = src; + dst.stride[0] = 1, dst.stride[1] = dst[0]; + TensorND ts_src, ts_dst; + ts_src.raw_ptr = ws_dst; + ts_src.layout = src; + ts_dst.raw_ptr = args.dst_tensor->raw_ptr; + ts_dst.layout = dst; + auto&& transpose = + args.opr->handle()->create_operator(); + transpose->exec(ts_src, ts_dst); + } +} +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/inplace_matmul.cpp b/dnn/src/cuda/conv_bias/inplace_matmul.cpp new file mode 100644 index 00000000..b65386eb --- /dev/null +++ b/dnn/src/cuda/conv_bias/inplace_matmul.cpp @@ -0,0 +1,76 @@ +/** + * \file dnn/src/cuda/conv_bias/inplace_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 
2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/conv_bias/matmul/inplace_matmul_impl.cuh" + +using namespace megdnn; +using namespace cuda; + +bool ConvBiasForwardImpl::AlgoInplaceMatmul::is_available( + const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; + + auto&& fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCHW && + args.src_layout->dtype == dtype::Float32() && fm.group == 1 && + fm.spatial_ndim == 2 && fm.dilation[0] == 1 && fm.dilation[1] == 1; +} + +size_t ConvBiasForwardImpl::AlgoInplaceMatmul::get_workspace_in_bytes( + const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + return dst_layout.span().dist_byte(); + } + return 0; +} + +void ConvBiasForwardImpl::AlgoInplaceMatmul::exec(const ExecArgs& args) const { + WorkspaceBundle bundle{args.workspace.raw_ptr, + {get_workspace_in_bytes(args)}}; + auto conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(0); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + + { + auto&& fm = args.filter_meta; + size_t N = args.src_layout->shape[0], IC = fm.icpg, + IH = args.src_layout->shape[2], IW = args.src_layout->shape[3], + OC = fm.ocpg, OH = conv_dst_tensor.layout.shape[2], + OW = conv_dst_tensor.layout.shape[3], FH = fm.spatial[0], + FW = fm.spatial[1]; + auto stream = args.handle->stream(); + conv_bias::exec_inplace_matmul_fwd( + args.src_tensor->ptr(), + args.filter_tensor->ptr(), + conv_dst_tensor.ptr(), N, + args.src_layout->stride[0], conv_dst_tensor.layout.stride[0], + IC, IH, IW, OC, OH, OW, FH, FW, fm.padding[0], fm.padding[1], + fm.stride[0], fm.stride[1], !fm.should_flip, stream); + } + handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4.cuinl b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4.cuinl new file mode 100644 index 00000000..6de410ef --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4.cuinl @@ -0,0 +1,142 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, convolution::ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, convolution::ConvParam, float, float); + kern = nullptr; +#define CHK3_(n_, co_, ci_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = 4; \ + static constexpr int reg_n = (n_ + tx_ - 1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvTrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define CHK3(n_, co_, ci_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = (co_ + ty_ - 1) / (ty_); \ + static constexpr int reg_n = (n_ + tx_ - 1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvTrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define CHK2(n_, co_) \ + CHK3(n_, co_, 4, 16, 8) \ + CHK3(n_, co_, 8, 16, 8) CHK3(n_, co_, 16, 16, 8) +#define CHK(n_) \ + CHK3_(n_, 4, 4, 16, 8) \ + CHK3_(n_, 4, 8, 16, 8) \ + CHK3_(n_, 4, 16, 16, 8) \ + CHK2(n_, 32) \ + CHK2(n_, 64) \ + CHK2(n_, 128) + CHK(1); + CHK(16); + CHK(32); + CHK(64); + CHK(128); +#undef CHK +#undef CHK2 +#undef CHK3 +#undef CHK3_ + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias"); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, float beta, + cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue epilogue, convolution::ConvParam, float, + float); + 
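// A note on the dispatch above: get_kern() walks the CHK*/CHK3* macro ladder
// from the smallest to the largest tile thresholds, and every entry whose
// (n, co, ci) test matches the runtime ConvParam overwrites `kern`, so the
// last (most specialized) match wins. The same entry also fills
// `launch_config`: the thread-block shape, the grid shape (ho * wo blocks
// along x, batch tiles along y, output-channel tiles along z) and the int32
// shared-memory footprint of the chosen ConvTrait. The wrapper below consumes
// those values verbatim when it launches the selected kernel.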
conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit.cuinl b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit.cuinl new file mode 100644 index 00000000..3d56fa8d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit.cuinl @@ -0,0 +1,182 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; +#define CHK3_(n_, co_, ci_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = 4; \ + static constexpr int reg_n = (n_ + tx_ - 1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvTrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define CHK3(n_, co_, ci_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = (co_ + ty_ - 1) / (ty_); \ + static constexpr int reg_n = (n_ + tx_ - 1) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; 
\ + typedef IConvTrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define CHK2(n_, co_) \ + CHK3(n_, co_, 4, 16, 8) \ + CHK3(n_, co_, 8, 16, 8) CHK3(n_, co_, 16, 16, 8) +#define CHK(n_) \ + CHK3_(n_, 4, 4, 16, 8) \ + CHK3_(n_, 4, 8, 16, 8) \ + CHK3_(n_, 4, 16, 16, 8) \ + CHK2(n_, 32) \ + CHK2(n_, 64) \ + CHK2(n_, 128) + CHK(1); + CHK(16); + CHK(32); + CHK(64); + CHK(128); +#undef CHK +#undef CHK2 +#undef CHK3 +#undef CHK3_ +#define CHK3(n_, co_, ci_, tx_, ty_) \ + if (param.n % n_ == 0) { \ + if (param.co % co_ == 0) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = (co_) / (ty_); \ + static constexpr int reg_n = (n_) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvTrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } +#define CHK2(n_, co_) \ + CHK3(n_, co_, 4, 16, 8) \ + CHK3(n_, co_, 8, 16, 8) CHK3(n_, co_, 16, 16, 8) +#define CHK(n_) \ + CHK2(n_, 32) \ + CHK2(n_, 64) \ + CHK2(n_, 128) + CHK(16); + CHK(32); + CHK(64); + CHK(128); + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias"); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git 
a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width.cuinl b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width.cuinl new file mode 100644 index 00000000..348b92ef --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width.cuinl @@ -0,0 +1,222 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; +#define CHK3_(n_, co_, wo_, ci_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + if (param.wo % wo_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = 4; \ + static constexpr int reg_n = (n_ + tx_ - 1) / (tx_); \ + static constexpr int reg_width = wo_; \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvTraitUnrollWidth \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * DIVUP(param.wo, reg_width); \ + launch_config.nr_blocks_y = \ + DIVUP(param.n, \ + ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, ConvTrait::FilterTileCount:: \ + block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } \ + } +#define CHK3(n_, co_, wo_, ci_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + if (param.wo % wo_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = (co_ + ty_ - 1) / (ty_); \ + static constexpr int reg_n = (n_ + tx_ - 1) / (tx_); \ + static constexpr int reg_width = wo_; \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvTraitUnrollWidth \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * DIVUP(param.wo, reg_width); \ + launch_config.nr_blocks_y = \ + DIVUP(param.n, \ + ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, ConvTrait::FilterTileCount:: \ + 
block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } \ + } +#define CHK2(n_, wo_, co_) \ + CHK3(n_, co_, wo_, 4, 16, 8) \ + CHK3(n_, co_, wo_, 8, 16, 8) \ + CHK3(n_, co_, wo_, 16, 16, 8) +#define CHK(n_, wo_) \ + CHK3_(n_, 4, wo_, 4, 16, 8) \ + CHK3_(n_, 4, wo_, 8, 16, 8) \ + CHK3_(n_, 4, wo_, 16, 16, 8) \ + CHK2(n_, wo_, 32) \ + CHK2(n_, wo_, 64) \ + CHK2(n_, wo_, 128) + CHK(1, 2); + CHK(1, 3); + CHK(1, 4); + CHK(1, 8); + CHK(16, 2); + CHK(16, 3); + CHK(16, 4); + CHK(16, 8); + CHK(32, 2); + CHK(32, 3); + CHK(32, 4); + CHK(64, 2); +#undef CHK +#undef CHK2 +#undef CHK3 +#undef CHK3_ +#define CHK3(n_, co_, wo_, ci_, tx_, ty_) \ + if (param.n % n_ == 0) { \ + if (param.co % co_ == 0) { \ + if (param.ci % ci_ == 0) { \ + if (param.wo % wo_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = (co_) / (ty_); \ + static constexpr int reg_n = (n_) / (tx_); \ + static constexpr int reg_width = wo_; \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvTraitUnrollWidth \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * DIVUP(param.wo, reg_width); \ + launch_config.nr_blocks_y = \ + DIVUP(param.n, \ + ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, ConvTrait::FilterTileCount:: \ + block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } \ + } +#define CHK2(n_, wo_, co_) \ + CHK3(n_, co_, wo_, 4, 16, 8) \ + CHK3(n_, co_, wo_, 8, 16, 8) \ + CHK3(n_, co_, wo_, 16, 16, 8) +#define CHK(n_, wo_) \ + CHK2(n_, wo_, 32) \ + CHK2(n_, wo_, 64) \ + CHK2(n_, wo_, 128) + CHK(16, 2); + CHK(16, 3); + CHK(16, 4); + CHK(16, 8); + CHK(32, 2); + CHK(32, 3); + CHK(32, 4); + CHK(64, 2); +#undef CHK +#undef CHK2 +#undef CHK3 + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias"); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git 
a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width.cuinl b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width.cuinl new file mode 100644 index 00000000..a090a145 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width.cuinl @@ -0,0 +1,165 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; +#define CHK3_(n_, co_, wo_, ci_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + if (param.wo % wo_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = 4; \ + static constexpr int reg_n = (n_ + tx_ - 1) / (tx_); \ + static constexpr int reg_width = wo_; \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvTraitUnrollWidth \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * DIVUP(param.wo, reg_width); \ + launch_config.nr_blocks_y = \ + DIVUP(param.n, \ + ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, ConvTrait::FilterTileCount:: \ + block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } \ + } +#define CHK3(n_, co_, wo_, ci_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + if (param.wo % wo_ == 0) { \ + static constexpr int reg_k = (ci_); \ + static constexpr int reg_m = (co_ + ty_ - 1) / (ty_); \ + static constexpr int reg_n = (n_ + tx_ - 1) / (tx_); \ + static constexpr int reg_width = wo_; \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef RegBlockConfig \ + RegBlockConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvTraitUnrollWidth \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * DIVUP(param.wo, reg_width); \ + launch_config.nr_blocks_y = \ + DIVUP(param.n, \ + ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, ConvTrait::FilterTileCount:: \ + block_tile_out_channel); \ + 
launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot); \ + } \ + } \ + } \ + } +#define CHK2(n_, wo_, co_) \ + CHK3(n_, co_, wo_, 4, 16, 8) \ + CHK3(n_, co_, wo_, 8, 16, 8) \ + CHK3(n_, co_, wo_, 16, 16, 8) +#define CHK(n_, wo_) \ + CHK3_(n_, 4, wo_, 4, 16, 8) \ + CHK3_(n_, 4, wo_, 8, 16, 8) \ + CHK3_(n_, 4, wo_, 16, 16, 8) \ + CHK2(n_, wo_, 32) \ + CHK2(n_, wo_, 64) \ + CHK2(n_, wo_, 128) + CHK(1, 2); + CHK(1, 3); + CHK(1, 4); + CHK(1, 8); + CHK(16, 2); + CHK(16, 3); + CHK(16, 4); + CHK(16, 8); + CHK(32, 2); + CHK(32, 3); + CHK(32, 4); + CHK(64, 2); +#undef CHK +#undef CHK2 +#undef CHK3 +#undef CHK3_ + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias"); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_hswish.cu new file mode 100644 index 00000000..7b47bcd2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_id.cu new file mode 100644 index 00000000..fc560d5a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_relu.cu new file mode 100644 index 00000000..db642290 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_hswish.cu new file mode 100644 index 00000000..068eabcc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_id.cu new file mode 100644 index 00000000..78501106 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_relu.cu new file mode 100644 index 00000000..145b1995 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4_ld_64bit_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_hswish.cu new file mode 100644 index 00000000..f5f24f4c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_id.cu new file mode 100644 index 00000000..b1ecb15c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_relu.cu new file mode 100644 index 00000000..73edc196 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_hswish.cu new file mode 100644 index 00000000..5b2d22c0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_id.cu new file mode 100644 index 00000000..66999930 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_relu.cu new file mode 100644 index 00000000..63d0b930 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4.cuinl b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4.cuinl new file mode 100644 index 00000000..e19e9789 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4.cuinl @@ -0,0 +1,169 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + static constexpr int wmma_m = 16; + static constexpr int wmma_n = 16; + static constexpr int wmma_k = 16; +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n >= _n) { \ + if (param.co >= _co) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = \ + ((_co) + warp_y * wmma_m - 1) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_n) + warp_x * wmma_n - 1) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 1) \ + CHK2(_n, 32) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(1); + CHK(32); + CHK(64); + CHK(128); +#undef CHK3 +#undef 
CHK2 +#undef CHK +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n % _n == 0) { \ + if (param.co % _co == 0) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_n) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 32) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(32); + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias (n,co,ci)=(%d,%d,%d)", + param.n, param.co, param.ci); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter.cuinl b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter.cuinl new file mode 100644 index 00000000..e44dc571 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter.cuinl @@ -0,0 +1,169 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter.cuinl + * MegEngine is Licensed under the Apache License, 
Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + static constexpr int wmma_m = 16; + static constexpr int wmma_n = 16; + static constexpr int wmma_k = 16; +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n >= _n) { \ + if (param.co >= _co) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = \ + ((_co) + warp_y * wmma_m - 1) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_n) + warp_x * wmma_n - 1) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitReorderFilter< \ + true, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 1) \ + CHK2(_n, 32) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(1); + CHK(32); + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n % _n == 0) { \ + if (param.co % _co == 0) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_n) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitReorderFilter< \ + false, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = 
ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 32) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(32); + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias (n,co,ci)=(%d,%d,%d)", + param.n, param.co, param.ci); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width.cuinl b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width.cuinl new file mode 100644 index 00000000..1ec9c194 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width.cuinl @@ -0,0 +1,372 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + static constexpr int wmma_m = 16; + static constexpr int wmma_n = 16; + static constexpr int wmma_k = 16; + +// common defs +#define DISPATCH_ODD(cb) \ + cb(1); \ + cb(3); \ + cb(5); \ + cb(7); +#define DISPATCH_EVEN(cb) \ + cb(2); \ + cb(4); \ + cb(6); \ + cb(8); +#define DISPATCH_BLOCK(cb1, cb2, cb3, cb4) \ + DISPATCH_ODD(cb1); \ + DISPATCH_EVEN(cb2); \ + if (param.n % wmma_n == 0) { \ + DISPATCH_ODD(cb3); \ + DISPATCH_EVEN(cb4); \ + } + if (param.fw == 1) { +#define DISPATCH_CHK(_wo, _co, _ci, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co >= _co) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / (wmma_k); \ + static constexpr int warp_tile_m = \ + ((_co) + (warp_y * wmma_m - 1)) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_wo) + warp_x - 1) / (warp_x); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidth< \ + true, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_NOCHK(_wo, _co, _ci, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co % _co == 0) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / (wmma_k); \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_wo) / (warp_x); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidth< \ + false, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + 
ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +// dispatch block for fw = 3 +#define DISPATCH_CHK14(_wo, _co) \ + DISPATCH_CHK(_wo, _co, 16, 1, 4) \ + DISPATCH_CHK(_wo, _co, 32, 1, 4) DISPATCH_CHK(_wo, _co, 64, 1, 4) +#define DISPATCH_CHK22(_wo, _co) \ + DISPATCH_CHK(_wo, _co, 16, 2, 2) \ + DISPATCH_CHK(_wo, _co, 32, 2, 2) DISPATCH_CHK(_wo, _co, 64, 2, 2) +#define DISPATCH_NOCHK14(_wo, _co) \ + DISPATCH_NOCHK(_wo, _co, 16, 1, 4) \ + DISPATCH_NOCHK(_wo, _co, 32, 1, 4) DISPATCH_NOCHK(_wo, _co, 64, 1, 4) +#define DISPATCH_NOCHK22(_wo, _co) \ + DISPATCH_NOCHK(_wo, _co, 16, 2, 2) \ + DISPATCH_NOCHK(_wo, _co, 32, 2, 2) DISPATCH_NOCHK(_wo, _co, 64, 2, 2) +#define cb1(_wo) \ + DISPATCH_CHK14(_wo, 1) \ + DISPATCH_CHK14(_wo, 64) \ + DISPATCH_CHK14(_wo, 128) +#define cb2(_wo) \ + DISPATCH_CHK22(_wo, 1) \ + DISPATCH_CHK22(_wo, 32) \ + DISPATCH_CHK22(_wo, 64) \ + DISPATCH_CHK22(_wo, 128) +#define cb3(_wo) \ + DISPATCH_NOCHK14(_wo, 64) \ + DISPATCH_NOCHK14(_wo, 128) +#define cb4(_wo) \ + DISPATCH_NOCHK22(_wo, 32) \ + DISPATCH_NOCHK22(_wo, 64) \ + DISPATCH_NOCHK22(_wo, 128) + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); +#undef DISPATCH_CHK14 +#undef DISPATCH_CHK22 +#undef DISPATCH_NOCHK14 +#undef DISPATCH_NOCHK22 + } else if (param.fw == 3 && param.sw == 1) { +#undef cb1 +#undef cb2 +#undef cb3 +#undef cb4 +#undef DISPATCH_CHK +#undef DISPATCH_NOCHK +#define DISPATCH_CHK(_wo, _co, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co >= _co) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = 1; \ + static constexpr int warp_tile_m = \ + ((_co) + (warp_y * wmma_m - 1)) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_wo) + warp_x - 1) / (warp_x); \ + typedef Conv1dConfig Conv1dConfig; \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidthV2 \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } +#define DISPATCH_NOCHK(_wo, _co, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co % _co == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k 
= 1; \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_wo) / (warp_x); \ + typedef Conv1dConfig Conv1dConfig; \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidthV2 \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } +// dispatch block for fw = 3 +#define cb1(_wo) \ + DISPATCH_CHK(_wo, 1, 1, 4) \ + DISPATCH_CHK(_wo, 64, 1, 4) \ + DISPATCH_CHK(_wo, 128, 1, 4) +#define cb2(_wo) \ + DISPATCH_CHK(_wo, 1, 2, 2) \ + DISPATCH_CHK(_wo, 32, 2, 2) \ + DISPATCH_CHK(_wo, 64, 2, 2) \ + DISPATCH_CHK(_wo, 128, 2, 2) +#define cb3(_wo) \ + DISPATCH_NOCHK(_wo, 64, 1, 4) \ + DISPATCH_NOCHK(_wo, 128, 1, 4) +#define cb4(_wo) \ + DISPATCH_NOCHK(_wo, 32, 2, 2) \ + DISPATCH_NOCHK(_wo, 64, 2, 2) \ + DISPATCH_NOCHK(_wo, 128, 2, 2) + static constexpr int fw = 3; + static constexpr int sw = 1; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + if (param.n % wmma_n == 0 && param.co == 16) { +#define DISPATCH(_wo) DISPATCH_NOCHK(_wo, 16, 4, 1) + DISPATCH(4); + DISPATCH(8); + DISPATCH(12); + DISPATCH(16); +#undef DISPATCH + } + } else if (param.fw == 3 && param.sw == 2) { + static constexpr int fw = 3; + static constexpr int sw = 2; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + if (param.n % wmma_n == 0 && param.co == 16) { +#define DISPATCH(_wo) DISPATCH_NOCHK(_wo, 16, 4, 1) + DISPATCH(4); + DISPATCH(8); + DISPATCH(12); + DISPATCH(16); +#undef DISPATCH + } + } else if (param.fw == 5 && param.sw == 1) { +#undef cb1 +#undef cb2 +#undef cb3 +#undef cb4 +// dispatch block for fw = 5, 7 +#define cb1(_wo) \ + DISPATCH_CHK(_wo, 1, 1, 8) \ + DISPATCH_CHK(_wo, 128, 1, 8) +#define cb2(_wo) \ + DISPATCH_CHK(_wo, 1, 2, 4) \ + DISPATCH_CHK(_wo, 64, 2, 4) \ + DISPATCH_CHK(_wo, 128, 2, 4) +#define cb3(_wo) DISPATCH_NOCHK(_wo, 128, 1, 8) +#define cb4(_wo) \ + DISPATCH_NOCHK(_wo, 64, 2, 4) \ + DISPATCH_NOCHK(_wo, 128, 2, 4) + static constexpr int fw = 5; + static constexpr int sw = 1; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 5 && param.sw == 2) { + static constexpr int fw = 5; + static constexpr int sw = 2; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 7 && param.sw == 1) { + static constexpr int fw = 7; + static constexpr int sw = 1; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 7 && param.sw == 2) { + static constexpr int fw = 7; + static constexpr int sw = 2; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias (fw,sw,n,co,ci)=(%d,%d,%d,%d,%d)", + param.fw, param.sw, param.n, param.co, param.ci); +#undef cb1 +#undef cb2 +#undef cb3 +#undef cb4 +#undef DISPATCH_BLOCK +#undef DISPATCH_CHK +#undef DISPATCH_NOCHK +#undef DISPATCH_ODD +#undef DISPATCH_EVEN + return kern; +} +} // namespace + 
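Note on the dispatch above: every DISPATCH_CHK/DISPATCH_NOCHK expansion fixes warp_x, warp_y and the warp_tile_* counts as compile-time constants feeding the WarpTileConfig/ThreadConfig typedefs, so get_kern only selects between pre-built kernel instantiations. Two arithmetic idioms recur in every branch: ceil division to size the per-warp tile counts from the _co/_wo hints, and DIVUP to size the grid from the runtime ConvParam. A standalone sketch of that arithmetic, with illustrative numbers rather than the real ConvTrait constants:

// Standalone sketch of the tile/grid arithmetic used by the dispatch macros.
// The concrete constants here are illustrative, not the real ConvTrait values.
#include <cstdio>

#define DIVUP(x, y) (((x) + (y) - 1) / (y))

int main() {
    constexpr int wmma_m = 16, warp_y = 4;   // IMMA fragment rows, warps along y
    constexpr int co_hint = 128;             // the _co argument of DISPATCH_CHK
    // Ceil division: m-fragments per warp needed to cover co_hint channels.
    constexpr int warp_tile_m =
            (co_hint + warp_y * wmma_m - 1) / (warp_y * wmma_m);   // == 2
    // One thread block then covers this many output channels ...
    constexpr int block_tile_out_channel = warp_y * wmma_m * warp_tile_m;
    // ... and the grid z-dimension covers the actual runtime channel count.
    int param_co = 192;                      // runtime value from ConvParam
    int nr_blocks_z = DIVUP(param_co, block_tile_out_channel);
    std::printf("warp_tile_m=%d block_tile_out_channel=%d nr_blocks_z=%d\n",
                warp_tile_m, block_tile_out_channel, nr_blocks_z);
    return 0;
}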
+template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4.cuinl b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4.cuinl new file mode 100644 index 00000000..369817bd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4.cuinl @@ -0,0 +1,169 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
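The host-side launcher above (and the identical ones that follow for the other tile shapes) always performs the same three steps: obtain a kernel pointer plus LaunchConfig from get_kern, request the shared-memory-preferring cache split and 8-byte shared-memory banks for that kernel, and launch it with the dynamic shared-memory size in the execution configuration (the real launchers also pass the caller's cudaStream_t). A minimal self-contained sketch of that pattern, using a placeholder kernel instead of convolution_kernel:

// Minimal sketch of the launch recipe used by the do_conv_bias_* launchers.
// dummy_kernel is a placeholder, not one of the MegEngine kernels.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummy_kernel(int* out) {
    extern __shared__ int smem[];            // dynamic shared memory
    smem[threadIdx.x] = threadIdx.x;
    __syncthreads();
    out[threadIdx.x] = smem[threadIdx.x];
}

int main() {
    int* d_out = nullptr;
    cudaMalloc(&d_out, 128 * sizeof(int));
    void (*kern)(int*) = dummy_kernel;       // kernel chosen at runtime
    cudaFuncSetCacheConfig(reinterpret_cast<const void*>(kern),
                           cudaFuncCachePreferShared);
    cudaFuncSetSharedMemConfig(reinterpret_cast<const void*>(kern),
                               cudaSharedMemBankSizeEightByte);
    dim3 block_size{128, 1, 1}, grid_size{1, 1, 1};
    size_t smem_size_in_bytes = 128 * sizeof(int);
    kern<<<grid_size, block_size, smem_size_in_bytes>>>(d_out);
    cudaDeviceSynchronize();
    std::printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaFree(d_out);
    return 0;
}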
+ */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + static constexpr int wmma_m = 32; + static constexpr int wmma_n = 8; + static constexpr int wmma_k = 16; +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n >= _n) { \ + if (param.co >= _co) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = \ + ((_co) + warp_y * wmma_m - 1) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_n) + warp_x * wmma_n - 1) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 1) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(1); + CHK(16); + CHK(32); + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n % _n == 0) { \ + if (param.co % _co == 0) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_n) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + 
ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(16); + CHK(32); + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias (n,co,ci)=(%d,%d,%d)", + param.n, param.co, param.ci); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter.cuinl b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter.cuinl new file mode 100644 index 00000000..a5e2cedd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter.cuinl @@ -0,0 +1,169 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
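The CHK3/CHK2/CHK ladders in the 32x8x16 dispatcher above are evaluated in two passes: the first pass treats the n/co hints as lower bounds (param.n >= _n, param.co >= _co) and selects a bounds-checked kernel, while the second pass redefines CHK3 with exact divisibility tests and overwrites kern when it matches, so the most specific candidate evaluated last wins. A simplified, hypothetical model of that selection order:

// Simplified model of the CHK/NOCHK two-pass selection (not the real dispatcher):
// every matching candidate overwrites the previous pick, so later, more exact
// matches replace the earlier bounds-checked choice.
#include <cstdio>

struct Choice { int n, co, ci; bool bounds_checked; };

Choice select(int n, int co, int ci) {
    Choice picked{0, 0, 0, true};
    auto try_chk = [&](int tn, int tco, int tci) {   // pass 1: >= hints, %ci
        if (n >= tn && co >= tco && ci % tci == 0) picked = {tn, tco, tci, true};
    };
    auto try_nochk = [&](int tn, int tco, int tci) { // pass 2: exact divisibility
        if (n % tn == 0 && co % tco == 0 && ci % tci == 0)
            picked = {tn, tco, tci, false};
    };
    try_chk(1, 1, 16);   try_chk(64, 64, 32);   try_chk(128, 128, 64);
    try_nochk(64, 64, 32);   try_nochk(128, 128, 64);
    return picked;
}

int main() {
    Choice c = select(/*n=*/128, /*co=*/128, /*ci=*/64);
    std::printf("tile n=%d co=%d ci=%d bounds_checked=%d\n",
                c.n, c.co, c.ci, c.bounds_checked);
    return 0;
}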
+ */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + static constexpr int wmma_m = 32; + static constexpr int wmma_n = 8; + static constexpr int wmma_k = 16; +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n >= _n) { \ + if (param.co >= _co) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = \ + ((_co) + warp_y * wmma_m - 1) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_n) + warp_x * wmma_n - 1) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitReorderFilter< \ + true, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 1) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(1); + CHK(16); + CHK(32); + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n % _n == 0) { \ + if (param.co % _co == 0) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_n) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitReorderFilter< \ + false, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + 
launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(16); + CHK(32); + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias (n,co,ci)=(%d,%d,%d)", + param.n, param.co, param.ci); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width.cuinl b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width.cuinl new file mode 100644 index 00000000..8ae743cb --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width.cuinl @@ -0,0 +1,360 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
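For the *_unroll_width dispatchers the grid's x dimension is no longer param.ho * param.wo: each block covers a strip of block_tile_out_width output columns, both DISPATCH_CHK and DISPATCH_NOCHK first require param.wo to be a multiple of the candidate strip width, and nr_blocks_x becomes param.ho * DIVUP(param.wo, that width). A small sketch of that grid computation with made-up widths:

// Hypothetical widths; block_tile_out_width really comes from
// ConvTrait::DataTileCount in the dispatchers above.
#include <cstdio>
#include <initializer_list>

#define DIVUP(x, y) (((x) + (y) - 1) / (y))

int main() {
    int ho = 28, wo = 28;                    // output spatial size
    for (int block_tile_out_width : {1, 2, 4, 7}) {
        int nr_blocks_x = ho * DIVUP(wo, block_tile_out_width);
        bool usable = (wo % block_tile_out_width == 0);  // required by both macros
        std::printf("wo_tile=%d nr_blocks_x=%d usable=%d\n",
                    block_tile_out_width, nr_blocks_x, usable);
    }
    return 0;
}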
+ */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + static constexpr int wmma_m = 32; + static constexpr int wmma_n = 8; + static constexpr int wmma_k = 16; + +// common defs +#define DISPATCH_ODD(cb) \ + cb(1); \ + cb(3); \ + cb(5); \ + cb(7); \ + cb(15); +#define DISPATCH_EVEN(cb) \ + cb(2); \ + cb(4); \ + cb(6); \ + cb(8); \ + cb(16); +#define DISPATCH_BLOCK(cb1, cb2, cb3, cb4) \ + DISPATCH_ODD(cb1); \ + DISPATCH_EVEN(cb2); \ + if (param.n % wmma_n == 0) { \ + DISPATCH_ODD(cb3); \ + DISPATCH_EVEN(cb4); \ + } + if (param.fw == 1) { +#define DISPATCH_CHK(_wo, _co, _ci, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co >= _co) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / (wmma_k); \ + static constexpr int warp_tile_m = \ + ((_co) + (warp_y * wmma_m - 1)) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_wo) + warp_x - 1) / (warp_x); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidth< \ + true, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_NOCHK(_wo, _co, _ci, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co % _co == 0) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / (wmma_k); \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_wo) / (warp_x); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidth< \ + false, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ 
+ ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +// dispatch block for fw = 3 +#define DISPATCH_CHK14(_wo, _co) \ + DISPATCH_CHK(_wo, _co, 16, 1, 4) \ + DISPATCH_CHK(_wo, _co, 32, 1, 4) DISPATCH_CHK(_wo, _co, 64, 1, 4) +#define DISPATCH_CHK22(_wo, _co) \ + DISPATCH_CHK(_wo, _co, 16, 2, 2) \ + DISPATCH_CHK(_wo, _co, 32, 2, 2) DISPATCH_CHK(_wo, _co, 64, 2, 2) +#define DISPATCH_NOCHK14(_wo, _co) \ + DISPATCH_NOCHK(_wo, _co, 16, 1, 4) \ + DISPATCH_NOCHK(_wo, _co, 32, 1, 4) DISPATCH_NOCHK(_wo, _co, 64, 1, 4) +#define DISPATCH_NOCHK22(_wo, _co) \ + DISPATCH_NOCHK(_wo, _co, 16, 2, 2) \ + DISPATCH_NOCHK(_wo, _co, 32, 2, 2) DISPATCH_NOCHK(_wo, _co, 64, 2, 2) +#define cb1(_wo) \ + DISPATCH_CHK14(_wo, 1) \ + DISPATCH_CHK14(_wo, 128) +#define cb2(_wo) \ + DISPATCH_CHK22(_wo, 1) \ + DISPATCH_CHK22(_wo, 64) \ + DISPATCH_CHK22(_wo, 128) +#define cb3(_wo) DISPATCH_NOCHK14(_wo, 128) +#define cb4(_wo) \ + DISPATCH_NOCHK22(_wo, 64) \ + DISPATCH_NOCHK22(_wo, 128) + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); +#undef DISPATCH_CHK14 +#undef DISPATCH_CHK22 +#undef DISPATCH_NOCHK14 +#undef DISPATCH_NOCHK22 + } else if (param.fw == 3 && param.sw == 1) { +#undef cb1 +#undef cb2 +#undef cb3 +#undef cb4 +#undef DISPATCH_CHK +#undef DISPATCH_NOCHK +#define DISPATCH_CHK(_wo, _co, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co >= _co) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = 1; \ + static constexpr int warp_tile_m = \ + ((_co) + (warp_y * wmma_m - 1)) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_wo) + warp_x - 1) / (warp_x); \ + typedef Conv1dConfig Conv1dConfig; \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidthV2 \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } +#define DISPATCH_NOCHK(_wo, _co, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co % _co == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = 1; \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_wo) / 
(warp_x); \ + typedef Conv1dConfig Conv1dConfig; \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidthV2 \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } +// dispatch block for fw = 3 +#define cb1(_wo) \ + DISPATCH_CHK(_wo, 1, 1, 4) \ + DISPATCH_CHK(_wo, 128, 1, 4) +#define cb2(_wo) \ + DISPATCH_CHK(_wo, 1, 2, 2) \ + DISPATCH_CHK(_wo, 64, 2, 2) \ + DISPATCH_CHK(_wo, 128, 2, 2) +#define cb3(_wo) DISPATCH_NOCHK(_wo, 128, 1, 4) +#define cb4(_wo) \ + DISPATCH_NOCHK(_wo, 64, 2, 2) \ + DISPATCH_NOCHK(_wo, 128, 2, 2) + static constexpr int fw = 3; + static constexpr int sw = 1; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 3 && param.sw == 2) { + static constexpr int fw = 3; + static constexpr int sw = 2; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 5 && param.sw == 1) { +#undef cb1 +#undef cb2 +#undef cb3 +#undef cb4 +#undef DISPATCH_ODD +#undef DISPATCH_EVEN +#define DISPATCH_ODD(cb) \ + cb(1); \ + cb(3); \ + cb(5); \ + cb(7); +#define DISPATCH_EVEN(cb) \ + cb(2); \ + cb(4); \ + cb(6); \ + cb(8); +// dispatch block for fw = 5, 7 +#define cb1(_wo) \ + DISPATCH_CHK(_wo, 1, 1, 4) \ + DISPATCH_CHK(_wo, 128, 1, 4) +#define cb2(_wo) \ + DISPATCH_CHK(_wo, 1, 2, 2) \ + DISPATCH_CHK(_wo, 64, 2, 2) \ + DISPATCH_CHK(_wo, 128, 2, 2) +#define cb3(_wo) DISPATCH_NOCHK(_wo, 128, 1, 4) +#define cb4(_wo) \ + DISPATCH_NOCHK(_wo, 64, 2, 2) \ + DISPATCH_NOCHK(_wo, 128, 2, 2) + static constexpr int fw = 5; + static constexpr int sw = 1; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 5 && param.sw == 2) { + static constexpr int fw = 5; + static constexpr int sw = 2; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 7 && param.sw == 1) { + static constexpr int fw = 7; + static constexpr int sw = 1; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 7 && param.sw == 2) { + static constexpr int fw = 7; + static constexpr int sw = 2; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias (fw,sw,n,co,ci)=(%d,%d,%d,%d,%d)", + param.fw, param.sw, param.n, param.co, param.ci); +#undef cb1 +#undef cb2 +#undef cb3 +#undef cb4 +#undef DISPATCH_BLOCK +#undef DISPATCH_CHK +#undef DISPATCH_NOCHK +#undef DISPATCH_ODD +#undef DISPATCH_EVEN + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig 
launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4.cuinl b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4.cuinl new file mode 100644 index 00000000..869d4cd4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4.cuinl @@ -0,0 +1,169 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + static constexpr int wmma_m = 8; + static constexpr int wmma_n = 32; + static constexpr int wmma_k = 16; +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n >= _n) { \ + if (param.co >= _co) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = \ + ((_co) + warp_y * wmma_m - 1) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_n) + warp_x * wmma_n - 1) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + 
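Every branch sizes dynamic shared memory the same way: sizeof(int32_t) times the sum of the data-tile, filter-tile and global-memory-store staging areas reported by the ConvTrait counters. The sketch below checks such a budget against the per-block limit of the current device; the three smem_tot values are invented placeholders, not the real counters:

// Shared-memory budget check; the *_smem_tot word counts are placeholders.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
    // Stand-ins for ConvTrait::{Data,Filter}TileCount::smem_tot and
    // ConvTrait::GlobalMemoryStoreCount::smem_tot (counted in int32 words).
    const uint32_t data_smem_tot = 2048, filter_smem_tot = 2048, store_smem_tot = 1024;
    const uint32_t smem_size_in_bytes =
            sizeof(int32_t) * (data_smem_tot + filter_smem_tot + store_smem_tot);

    int dev = 0, limit = 0;
    cudaGetDevice(&dev);
    cudaDeviceGetAttribute(&limit, cudaDevAttrMaxSharedMemoryPerBlock, dev);
    std::printf("requested %u bytes, per-block limit %d bytes -> %s\n",
                smem_size_in_bytes, limit,
                smem_size_in_bytes <= static_cast<uint32_t>(limit) ? "ok" : "too large");
    return 0;
}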
ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 1) \ + CHK2(_n, 16) \ + CHK2(_n, 32) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(1); + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n % _n == 0) { \ + if (param.co % _co == 0) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_n) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATrait \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 16) \ + CHK2(_n, 32) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias (n,co,ci)=(%d,%d,%d)", + param.n, param.co, param.ci); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter.cuinl b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter.cuinl 
new file mode 100644 index 00000000..6a8219dd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter.cuinl @@ -0,0 +1,169 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + static constexpr int wmma_m = 8; + static constexpr int wmma_n = 32; + static constexpr int wmma_k = 16; +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n >= _n) { \ + if (param.co >= _co) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = \ + ((_co) + warp_y * wmma_m - 1) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_n) + warp_x * wmma_n - 1) / (warp_x * wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitReorderFilter< \ + true, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 1) \ + CHK2(_n, 16) \ + CHK2(_n, 32) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(1); + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK +#define CHK3(_n, _co, _ci, _warp_x, _warp_y) \ + if (param.n % _n == 0) { \ + if (param.co % _co == 0) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / wmma_k; \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_n) / (warp_x * 
wmma_n); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitReorderFilter< \ + false, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.ho * param.wo; \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define CHK2(_n, _co) \ + CHK3(_n, _co, 16, 2, 2) CHK3(_n, _co, 32, 2, 2) CHK3(_n, _co, 64, 2, 2) +#define CHK(_n) \ + CHK2(_n, 16) \ + CHK2(_n, 32) \ + CHK2(_n, 64) \ + CHK2(_n, 128) + CHK(64); + CHK(128); +#undef CHK3 +#undef CHK2 +#undef CHK + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias (n,co,ci)=(%d,%d,%d)", + param.n, param.co, param.ci); + return kern; +} +} // namespace + +template +void megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width.cuinl b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width.cuinl new file mode 100644 index 00000000..a05af736 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width.cuinl @@ -0,0 +1,364 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
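The three .cuinl families in this diff differ mainly in the IMMA fragment shape: (wmma_m, wmma_n, wmma_k) is 16x16x16, 32x8x16 or 8x32x16, and in the macros above the m dimension is matched against output channels (_co) while the n dimension is matched against the batch (_n). A host-side sketch of how the shape choice changes the number of warp fragments needed for a given tile, with illustrative sizes:

// Fragments needed to cover a (co x n) tile under each int8 IMMA shape used here.
// Pure host arithmetic; the tile sizes are illustrative.
#include <cstdio>

#define DIVUP(x, y) (((x) + (y) - 1) / (y))

int main() {
    const int shapes[3][3] = {{16, 16, 16}, {32, 8, 16}, {8, 32, 16}};
    const int co = 64, n = 16;   // output channels x batch handled per tile
    for (const auto& s : shapes) {
        int frags = DIVUP(co, s[0]) * DIVUP(n, s[1]);   // m covers co, n covers batch
        std::printf("wmma %2dx%2dx%2d -> %d fragments\n", s[0], s[1], s[2], frags);
    }
    return 0;
}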
+ */ +#include "src/cuda/conv_bias/conv_bias_int8.cuh" +#include "src/cuda/convolution_helper/kernel.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +template +void (*get_kern(const ConvParam& param, + conv_bias_int8::LaunchConfig& launch_config))( + const int8_t* __restrict__, const int8_t* __restrict__, BiasVisitor, + Epilogue, ConvParam, float, float) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + kern = nullptr; + static constexpr int wmma_m = 8; + static constexpr int wmma_n = 32; + static constexpr int wmma_k = 16; +// common defs +#define DISPATCH_ODD(cb) \ + cb(1); \ + cb(3); +#define DISPATCH_EVEN(cb) \ + cb(2); \ + cb(4); +#define DISPATCH_BLOCK(cb1, cb2, cb3, cb4) \ + DISPATCH_ODD(cb1); \ + DISPATCH_EVEN(cb2); \ + if (param.n % wmma_n == 0) { \ + DISPATCH_ODD(cb3); \ + DISPATCH_EVEN(cb4); \ + } + if (param.fw == 1) { +#define DISPATCH_CHK(_wo, _co, _ci, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co >= _co) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / (wmma_k); \ + static constexpr int warp_tile_m = \ + ((_co) + (warp_y * wmma_m - 1)) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_wo) + warp_x - 1) / (warp_x); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidth< \ + true, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +#define DISPATCH_NOCHK(_wo, _co, _ci, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co % _co == 0) { \ + if (param.ci % _ci == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = (_ci) / (wmma_k); \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_wo) / (warp_x); \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidth< \ + false, IMMAConfig, WarpTileConfig, ThreadConfig> \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + 
launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = DIVUP( \ + param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } \ + } +// dispatch block for fw = 3 +#define DISPATCH_CHK14(_wo, _co) \ + DISPATCH_CHK(_wo, _co, 16, 1, 4) \ + DISPATCH_CHK(_wo, _co, 32, 1, 4) DISPATCH_CHK(_wo, _co, 64, 1, 4) +#define DISPATCH_CHK22(_wo, _co) \ + DISPATCH_CHK(_wo, _co, 16, 2, 2) \ + DISPATCH_CHK(_wo, _co, 32, 2, 2) DISPATCH_CHK(_wo, _co, 64, 2, 2) +#define DISPATCH_NOCHK14(_wo, _co) \ + DISPATCH_NOCHK(_wo, _co, 16, 1, 4) \ + DISPATCH_NOCHK(_wo, _co, 32, 1, 4) DISPATCH_NOCHK(_wo, _co, 64, 1, 4) +#define DISPATCH_NOCHK22(_wo, _co) \ + DISPATCH_NOCHK(_wo, _co, 16, 2, 2) \ + DISPATCH_NOCHK(_wo, _co, 32, 2, 2) DISPATCH_NOCHK(_wo, _co, 64, 2, 2) +#define cb1(_wo) \ + DISPATCH_CHK14(_wo, 1) \ + DISPATCH_CHK14(_wo, 32) \ + DISPATCH_CHK14(_wo, 64) \ + DISPATCH_CHK14(_wo, 128) +#define cb2(_wo) \ + DISPATCH_CHK22(_wo, 1) \ + DISPATCH_CHK22(_wo, 16) \ + DISPATCH_CHK22(_wo, 32) \ + DISPATCH_CHK22(_wo, 64) \ + DISPATCH_CHK22(_wo, 128) +#define cb3(_wo) \ + DISPATCH_NOCHK14(_wo, 32) \ + DISPATCH_NOCHK14(_wo, 64) \ + DISPATCH_NOCHK14(_wo, 128) +#define cb4(_wo) \ + DISPATCH_NOCHK22(_wo, 16) \ + DISPATCH_NOCHK22(_wo, 32) \ + DISPATCH_NOCHK22(_wo, 64) \ + DISPATCH_NOCHK22(_wo, 128) + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); +#undef DISPATCH_CHK14 +#undef DISPATCH_CHK22 +#undef DISPATCH_NOCHK14 +#undef DISPATCH_NOCHK22 + } else if (param.fw == 3 && param.sw == 1) { +#undef cb1 +#undef cb2 +#undef cb3 +#undef cb4 +#undef DISPATCH_CHK +#undef DISPATCH_NOCHK +#define DISPATCH_CHK(_wo, _co, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co >= _co) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = 1; \ + static constexpr int warp_tile_m = \ + ((_co) + (warp_y * wmma_m - 1)) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = \ + ((_wo) + warp_x - 1) / (warp_x); \ + typedef Conv1dConfig Conv1dConfig; \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidthV2 \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } +#define DISPATCH_NOCHK(_wo, _co, _warp_x, _warp_y) \ + if (param.wo % _wo == 0) { \ + if (param.co % _co == 0) { \ + static constexpr int warp_x = _warp_x; \ + static constexpr int warp_y = _warp_y; \ + static constexpr int thread_x = warp_x * WARP_SIZE; \ + static 
constexpr int thread_y = warp_y; \ + static constexpr int warp_tile_k = 1; \ + static constexpr int warp_tile_m = (_co) / (warp_y * wmma_m); \ + static constexpr int warp_tile_n = (_wo) / (warp_x); \ + typedef Conv1dConfig Conv1dConfig; \ + typedef IMMAConfig IMMAConfig; \ + typedef WarpTileConfig \ + WarpTileConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef IConvIMMATraitUnrollWidthV2 \ + ConvTrait; \ + kern = convolution_kernel; \ + launch_config.nr_threads_x = ThreadConfig::nr_thread_x; \ + launch_config.nr_threads_y = ThreadConfig::nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.ho * \ + DIVUP(param.wo, \ + ConvTrait::DataTileCount::block_tile_out_width); \ + launch_config.nr_blocks_y = DIVUP( \ + param.n, ConvTrait::DataTileCount::block_tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, \ + ConvTrait::FilterTileCount::block_tile_out_channel); \ + launch_config.smem_size_in_bytes = \ + sizeof(int32_t) * \ + (ConvTrait::DataTileCount::smem_tot + \ + ConvTrait::FilterTileCount::smem_tot + \ + ConvTrait::GlobalMemoryStoreCount::smem_tot); \ + } \ + } +// dispatch block for fw = 3 +#define cb1(_wo) \ + DISPATCH_CHK(_wo, 1, 1, 4) \ + DISPATCH_CHK(_wo, 32, 1, 4) \ + DISPATCH_CHK(_wo, 64, 1, 4) \ + DISPATCH_CHK(_wo, 128, 1, 4) +#define cb2(_wo) \ + DISPATCH_CHK(_wo, 1, 2, 2) \ + DISPATCH_CHK(_wo, 16, 2, 2) \ + DISPATCH_CHK(_wo, 32, 2, 2) \ + DISPATCH_CHK(_wo, 64, 2, 2) \ + DISPATCH_CHK(_wo, 128, 2, 2) +#define cb3(_wo) \ + DISPATCH_NOCHK(_wo, 32, 1, 4) \ + DISPATCH_NOCHK(_wo, 64, 1, 4) \ + DISPATCH_NOCHK(_wo, 128, 1, 4) +#define cb4(_wo) \ + DISPATCH_NOCHK(_wo, 16, 2, 2) \ + DISPATCH_NOCHK(_wo, 32, 2, 2) \ + DISPATCH_NOCHK(_wo, 64, 2, 2) \ + DISPATCH_NOCHK(_wo, 128, 2, 2) + static constexpr int fw = 3; + static constexpr int sw = 1; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 3 && param.sw == 2) { + static constexpr int fw = 3; + static constexpr int sw = 2; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 5 && param.sw == 1) { +#undef cb1 +#undef cb2 +#undef cb3 +#undef cb4 +// dispatch block for fw = 5, 7 +#define cb1(_wo) \ + DISPATCH_CHK(_wo, 1, 1, 8) \ + DISPATCH_CHK(_wo, 64, 1, 8) \ + DISPATCH_CHK(_wo, 128, 1, 8) +#define cb2(_wo) \ + DISPATCH_CHK(_wo, 1, 2, 4) \ + DISPATCH_CHK(_wo, 32, 2, 4) \ + DISPATCH_CHK(_wo, 64, 2, 4) \ + DISPATCH_CHK(_wo, 128, 2, 4) +#define cb3(_wo) \ + DISPATCH_NOCHK(_wo, 64, 1, 8) \ + DISPATCH_NOCHK(_wo, 128, 1, 8) +#define cb4(_wo) \ + DISPATCH_NOCHK(_wo, 32, 2, 4) \ + DISPATCH_NOCHK(_wo, 64, 2, 4) \ + DISPATCH_NOCHK(_wo, 128, 2, 4) + static constexpr int fw = 5; + static constexpr int sw = 1; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 5 && param.sw == 2) { + static constexpr int fw = 5; + static constexpr int sw = 2; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 7 && param.sw == 1) { + static constexpr int fw = 7; + static constexpr int sw = 1; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } else if (param.fw == 7 && param.sw == 2) { + static constexpr int fw = 7; + static constexpr int sw = 2; + DISPATCH_BLOCK(cb1, cb2, cb3, cb4); + } + megdnn_assert(kern != nullptr, + "no usable kernel implementation for " + "conv_bias (fw,sw,n,co,ci)=(%d,%d,%d,%d,%d)", + param.fw, param.sw, param.n, param.co, param.ci); +#undef cb1 +#undef cb2 +#undef cb3 +#undef cb4 +#undef DISPATCH_BLOCK +#undef DISPATCH_CHK +#undef DISPATCH_NOCHK +#undef DISPATCH_ODD +#undef DISPATCH_EVEN + return kern; +} +} // namespace + +template +void 
megdnn::cuda::conv_bias_int8:: + do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width( + const int8_t* d_src, const int8_t* d_filter, BiasVisitor bias, + Epilogue epilogue, const ConvParam& param, float alpha, + float beta, cudaStream_t stream) { + void (*kern)(const int8_t* __restrict__, const int8_t* __restrict__, + BiasVisitor, Epilogue, ConvParam, float, float); + conv_bias_int8::LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + cuda_check(cudaFuncSetCacheConfig(reinterpret_cast(kern), + cudaFuncCachePreferShared)); + cuda_check(cudaFuncSetSharedMemConfig(reinterpret_cast(kern), + cudaSharedMemBankSizeEightByte)); + + kern<<>>( + d_src, d_filter, bias, epilogue, param, alpha, beta); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_hswish.cu new file mode 100644 index 00000000..e6844030 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_id.cu new file mode 100644 index 00000000..34e323ff --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
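The kimpl/*.cu files that follow are generated by gen_cuda_conv_bias_kern_impls.py and contain nothing but explicit template instantiations of the do_conv_bias_* entry points, one (bias visitor, epilogue) combination per file, so the expensive compilation is split across many small translation units. A minimal illustration of that pattern, with toy names rather than the MegEngine types:

// Explicit-instantiation pattern used by the generated kimpl files: the template
// definition lives in a shared include, each generated file forces one instance.
#include <cstdio>

template <typename Epilogue>
void run_epilogue(float x) {                 // stand-in for a do_conv_bias_* entry
    std::printf("%f\n", Epilogue::apply(x));
}

struct ReluEpilogue {
    static float apply(float x) { return x > 0.f ? x : 0.f; }
};

// The line a generator script would emit, one per (visitor, epilogue) combination:
template void run_epilogue<ReluEpilogue>(float);

int main() {
    run_epilogue<ReluEpilogue>(-1.5f);
    return 0;
}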
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_relu.cu new file mode 100644 index 00000000..4a51b905 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu new file mode 100644 index 00000000..f2e3b92f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_id.cu new file mode 100644 index 00000000..a0f223ee --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu new file mode 100644 index 00000000..72b3ef0e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_reorder_filter>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu new file mode 100644 index 00000000..03d436d1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_id.cu new file mode 100644 index 00000000..c6e6d1a5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_relu.cu new file mode 100644 index 00000000..cb4f123c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma16x16x16_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_hswish.cu new file mode 100644 index 00000000..8202b911 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_id.cu new file mode 100644 index 00000000..278f7fc1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_relu.cu new file mode 100644 index 00000000..1e75ac20 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu new file mode 100644 index 00000000..b3c64acb --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_id.cu new file mode 100644 index 00000000..c4dd83d8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu new file mode 100644 index 00000000..bef500ad --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_reorder_filter>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu new file mode 100644 index 00000000..f8387bd4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_id.cu new file mode 100644 index 00000000..e60e94e9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_relu.cu new file mode 100644 index 00000000..713a0a24 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma32x8x16_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_hswish.cu new file mode 100644 index 00000000..1cd8d413 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_id.cu new file mode 100644 index 00000000..66ff72df --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_relu.cu new file mode 100644 index 00000000..9c22fea6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu new file mode 100644 index 00000000..4dff4381 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_id.cu new file mode 100644 index 00000000..d9ff06e4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu new file mode 100644 index 00000000..d495b6e9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_reorder_filter>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu new file mode 100644 index 00000000..a601e2b6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_hswish.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_id.cu new file mode 100644 index 00000000..c17d2df7 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_id.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_id.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_relu.cu new file mode 100644 index 00000000..bb46408c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_relu.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width_per_chan_relu.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_cuda_conv_bias_kern_impls.py +#include "../conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width.cuinl" + +template void megdnn::cuda::conv_bias_int8::do_conv_bias_int8_implicit_gemm_imma8x32x16_cdiv4hwn4_unroll_width>>( + const int8_t* d_src, + const int8_t* d_filter, + PerChannelBiasVisitor bias, + IConvEpilogue> epilogue, + const ConvParam& param, + float alpha, + float beta, + cudaStream_t stream); diff --git a/dnn/src/cuda/conv_bias/matmul.cpp b/dnn/src/cuda/conv_bias/matmul.cpp new file mode 100644 index 00000000..1f7956a6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/matmul.cpp @@ -0,0 +1,137 @@ +/** + * \file dnn/src/cuda/conv_bias/matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/common/conv_bias.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/conv_bias/helper.h" +#include "src/cuda/conv_bias/matmul/im2col.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +bool ConvBiasForwardImpl::AlgoMatmul::is_available(const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; + + auto&& fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCHW && + args.src_layout->dtype.category() == DTypeCategory::FLOAT && + fm.group == 1 && fm.spatial_ndim == 2; +} + +WorkspaceBundle ConvBiasForwardImpl::AlgoMatmul::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + auto dst_layout = *args.dst_layout; + SmallVector sizes; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + sizes.push_back(dst_layout.span().dist_byte()); + } + + SizeArgs conv_args = args; + conv_args.dst_layout = &dst_layout; + SmallVector matmul_sizes; + WorkspaceBundle matmul_bundle = matmul_get_workspace_bundle(conv_args); + for (size_t i = 0; i < matmul_bundle.nr_workspace(); ++i) { + matmul_sizes.push_back(matmul_bundle.get_size(i)); + } + sizes.insert(sizes.begin(), matmul_sizes.begin(), matmul_sizes.end()); + return {ptr, std::move(sizes)}; +} + +size_t ConvBiasForwardImpl::AlgoMatmul::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void ConvBiasForwardImpl::AlgoMatmul::exec(const ExecArgs& args) const { + auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); + auto conv_dst_tensor = *args.dst_tensor; + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + + ExecArgs conv_args = args; + conv_args.dst_tensor = &conv_dst_tensor; + { + switch (conv_args.src_layout->dtype.enumv()) { +#define cb(dt) \ + case DTypeTrait
<dt>::enumv: { \ + using ctype = typename DTypeTrait<dt>
::ctype; \ + exec_internal(conv_args, bundle); \ + break; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb); +#undef cb + default: + megdnn_assert_internal(0); + } + } + handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +template +void ConvBiasForwardImpl::AlgoMatmul::exec_internal( + const ExecArgs& args, const WorkspaceBundle& bundle) { + auto&& fm = args.filter_meta; + size_t N = args.src_layout->shape[0], IC = fm.icpg, + IH = args.src_layout->shape[2], IW = args.src_layout->shape[3], + OC = fm.ocpg, OH = args.dst_tensor->layout.shape[2], + OW = args.dst_tensor->layout.shape[3], FH = fm.spatial[0], + FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1], + SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0], + DW = fm.dilation[1]; + auto stream = cuda_stream(args.handle); + T* dst_t = static_cast(bundle.get(0)); + T* col = static_cast(bundle.get(1)); + conv_bias::im2col(args.src_tensor->ptr(), col, N, + args.src_layout->stride[0], IC, IH, IW, FH, FW, OH, OW, + PH, PW, SH, SW, DH, DW, stream); + TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait::dtype()), + Bl({IC * FH * FW, OH * OW * N}, typename DTypeTrait::dtype()), + Cl({OC, OH * OW * N}, typename DTypeTrait::dtype()); + TensorND A(args.filter_tensor->ptr(), Al), B(col, Bl), C(dst_t, Cl); + if (fm.should_flip) { + conv_bias::flip_filter(args, bundle.get_workspace(2), A.raw_ptr); + } + auto&& matmul_opr = args.handle->create_operator(); + if (args.opr->param().compute_mode == + param::Convolution::ComputeMode::FLOAT32) { + matmul_opr->param().compute_mode = + param::MatrixMul::ComputeMode::FLOAT32; + } + megdnn_assert(matmul_opr->get_workspace_in_bytes(A.layout, B.layout, + C.layout) == 0_z, + "Assume matmul opr in algo MATMUL doesn't need extra " + "workspace"); + matmul_opr->exec(A, B, C, Workspace()); + + TensorLayout C2l({OC * OH * OW, N}, typename DTypeTrait::dtype()), + C3l = C2l; + C3l.stride[0] = 1; + C3l.stride[1] = args.dst_tensor->layout.stride[0]; + TensorND C2(dst_t, C2l); + TensorND C3(args.dst_tensor->ptr(), C3l); + args.handle->relayout_opr()->exec(C2, C3); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/matmul/im2col.cu b/dnn/src/cuda/conv_bias/matmul/im2col.cu new file mode 100644 index 00000000..0f19e956 --- /dev/null +++ b/dnn/src/cuda/conv_bias/matmul/im2col.cu @@ -0,0 +1,139 @@ +/** + * \file dnn/src/cuda/conv_bias/matmul/im2col.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
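// AlgoMatmul above implements conv_bias as im2col followed by one GEMM and a
// final relayout. The shape bookkeeping matches the layouts built in
// exec_internal: A = filter viewed as (OC, IC*FH*FW), B = unrolled input
// (IC*FH*FW, OH*OW*N), C = A*B with shape (OC, OH*OW*N). The CPU reference
// below is a simplified sketch for a single image (N == 1, no dilation, no
// filter flip); it is illustrative only, not MegEngine code.
#include <cstddef>
#include <vector>

// col has shape (IC*FH*FW, OH*OW), row-major: row = (ic*FH + fh)*FW + fw.
void im2col_ref(const float* im, std::vector<float>& col, size_t IC, size_t IH,
                size_t IW, size_t FH, size_t FW, size_t OH, size_t OW,
                size_t PH, size_t PW, size_t SH, size_t SW) {
    col.assign(IC * FH * FW * OH * OW, 0.f);
    for (size_t ic = 0; ic < IC; ++ic)
        for (size_t fh = 0; fh < FH; ++fh)
            for (size_t fw = 0; fw < FW; ++fw)
                for (size_t oh = 0; oh < OH; ++oh)
                    for (size_t ow = 0; ow < OW; ++ow) {
                        long ih = long(oh * SH + fh) - long(PH);
                        long iw = long(ow * SW + fw) - long(PW);
                        size_t row = (ic * FH + fh) * FW + fw;
                        size_t c = oh * OW + ow;
                        if (ih >= 0 && ih < long(IH) && iw >= 0 && iw < long(IW))
                            col[row * OH * OW + c] =
                                    im[(ic * IH + size_t(ih)) * IW + size_t(iw)];
                    }
}

// dst(oc, m) = sum_k filter(oc, k) * col(k, m), with K = IC*FH*FW, M = OH*OW.
void conv_as_gemm_ref(const float* filter, const float* col, float* dst,
                      size_t OC, size_t K, size_t M) {
    for (size_t oc = 0; oc < OC; ++oc)
        for (size_t m = 0; m < M; ++m) {
            float acc = 0.f;
            for (size_t k = 0; k < K; ++k)
                acc += filter[oc * K + k] * col[k * M + m];
            dst[oc * M + m] = acc;
        }
}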
+ */ +#include "megdnn/dtype.h" +#include "src/cuda/conv_bias/matmul/im2col.cuh" +#include "src/cuda/utils.cuh" + +using namespace megdnn; +using namespace cuda; + +namespace { + +template +__global__ void im2col_kernel(const T* im, T* col, uint32_t N, uint32_t INP_BS, + uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t FH, uint32_t FW, uint32_t OH, + uint32_t OW, uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, uint32_t DH, + uint32_t DW) { + uint32_t n = threadIdx.x + blockIdx.y * blockDim.x; + uint32_t ow = threadIdx.y + blockIdx.z * blockDim.y; + uint32_t oh = blockIdx.x % OH; + uint32_t fw = blockIdx.x / OH % FW; + uint32_t fh = blockIdx.x / OH / FW % FH; + uint32_t ic = blockIdx.x / OH / FW / FH; + if (n < N && ow < OW) { + uint32_t didx = blockIdx.x * OW * N + ow * N + n; + uint32_t ih = -PH + oh * SH + fh * DH; + uint32_t iw = -PW + ow * SW + fw * DW; + col[didx] = (ih < IH && iw < IW + ? im[n * INP_BS + ic * IH * IW + ih * IW + iw] + : T(0.0f)); + } +} + +template +__global__ void col2im_kernel(const T* col, T* im, uint32_t N, uint32_t INP_BS, + uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t FH, uint32_t FW, uint32_t OH, + uint32_t OW, uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, uint32_t DH, + uint32_t DW) { + uint32_t iw = threadIdx.x + blockIdx.y * blockDim.x; + uint32_t ih = threadIdx.y + blockIdx.z * blockDim.y; + uint32_t ic = blockIdx.x % IC; + uint32_t n = blockIdx.x / IC; + if (iw < IW && ih < IH) { + T res(0); + // ih = -ph + oh*sh + fh*dh + // ih + ph - fh*dh == oh*sh + for (uint32_t fh = 0; fh < FH; ++fh) { + uint32_t anchorh = ih + PH - fh * DH; + if (anchorh < OH * SH && anchorh % SH == 0) { + uint32_t oh = anchorh / SH; + for (uint32_t fw = 0; fw < FW; ++fw) { + uint32_t anchorw = iw + PW - fw * DW; + if (anchorw < OW * SW && anchorw % SW == 0) { + uint32_t ow = anchorw / SW; + res += col[ic * FH * FW * OH * OW * N + + fh * FW * OH * OW * N + fw * OH * OW * N + + oh * OW * N + ow * N + n]; + } + } + } + } + im[n * INP_BS + ic * IH * IW + ih * IW + iw] = res; + } +} + +} // anonymous namespace + +template +void conv_bias::im2col(const T* im, T* col, size_t N, size_t INP_BS, size_t IC, + size_t IH, size_t IW, size_t FH, size_t FW, size_t OH, + size_t OW, size_t PH, size_t PW, size_t SH, size_t SW, + size_t DH, size_t DW, cudaStream_t stream) { + dim3 threads(NR_THREADS_X, NR_THREADS_Y); + // dim3 blocks(DIVUP(N, NR_THREADS_X), DIVUP(OW, NR_THREADS_Y), + // IC*FH*FW*OH); IC*FH*FW*OH can be larger than 65536; shuffling blocks + // dimensions to put IC*FH*FW*OH to the first dimension. + dim3 blocks(IC * FH * FW * OH, DIVUP(N, NR_THREADS_X), + DIVUP(OW, NR_THREADS_Y)); + im2col_kernel<<>>(im, col, N, INP_BS, IC, IH, + IW, FH, FW, OH, OW, PH, PW, + SH, SW, DH, DW); + after_kernel_launch(); +} + +template +void conv_bias::col2im(const T* col, T* im, size_t N, size_t INP_BS, size_t IC, + size_t IH, size_t IW, size_t FH, size_t FW, size_t OH, + size_t OW, size_t PH, size_t PW, size_t SH, size_t SW, + size_t DH, size_t DW, cudaStream_t stream) { + dim3 threads(NR_THREADS_X, NR_THREADS_Y); + // (x, y, z) is shuffled to (y, z, x) to bypass CUDA launch shape + // limitation. 
dim3 blocks(DIVUP(IW, NR_THREADS_X), DIVUP(IH, NR_THREADS_Y), + // N*IC); + dim3 blocks(N * IC, DIVUP(IW, NR_THREADS_X), DIVUP(IH, NR_THREADS_Y)); + col2im_kernel<<>>(col, im, N, INP_BS, IC, IH, + IW, FH, FW, OH, OW, PH, PW, + SH, SW, DH, DW); + after_kernel_launch(); +} + +namespace megdnn { +namespace cuda { +namespace conv_bias { + +#define DO_INST(T) \ + template void im2col(const T* im, T* col, size_t N, size_t INP_BS, \ + size_t IC, size_t IH, size_t IW, size_t FH, \ + size_t FW, size_t OH, size_t OW, size_t PH, \ + size_t PW, size_t SH, size_t SW, size_t DH, \ + size_t DW, cudaStream_t stream); \ + template void col2im(const T* col, T* im, size_t N, size_t INP_BS, \ + size_t IC, size_t IH, size_t IW, size_t FH, \ + size_t FW, size_t OH, size_t OW, size_t PH, \ + size_t PW, size_t SH, size_t SW, size_t DH, \ + size_t DW, cudaStream_t stream); + +#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST); + +#undef DO_INST +#undef INST + +} // namespace conv_bias +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/matmul/im2col.cuh b/dnn/src/cuda/conv_bias/matmul/im2col.cuh new file mode 100644 index 00000000..bd283bc3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/matmul/im2col.cuh @@ -0,0 +1,37 @@ +/** + * \file dnn/src/cuda/conv_bias/matmul/im2col.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include +#include + +namespace megdnn { +namespace cuda { +namespace conv_bias { + +//! col is of shape (ic*fh*fw, oh*ow*n) +template +void im2col(const T* im, T* col, size_t N, size_t INP_BS, size_t IC, size_t IH, + size_t IW, size_t FH, size_t FW, size_t OH, size_t OW, size_t PH, + size_t PW, size_t SH, size_t SW, size_t DH, size_t DW, // dilation + cudaStream_t stream); + +template +void col2im(const T* col, T* im, size_t N, size_t INP_BS, size_t IC, size_t IH, + size_t IW, size_t FH, size_t FW, size_t OH, size_t OW, size_t PH, + size_t PW, size_t SH, size_t SW, size_t DH, size_t DW, // dilation + cudaStream_t stream); + +} // namespace conv_bias +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/matmul/im2col_nhwc_int8.cu b/dnn/src/cuda/conv_bias/matmul/im2col_nhwc_int8.cu new file mode 100644 index 00000000..201b23f3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/matmul/im2col_nhwc_int8.cu @@ -0,0 +1,76 @@ +/** + * \file dnn/src/cuda/conv_bias/matmul/im2col_nhwc_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
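// The commented-out grid shapes in im2col/col2im above were rejected because
// gridDim.y and gridDim.z are limited to 65535, while gridDim.x may be as large
// as 2^31 - 1 (compute capability >= 3.0). The kernels therefore put the
// potentially huge product (IC*FH*FW*OH, or N*IC) in the first grid dimension
// and decode it with division/modulo inside the kernel. A minimal sketch of the
// same trick; names and sizes are illustrative only:
#include <cuda_runtime.h>

__global__ void big_first_dim_kernel(float* out, unsigned IC, unsigned FH,
                                     unsigned FW, unsigned OH) {
    // Decode the flattened first grid dimension, as im2col_kernel above does.
    unsigned oh = blockIdx.x % OH;
    unsigned fw = blockIdx.x / OH % FW;
    unsigned fh = blockIdx.x / OH / FW % FH;
    unsigned ic = blockIdx.x / OH / FW / FH;
    if (threadIdx.x == 0 && threadIdx.y == 0)
        // Round-trips back to blockIdx.x, demonstrating the decode is lossless.
        out[blockIdx.x] = float(((ic * FH + fh) * FW + fw) * OH + oh);
}

void launch_big_first_dim(float* d_out,  // must hold IC*FH*FW*OH floats
                          unsigned IC, unsigned FH, unsigned FW, unsigned OH,
                          unsigned N, unsigned OW, cudaStream_t stream) {
    dim3 threads(32, 8);
    // IC*FH*FW*OH may exceed 65535, so it must not go in grid.y or grid.z.
    dim3 blocks(IC * FH * FW * OH, (N + threads.x - 1) / threads.x,
                (OW + threads.y - 1) / threads.y);
    big_first_dim_kernel<<<blocks, threads, 0, stream>>>(d_out, IC, FH, FW, OH);
}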
+ */ +#include "src/cuda/conv_bias/matmul/im2col_nhwc_int8.cuh" + +#include "src/cuda/utils.cuh" + +namespace { + +template +__global__ void im2col_kern(const int8_t* __restrict src, + int8_t* __restrict unrolled, uint32_t N, + uint32_t IH, uint32_t IW, uint32_t IC, uint32_t IWS, + uint32_t OH, uint32_t OW, uint32_t OC, uint32_t OWS, + uint32_t FH, uint32_t FW, uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, uint32_t DH, uint32_t DW, + uint32_t LD) { + uint32_t ic = blockIdx.x * 32 + threadIdx.x; + uint32_t ow = blockIdx.y * 4 + threadIdx.y; + uint32_t oh = blockIdx.z * 4 + threadIdx.z; + uint32_t offset = (oh * OW + ow) * LD + ic; + if (ic < IC && ow < OW && oh < OH) { + for (uint32_t fh = 0; fh < FH; ++fh) { + for (size_t fw = 0; fw < FW; ++fw) { + uint32_t ih = -PH + oh * SH + (flip ? FH - fh - 1 : fh) * DH; + uint32_t iw = -PW + ow * SW + (flip ? FW - fw - 1 : fw) * DW; + uint32_t i = offset + (fh * FW + fw) * IC; + if (ih < IH && iw < IW) { + unrolled[i] = src[(ih * IW + iw) * IWS + ic]; + } else { + unrolled[i] = 0; + } + } + } + } +} + +} // anonymous namespace + +void megdnn::cuda::im2col_nhwc_int8(const int8_t* src, int8_t* unrolled, + uint32_t N, uint32_t IH, uint32_t IW, + uint32_t IC, uint32_t IWS, uint32_t OH, + uint32_t OW, uint32_t OC, uint32_t OWS, + uint32_t FH, uint32_t FW, uint32_t PH, + uint32_t PW, uint32_t SH, uint32_t SW, + uint32_t DH, uint32_t DW, uint32_t LD, + bool flip, cudaStream_t stream) { + dim3 nthreads = dim3(32, 4, 4); + dim3 nblocks = dim3(DIVUP(IC, 32), DIVUP(OW, 4), DIVUP(OH, 4)); + void (*kern_ptr)(const int8_t* __restrict src, int8_t* __restrict unrolled, + uint32_t N, uint32_t IH, uint32_t IW, uint32_t IC, + uint32_t IWS, uint32_t OH, uint32_t OW, uint32_t OC, + uint32_t OWS, uint32_t FH, uint32_t FW, uint32_t PH, + uint32_t PW, uint32_t SH, uint32_t SW, uint32_t DH, + uint32_t DW, uint32_t LD); + if (flip) { + kern_ptr = im2col_kern; + } else { + kern_ptr = im2col_kern; + } + for (size_t n = 0; n < N; ++n) { + kern_ptr<<>>( + src + n * IH * IW * IWS, unrolled + n * OH * OW * LD, N, IH, IW, + IC, IWS, OH, OW, OC, OWS, FH, FW, PH, PW, SH, SW, DH, DW, LD); + } + after_kernel_launch(); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/matmul/im2col_nhwc_int8.cuh b/dnn/src/cuda/conv_bias/matmul/im2col_nhwc_int8.cuh new file mode 100644 index 00000000..1c78cf3f --- /dev/null +++ b/dnn/src/cuda/conv_bias/matmul/im2col_nhwc_int8.cuh @@ -0,0 +1,28 @@ +/** + * \file dnn/src/cuda/conv_bias/matmul/im2col_nhwc_int8.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
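// The `flip` template parameter of im2col_kern above selects between
// cross-correlation (flip == false) and true convolution (flip == true): with
// flip, the filter tap (fh, fw) is read at (FH-1-fh, FW-1-fw), i.e. the input is
// correlated with a spatially flipped filter. A 1-D CPU sketch of the identity
// the kernel relies on (illustrative only, assumes in.size() >= k.size()):
#include <vector>

// Valid-mode 1-D correlation: out[o] = sum_f in[o+f] * k[f]
std::vector<float> correlate(const std::vector<float>& in,
                             const std::vector<float>& k) {
    std::vector<float> out(in.size() - k.size() + 1, 0.f);
    for (size_t o = 0; o < out.size(); ++o)
        for (size_t f = 0; f < k.size(); ++f)
            out[o] += in[o + f] * k[f];
    return out;
}

// Valid-mode 1-D convolution: out[o] = sum_f in[o+f] * k[K-1-f], i.e.
// correlation with the reversed kernel -- what flip == true does per axis.
std::vector<float> convolve(const std::vector<float>& in,
                            const std::vector<float>& k) {
    std::vector<float> flipped(k.rbegin(), k.rend());
    return correlate(in, flipped);
}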
+ */ +#pragma once +#include +#include + +namespace megdnn { +namespace cuda { + +void im2col_nhwc_int8(const int8_t* src, int8_t* unrolled, uint32_t N, + uint32_t IH, uint32_t IW, uint32_t IC, uint32_t IWS, + uint32_t OH, uint32_t OW, uint32_t OC, uint32_t OWS, + uint32_t FH, uint32_t FW, uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, uint32_t DH, uint32_t DW, + uint32_t LD, bool flip, cudaStream_t stream); + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cu b/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cu new file mode 100644 index 00000000..354c28d2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cu @@ -0,0 +1,392 @@ +/** + * \file dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/conv_bias/matmul/inplace_matmul_impl.cuh" +#include "src/cuda/utils.cuh" + +using namespace megdnn; +using namespace cuda; + +namespace { + +struct BufferFetcherTexture { + cudaTextureObject_t tex; + + __device__ __forceinline__ float get(uint32_t offset) { + return tex1Dfetch(tex, offset); + } +}; + +struct BufferFetcherRaw { + const float* ptr; + + __device__ __forceinline__ float get(uint32_t offset) { + return ptr[offset]; + } +}; + +struct BufferFetcherTextureHost { + bool init_succ; + BufferFetcherTexture val; + + BufferFetcherTextureHost(float* p, const size_t n); + + ~BufferFetcherTextureHost() { reset(); } + + void reset() { + if (init_succ) { + cuda_check(cudaDestroyTextureObject(val.tex)); + init_succ = false; + } + } +}; + +BufferFetcherTextureHost::BufferFetcherTextureHost(float* p, const size_t n) { + init_succ = false; + cudaTextureObject_t tex_obj; + + cudaResourceDesc res_desc; + memset(&res_desc, 0, sizeof(cudaResourceDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = static_cast(p); + res_desc.res.linear.sizeInBytes = n * sizeof(float); + res_desc.res.linear.desc = + cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaTextureDesc tex_desc; + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + if (cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL) == + cudaSuccess) { + val.tex = tex_obj; + init_succ = true; + } else { + cudaGetLastError(); // reset error + } +} + +template +struct KernelPtr { + typedef void (*type)(BufferFetcher, BufferFetcher, float*, uint32_t, + uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); +}; + +//! 1 -> 0xffffffff, 0 -> 0x00000000 +__device__ __forceinline__ uint32_t bool_as_mask(uint32_t cond) { + return (!cond) - 1u; +} + +union FloatAndU32 { + float f; + uint32_t u; +}; + +//! 
\p mask must be either all 1 or 0 bits +template +__device__ __forceinline__ float visit_with_mask(BufferFetcher buf, + uint32_t offset, + uint32_t mask) { + FloatAndU32 f; + f.f = buf.get(offset & mask); + f.u &= mask; + return f.f; +} + +template +__global__ void conv_kernel(BufferFetcher src, BufferFetcher filter, float* dst, + const uint32_t INP_BS, const uint32_t OUT_BS, + const uint32_t IC, const uint32_t IH, + const uint32_t IW, const uint32_t OC, + const uint32_t OH, const uint32_t OW, + const uint32_t FH, const uint32_t FW, + const uint32_t SH, const uint32_t SW, + const uint32_t PH, const uint32_t PW) { + const uint32_t BM = BY < BX ? BY : BX; + // BY*BX == 256 + // (OC) * (IC*FH*FW) * (OH*OW) + const uint32_t n = blockIdx.z; + const uint32_t tidx = threadIdx.x; + const uint32_t tidy = threadIdx.y; + const uint32_t posx = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t posy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t posx2 = posx << 2; + const uint32_t posy2 = posy << 2; + const uint32_t heightA = OC; + const uint32_t widthA = IC * FH * FW; + const uint32_t heightB = widthA; + const uint32_t widthB = OH * OW; + const uint32_t oh0 = (posx2 + 0) / OW * SH; + const uint32_t ow0 = (posx2 + 0) % OW * SW; + const uint32_t op0 = oh0 * IW + ow0; + const uint32_t oh1 = (posx2 + 1) / OW * SH; + const uint32_t ow1 = (posx2 + 1) % OW * SW; + const uint32_t op1 = oh1 * IW + ow1; + const uint32_t oh2 = (posx2 + 2) / OW * SH; + const uint32_t ow2 = (posx2 + 2) % OW * SW; + const uint32_t op2 = oh2 * IW + ow2; + const uint32_t oh3 = (posx2 + 3) / OW * SH; + const uint32_t ow3 = (posx2 + 3) % OW * SW; + const uint32_t op3 = oh3 * IW + ow3; + const uint32_t FP = FH * FW; + // OC % (BLOCK*4) == 0 + // IC*FH*FW % BLOCK == 0 + // OH*OW % (BLOCK*4) == 0 + __shared__ float4 localA[BY][BM]; + __shared__ float4 localB[BM][BX]; + uint32_t i = 0u; + uint32_t offsetA = posy2 * widthA + tidx; + uint32_t offsetB = n * INP_BS - PH * IW - PW; + float4 sum0 = {0.0f, 0.0f, 0.0f, 0.0f}, sum1 = {0.0f, 0.0f, 0.0f, 0.0f}, + sum2 = {0.0f, 0.0f, 0.0f, 0.0f}, sum3 = {0.0f, 0.0f, 0.0f, 0.0f}; + uint32_t fh = tidy / FW % FH; + uint32_t fw = tidy % FW; + uint32_t ic = tidy / (FH * FW); + uint32_t icm = tidy % (FH * FW); + + const uint32_t fhs = BM / FW % FH; + const uint32_t fws = BM % FW; + const uint32_t ics = BM / (FH * FW); + const uint32_t icms = BM % (FH * FW); + + for (; i < widthA; i += BM, offsetA += BM) { + // load localA + if (tidx < BM) { + localA[tidy][tidx].x = filter.get(offsetA + 0 * widthA); + localA[tidy][tidx].y = filter.get(offsetA + 1 * widthA); + localA[tidy][tidx].z = filter.get(offsetA + 2 * widthA); + localA[tidy][tidx].w = filter.get(offsetA + 3 * widthA); + } + + // load localB + /* + const uint32_t fh_t = (tidy+i) / FW % FH; + const uint32_t fw_t = (tidy+i) % FW; + const uint32_t ic_t = (tidy+i) / (FH*FW); + if (fh != fh_t) printf("fh=%d, fh_t=%d\n", fh, fh_t); + if (fw != fw_t) printf("fw=%d, fw_t=%d\n", fw, fw_t); + if (ic != ic_t) printf("ic=%d, ic_t=%d\n", ic, ic_t); + */ + uint32_t fh2, fw2; + if (is_xcorr) { + fh2 = fh; + fw2 = fw; + } else { + fh2 = FH - fh - 1; + fw2 = FW - fw - 1; + } + + if (tidy < BM) { + uint32_t tmp = offsetB + (ic * IH + (fh2)) * IW + (fw2), + ok = bool_as_mask(tidy + i < heightB), + p0 = bool_as_mask(fh2 + oh0 >= PH && fh2 + oh0 < IH + PH && + fw2 + ow0 >= PW && fw2 + ow0 < IW + PW), + p1 = bool_as_mask(fh2 + oh1 >= PH && fh2 + oh1 < IH + PH && + fw2 + ow1 >= PW && fw2 + ow1 < IW + PW), + p2 = bool_as_mask(fh2 + oh2 >= PH && fh2 + oh2 < IH + 
PH && + fw2 + ow2 >= PW && fw2 + ow2 < IW + PW), + p3 = bool_as_mask(fh2 + oh3 >= PH && fh2 + oh3 < IH + PH && + fw2 + ow3 >= PW && fw2 + ow3 < IW + PW); + localB[tidy][tidx].x = visit_with_mask(src, tmp + op0, ok & p0); + localB[tidy][tidx].y = visit_with_mask(src, tmp + op1, ok & p1); + localB[tidy][tidx].z = visit_with_mask(src, tmp + op2, ok & p2); + localB[tidy][tidx].w = visit_with_mask(src, tmp + op3, ok & p3); + } + + __syncthreads(); + + for (uint32_t j = 0u; j < BM; ++j) { + float4 tmpA = localA[tidy][j]; + float4 tmpB = localB[j][tidx]; + sum0.x += tmpA.x * tmpB.x; + sum0.y += tmpA.x * tmpB.y; + sum0.z += tmpA.x * tmpB.z; + sum0.w += tmpA.x * tmpB.w; + sum1.x += tmpA.y * tmpB.x; + sum1.y += tmpA.y * tmpB.y; + sum1.z += tmpA.y * tmpB.z; + sum1.w += tmpA.y * tmpB.w; + sum2.x += tmpA.z * tmpB.x; + sum2.y += tmpA.z * tmpB.y; + sum2.z += tmpA.z * tmpB.z; + sum2.w += tmpA.z * tmpB.w; + sum3.x += tmpA.w * tmpB.x; + sum3.y += tmpA.w * tmpB.y; + sum3.z += tmpA.w * tmpB.z; + sum3.w += tmpA.w * tmpB.w; + } + + fw += fws; + fh += fhs; + fh += (fw >= FW); + fh -= (fh >= FH) * FH; + fw -= (fw >= FW) * FW; + + ic += ics; + icm += icms; + ic += (icm >= FP); + icm -= (icm >= FP) * FP; + __syncthreads(); + } + const uint32_t dst_idx = n * OUT_BS + posy2 * widthB + posx2; + bool y0 = (posy2 + 0 < heightA); + bool y1 = (posy2 + 1 < heightA); + bool y2 = (posy2 + 2 < heightA); + bool y3 = (posy2 + 3 < heightA); + bool x0 = (posx2 + 0 < widthB); + bool x1 = (posx2 + 1 < widthB); + bool x2 = (posx2 + 2 < widthB); + bool x3 = (posx2 + 3 < widthB); + if (y0) { + if (x0) + dst[dst_idx + 0 * widthB + 0] = sum0.x; + if (x1) + dst[dst_idx + 0 * widthB + 1] = sum0.y; + if (x2) + dst[dst_idx + 0 * widthB + 2] = sum0.z; + if (x3) + dst[dst_idx + 0 * widthB + 3] = sum0.w; + } + if (y1) { + if (x0) + dst[dst_idx + 1 * widthB + 0] = sum1.x; + if (x1) + dst[dst_idx + 1 * widthB + 1] = sum1.y; + if (x2) + dst[dst_idx + 1 * widthB + 2] = sum1.z; + if (x3) + dst[dst_idx + 1 * widthB + 3] = sum1.w; + } + if (y2) { + if (x0) + dst[dst_idx + 2 * widthB + 0] = sum2.x; + if (x1) + dst[dst_idx + 2 * widthB + 1] = sum2.y; + if (x2) + dst[dst_idx + 2 * widthB + 2] = sum2.z; + if (x3) + dst[dst_idx + 2 * widthB + 3] = sum2.w; + } + if (y3) { + if (x0) + dst[dst_idx + 3 * widthB + 0] = sum3.x; + if (x1) + dst[dst_idx + 3 * widthB + 1] = sum3.y; + if (x2) + dst[dst_idx + 3 * widthB + 2] = sum3.z; + if (x3) + dst[dst_idx + 3 * widthB + 3] = sum3.w; + } +} + +} // anonymous namespace + +void conv_bias::exec_inplace_matmul_fwd( + const float* src, const float* filter, float* dst, size_t N, + size_t INP_BS, size_t OUT_BS, size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, size_t FH, size_t FW, size_t PH, + size_t PW, size_t SH, size_t SW, bool is_xcorr, cudaStream_t stream) { + BufferFetcherTextureHost src_tex(const_cast(src), N * INP_BS), + filter_tex(const_cast(filter), OC * IC * FH * FW); + + BufferFetcherRaw src_buf, filter_buf; + src_buf.ptr = src; + filter_buf.ptr = filter; + if (!src_tex.init_succ || !filter_tex.init_succ) { + src_tex.reset(); + filter_tex.reset(); + } + int m = OC; + int n = OH * OW; + int BY = 1; + int BX = 1; + if (m <= 64) { + while (BY < 16 && (BY << 2) < m) + BY <<= 1; + BX = 256 / BY; + } else if (n <= 64) { + while (BX < 16 && (BX << 2) < n) + BX <<= 1; + BY = 256 / BX; + } else { + BX = BY = 16; + } + dim3 blocks((OH * OW + BX * 4 - 1) / (BX * 4), (OC + BY * 4 - 1) / (BY * 4), + N); + dim3 threads(BX, BY); +#define DISPATCH_BX_BY(BX, BY) \ + do { \ + if (src_tex.init_succ) { \ + 
KernelPtr::type kptr; \ + if (is_xcorr) { \ + kptr = conv_kernel; \ + } else { \ + kptr = conv_kernel; \ + } \ + kptr<<>>( \ + src_tex.val, filter_tex.val, dst, INP_BS, OUT_BS, IC, IH, \ + IW, OC, OH, OW, FH, FW, SH, SW, PH, PW); \ + } else { \ + KernelPtr::type kptr; \ + if (is_xcorr) { \ + kptr = conv_kernel; \ + } else { \ + kptr = conv_kernel; \ + } \ + kptr<<>>( \ + src_buf, filter_buf, dst, INP_BS, OUT_BS, IC, IH, IW, OC, \ + OH, OW, FH, FW, SH, SW, PH, PW); \ + } \ + } while (0) +#define DISPATCH_BX(BX) \ + do { \ + DISPATCH_BX_BY(BX, 256 / BX); \ + } while (0) +#define DISPATCH() \ + do { \ + switch (BX) { \ + case 1: \ + DISPATCH_BX(1); \ + break; \ + case 2: \ + DISPATCH_BX(2); \ + break; \ + case 4: \ + DISPATCH_BX(4); \ + break; \ + case 8: \ + DISPATCH_BX(8); \ + break; \ + case 16: \ + DISPATCH_BX(16); \ + break; \ + case 32: \ + DISPATCH_BX(32); \ + break; \ + case 64: \ + DISPATCH_BX(64); \ + break; \ + case 128: \ + DISPATCH_BX(128); \ + break; \ + case 256: \ + DISPATCH_BX(256); \ + break; \ + default: \ + report_error("no usable kernel"); \ + } \ + } while (0) + DISPATCH(); +#undef DISPATCH +#undef DISPATCH_BX +#undef DISPATCH_BX_BY + after_kernel_launch(); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cuh b/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cuh new file mode 100644 index 00000000..a9a98ba2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cuh @@ -0,0 +1,32 @@ +/** + * \file dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include +#include +#include + +namespace megdnn { +namespace cuda { +namespace conv_bias { + +void exec_inplace_matmul_fwd(const float* src, const float* filter, float* dst, + size_t N, size_t INP_BS, size_t OUT_BS, size_t IC, + size_t IH, size_t IW, size_t OC, size_t OH, + size_t OW, size_t FH, size_t FW, size_t PH, + size_t PW, size_t SH, size_t SW, bool is_xcorr, + cudaStream_t stream); + +} // namespace conv_bias +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/matmul_8x8x32.cpp b/dnn/src/cuda/conv_bias/matmul_8x8x32.cpp new file mode 100644 index 00000000..d243924f --- /dev/null +++ b/dnn/src/cuda/conv_bias/matmul_8x8x32.cpp @@ -0,0 +1,301 @@ +/** + * \file dnn/src/cuda/conv_bias/matmul_8x8x32.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/common/conv_bias.h" +#include "src/cuda/utils.h" +#include "src/cuda/utils.cuh" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/conv_bias/matmul/im2col_nhwc_int8.cuh" + +using namespace megdnn; +using namespace cuda; + +bool ConvBiasForwardImpl::AlgoMatmul8x8x32::is_available( + const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; + if (cuda::current_device_prop().major < 6) + return false; + + auto dst_layout = *args.dst_layout; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + } + + using NonlineMode = param::ConvBias::NonlineMode; + auto&& fm = args.filter_meta; + bool available = + (args.nonlinear_mode == NonlineMode::IDENTITY || + args.nonlinear_mode == NonlineMode::RELU) && + ((args.src_layout->dtype == dtype::Int8() && + dst_layout.dtype == dtype::Int32() && + fm.dtype.enumv() == DTypeEnum::Int8) || + (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8 && + dst_layout.dtype.enumv() == DTypeEnum::QuantizedS32)) && + fm.group == 1 && fm.spatial_ndim == 2 && + (fm.format == Param::Format::NHWC || + fm.format == Param::Format::NCHW4); + return available; +}; + +template +WorkspaceBundle ConvBiasForwardImpl::AlgoMatmul8x8x32::get_bundle( + const SizeArgs& args) const { + size_t src_unroll_part, filter_reshape_part; + size_t relayout_src_part = 0, relayout_filter_part = 0, + relayout_dst_part = 0; + auto&& fm = args.filter_meta; + size_t n, ih, iw, oh, ow, fh, fw, ic, oc; + n = args.dst_layout->shape[0]; + fh = fm.spatial[0]; + fw = fm.spatial[1]; + if (format == Param::Format::NHWC) { + oh = args.dst_layout->shape[1]; + ow = args.dst_layout->shape[2]; + ic = args.src_layout->shape[3]; + oc = args.dst_layout->shape[3]; + } else { + // NCHW4 + ic = args.src_layout->shape[1] * 4; + ih = args.src_layout->shape[2]; + iw = args.src_layout->shape[3]; + oc = args.dst_layout->shape[1] * 4; + oh = args.dst_layout->shape[2]; + ow = args.dst_layout->shape[3]; + + relayout_src_part = n * ic * ih * iw * sizeof(int8_t); + relayout_filter_part = ic * oc * fh * fw * sizeof(int8_t); + relayout_dst_part = n * oc * oh * ow * sizeof(int32_t); + } + // short for ``leading dimension'' + size_t ld = (fh * fw * ic + 3) & ~3; + if (need_src_unroll(args)) { + src_unroll_part = n * oh * ow * ld * sizeof(int8_t); + } else { + src_unroll_part = 0; + } + if (need_filter_reshape(args)) { + filter_reshape_part = oc * ld * sizeof(int8_t); + } else { + filter_reshape_part = 0; + } + + SmallVector sizes = {src_unroll_part, filter_reshape_part, + relayout_src_part, relayout_filter_part, + relayout_dst_part}; + + auto dst_layout = *args.dst_layout; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + sizes.push_back(dst_layout.span().dist_byte()); + } + + return WorkspaceBundle(nullptr, sizes); +} + +size_t ConvBiasForwardImpl::AlgoMatmul8x8x32::get_workspace_in_bytes( + const SizeArgs& args) const { + if (args.filter_meta.format == Param::Format::NHWC) { + auto bundle = get_bundle(args); + return bundle.total_size_in_bytes(); + } else { + // NCHW4 + auto bundle = get_bundle(args); + return bundle.total_size_in_bytes(); + } +} + +template +void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec_internal( + const ExecArgs& args) const { + auto stream = 
args.handle->stream(); + auto cublas_handle = args.handle->cublas_handle(); + auto alpha = args.handle->one_device_i32(); + auto beta = args.handle->zero_device_i32(); + auto&& fm = args.filter_meta; + auto bundle = get_bundle(args); + bundle.set(args.workspace.raw_ptr); + + TensorND src_tensor, dst_tensor, filter_tensor; + if (format == Param::Format::NHWC) { + src_tensor = *args.src_tensor; + dst_tensor = *args.dst_tensor; + filter_tensor = *args.filter_tensor; + } else { + // NCHW4 + auto to_nhwc = [](const TensorLayout& layout, + void* raw_ptr) -> TensorND { + return {raw_ptr, + {{layout[0], layout[2], layout[3], layout[1] * 4}, + layout.dtype}}; + }; + src_tensor = to_nhwc(*args.src_layout, bundle.get(2)); + filter_tensor = to_nhwc(args.filter_tensor->layout, bundle.get(3)); + dst_tensor = to_nhwc(*args.dst_layout, bundle.get(4)); + + auto relayout = [&](const TensorND& src, void* dst_ptr) { + auto N = src.layout[0], C = src.layout[1] * 4, H = src.layout[2], + W = src.layout[3]; + args.handle->relayout_opr()->exec( + {src.raw_ptr, + TensorLayout{{N, H, W, C / 4, 4}, + { + src.layout.stride[0], + src.layout.stride[2], + src.layout.stride[3], + src.layout.stride[1], + src.layout.stride[4] + }, + src.layout.dtype}}, + {dst_ptr, + TensorLayout{{N, H, W, C / 4, 4}, src.layout.dtype}}); + }; + relayout(*args.src_tensor, src_tensor.raw_ptr); + relayout(*args.filter_tensor, filter_tensor.raw_ptr); + } + + size_t N, IH, IW, IC; + N = src_tensor.layout.shape[0]; + IH = src_tensor.layout.shape[1]; + IW = src_tensor.layout.shape[2]; + IC = src_tensor.layout.shape[3]; + + auto IWS = src_tensor.layout.stride[2]; + auto FH = fm.spatial[0], FW = fm.spatial[1]; + auto OH = dst_tensor.layout.shape[1], OW = dst_tensor.layout.shape[2], + OC = dst_tensor.layout.shape[3]; + auto OWS = dst_tensor.layout.stride[2]; + auto PH = fm.padding[0], PW = fm.padding[1]; + auto SH = fm.stride[0], SW = fm.stride[1]; + auto DH = fm.dilation[0], DW = fm.dilation[1]; + auto LD = (FH * FW * IC + 3) & ~3; + + int8_t *inp0 = nullptr, *inp1 = nullptr; + ptrdiff_t inp0_stride = 0, inp1_stride = 0; + + if (need_src_unroll(args)) { + inp0 = static_cast(bundle.get(0)); + inp0_stride = LD; + im2col_nhwc_int8(src_tensor.compatible_ptr(), inp0, N, IH, IW, + IC, IWS, OH, OW, OC, OWS, FH, FW, PH, PW, SH, SW, DH, + DW, LD, fm.should_flip, stream); + } else { + inp0 = src_tensor.compatible_ptr(); + inp0_stride = IWS; + } + if (need_filter_reshape(args)) { + // copy (OC, FH*FW*IC) to (OC, FH*FW*IC) with stride=LD + inp1 = static_cast(bundle.get(1)); + cuda_check(cudaMemcpy2DAsync( + inp1, LD * sizeof(int8_t), filter_tensor.raw_ptr, + FH * FW * IC * sizeof(int8_t), FH * FW * IC * sizeof(int8_t), + OC, cudaMemcpyDeviceToDevice, stream)); + inp1_stride = LD; + } else { + inp1 = filter_tensor.compatible_ptr(); + inp1_stride = FH * FW * IC; + } + cublas_check(cublasGemmEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, OC, + N * OH * OW, FH * FW * IC, alpha, inp1, CUDA_R_8I, + inp1_stride, inp0, CUDA_R_8I, inp0_stride, beta, + dst_tensor.compatible_ptr(), CUDA_R_32I, + OWS, CUDA_R_32I, CUBLAS_GEMM_DFALT)); + + if (format == Param::Format::NCHW4) { + args.handle->relayout_opr()->exec( + {dst_tensor.compatible_ptr(), + TensorLayout{{N, OC / 4, OH, OW, 4}, + {static_cast(OH * OW * OC), 4, + static_cast(OC * OW), + static_cast(OC), 1}, + dst_tensor.layout.dtype}}, + *args.dst_tensor); + } +} + +void ConvBiasForwardImpl::AlgoMatmul8x8x32::exec(const ExecArgs& args) const { + ExecArgs conv_args = args; + auto conv_dst_tensor = *args.dst_tensor; + if 
(args.filter_meta.format == Param::Format::NHWC) { + auto bundle = get_bundle(args); + bundle.set(args.workspace.raw_ptr); + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + conv_args.dst_tensor = &conv_dst_tensor; + conv_args.dst_layout = &conv_dst_tensor.layout; + } else { + auto bundle = get_bundle(args); + bundle.set(args.workspace.raw_ptr); + if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { + conv_dst_tensor.raw_ptr = bundle.get(bundle.nr_workspace() - 1); + conv_dst_tensor.layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + conv_dst_tensor.layout.dtype); + } + conv_args.dst_tensor = &conv_dst_tensor; + conv_args.dst_layout = &conv_dst_tensor.layout; + } + + if (args.filter_meta.format == Param::Format::NHWC) { + exec_internal(conv_args); + } else { + // NCHW4 + exec_internal(conv_args); + } + handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, + &conv_dst_tensor, args.dst_tensor, + args.bias_tensor); +} + +bool ConvBiasForwardImpl::AlgoMatmul8x8x32::need_filter_reshape( + const SizeArgs& args) const { + // cublasGemmEx requires the stride of the filter matrix to be multiples + // of 4. + auto&& fm = args.filter_meta; + size_t ic; + if (args.filter_meta.format == Param::Format::NHWC) { + ic = args.src_layout->shape[3]; + } else { + // NCHW4 + ic = args.src_layout->shape[1] * 4; + } + return !(ic * fm.spatial[0] * fm.spatial[1] % 4 == 0); +} + +bool ConvBiasForwardImpl::AlgoMatmul8x8x32::need_src_unroll( + const SizeArgs& args) const { + // cublasGemmEx requires the stride of the unrolled src to be multiples + // of 4. + size_t stride; + if (args.filter_meta.format == Param::Format::NHWC) { + stride = args.src_layout->stride[2]; + } else { + // NCHW4 + stride = args.src_layout->shape[1] * 4; + } + + auto&& fm = args.filter_meta; + return !(fm.spatial[0] == 1 && fm.spatial[1] == 1 && fm.stride[0] == 1 && + fm.stride[1] == 1 && fm.padding[0] == 0 && fm.padding[1] == 0 && + stride % 4 == 0); +} +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp new file mode 100644 index 00000000..79ae71fc --- /dev/null +++ b/dnn/src/cuda/conv_bias/opr_impl.cpp @@ -0,0 +1,207 @@ +/** + * \file dnn/src/cuda/conv_bias/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
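need_src_unroll() decides whether the input must first be unrolled (im2col) into an (N*OH*OW) x LD int8 matrix before the GEMM call. A naive CPU reference of that unrolling, assuming NHWC layout, no dilation and no filter flip; the exact column order used by im2col_nhwc_int8 is not visible in this hunk, so the (fh, fw, ic) order below is an assumption.

#include <vector>
#include <cstdint>

// Unroll NHWC int8 input into rows of length LD (LD >= FH*FW*IC, padded to a multiple of 4).
void im2col_nhwc_ref(const int8_t* src, int8_t* unrolled, int N, int IH, int IW, int IC,
                     int OH, int OW, int FH, int FW, int PH, int PW, int SH, int SW, int LD) {
    for (int n = 0; n < N; ++n)
        for (int oh = 0; oh < OH; ++oh)
            for (int ow = 0; ow < OW; ++ow) {
                int8_t* row = unrolled + ((n * OH + oh) * OW + ow) * LD;
                int col = 0;
                for (int fh = 0; fh < FH; ++fh)
                    for (int fw = 0; fw < FW; ++fw)
                        for (int ic = 0; ic < IC; ++ic, ++col) {
                            int ih = oh * SH + fh - PH;
                            int iw = ow * SW + fw - PW;
                            bool inside = ih >= 0 && ih < IH && iw >= 0 && iw < IW;
                            row[col] = inside ? src[((n * IH + ih) * IW + iw) * IC + ic] : 0;
                        }
                for (; col < LD; ++col)  // zero the padding columns up to LD
                    row[col] = 0;
            }
}

int main() {
    const int N = 1, IH = 4, IW = 4, IC = 2, FH = 3, FW = 3, PH = 1, PW = 1, SH = 1, SW = 1;
    const int OH = (IH + 2 * PH - FH) / SH + 1, OW = (IW + 2 * PW - FW) / SW + 1;
    const int LD = (FH * FW * IC + 3) & ~3;
    std::vector<int8_t> src(N * IH * IW * IC, 1), unrolled(N * OH * OW * LD);
    im2col_nhwc_ref(src.data(), unrolled.data(), N, IH, IW, IC, OH, OW, FH, FW, PH, PW, SH, SW, LD);
    return 0;
}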
+ */ +#include "src/cuda/conv_bias/opr_impl.h" +#include "src/cuda/conv_bias/helper.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +#include "src/common/algo_chooser.h" + +#include "src/cuda/cudnn_with_check.h" + +namespace megdnn { +namespace cuda { + +void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_in bias, _megdnn_tensor_in z, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, + workspace.size); + AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace); + auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, + z.layout, dst.layout); + algo->check_workspace(args, workspace).exec(args); +}; + +std::vector +ConvBiasForwardImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst) { + return megdnn::get_all_algorithms( + {this, src, filter, bias, z, dst}); +} + +ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst, size_t workspace_limit_in_bytes, + bool reproducible) { + using namespace conv_bias; + AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; + auto dst_layout = *args.dst_layout; + if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { + dst_layout.dtype = DType(); + args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype, + args.filter_layout->dtype, + dst_layout.dtype); + } + auto conv_args = args; + + auto cudnn_conv_bias_act_from_enum_wrapper = + [this](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* { + return sm_algo_pack.cudnn_conv_bias_act_from_enum(algo); + }; + + auto cudnn_conv_from_enum_wrapper = + [this](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* { + return sm_algo_pack.cudnn_conv_from_enum(algo); + }; + + auto get_cudnn_algo = + [this, &conv_args, &args, workspace_limit_in_bytes, reproducible]( + const thin_function& + cb) -> AlgoBase* { + auto cudnn_handle = cuda::cudnn_handle(this->handle()); + CUDNNForwardDescs desc; + conv_args.init_conv_desc(desc); +#if CUDNN_MAJOR >= 7 + int max_count = 0; + cudnn_check(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, + &max_count)); + SmallVector algo_perf(max_count); + int ret_count = 0; + cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7( + cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc, + desc.conv_desc.conv_desc, desc.dst_desc.desc, max_count, + &ret_count, algo_perf.data())); + for (int i = 0; i < ret_count; ++i) { + auto conv_bias_algo = cb(algo_perf[i].algo); + if (conv_bias_algo->is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) + return conv_bias_algo; + } +#else + cudnnConvolutionFwdAlgo_t algo; + cudnn_check(cudnnGetConvolutionForwardAlgorithm( + cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc, + desc.conv_desc.conv_desc, desc.dst_desc.desc, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_in_bytes, &algo)); + + auto conv_bias_algo = cb(algo); + if (conv_bias_algo->is_available_reproducible(args, reproducible, + workspace_limit_in_bytes)) + return conv_bias_algo; +#endif + return nullptr; + }; + + auto get_1x1_algo = [workspace_limit_in_bytes, + reproducible](const AlgoBase::SizeArgs& size_arg) + -> ConvBiasForwardImpl::AlgoBase* { + if 
(sm_algo_pack.batched_matmul.is_available_reproducible( + size_arg, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.batched_matmul; + } else if (sm_algo_pack.a1x1.is_available_reproducible( + size_arg, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.a1x1; + } + return nullptr; + }; + + //! Prefer CUDNN CONVBIAS. + bool cudnn_conv_bias_act_supported = false; + for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) { + if (algo.is_available_reproducible(args, reproducible, + workspace_limit_in_bytes)) { + cudnn_conv_bias_act_supported = true; + break; + } + } + + if (cudnn_conv_bias_act_supported) { + if (auto algo = get_cudnn_algo(cudnn_conv_bias_act_from_enum_wrapper)) + return algo; + } + + if (args.filter_meta.group > 1) { +#if CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5) + // prefer special chanwise impl since as the group conv of cudnn whose + // version is lower than v7.5.0 is still slower than our implementation + // in many channel-wise cases + if (sm_algo_pack.chanwise.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) + return &sm_algo_pack.chanwise; + if (sm_algo_pack.chanwise8x8x32.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) + return &sm_algo_pack.chanwise8x8x32; +#endif + } + + if (auto algo = get_1x1_algo(args)) { + return algo; + } + + // modify conv_args dst_layout + conv_args.dst_layout = &dst_layout; + if (is_cudnn_supported(conv_args)) { + if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) + return algo; + } + + if (args.filter_meta.group > 1) { + auto orig_args = conv_args; + TensorLayout src, dst, bias; + AlgoGroupConvGeneral::modify_size_args(conv_args, src, dst, bias); + if (auto algo = get_1x1_algo(conv_args)) { + return sm_algo_pack.algo2gconv.at(algo); + } + if (is_cudnn_supported(conv_args)) { + if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) { + return sm_algo_pack.algo2gconv.at(algo); + } + } + conv_args = orig_args; + } + + if (reproducible) { + return megdnn::get_reproducible_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda convbias fwd"); + } else { + return megdnn::get_usable_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda convbias fwd"); + } +} + +const char* ConvBiasForwardImpl::get_algorithm_set_name() const { + return "CONV_BIAS_CUDA"; +} + +size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst) { + AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; + return get_algorithm(this, src, filter, bias, z, dst) + ->get_workspace_in_bytes(args); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h new file mode 100644 index 00000000..4efc46b3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -0,0 +1,72 @@ +/** + * \file dnn/src/cuda/conv_bias/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
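The heuristic above boils down to walking a preference-ordered candidate list (cuDNN conv-bias-activation, channel-wise kernels on older cuDNN, 1x1/batched-matmul, plain cuDNN conv, grouped wrappers, then the generic fallback) and returning the first algorithm that fits the workspace limit. A simplified sketch of that selection loop; Algo here is a hypothetical stand-in for AlgoBase, not the real interface.

#include <vector>
#include <cstddef>

struct Algo {
    bool reproducible;
    bool (*available)(size_t workspace_limit);
};

const Algo* pick_first_usable(const std::vector<const Algo*>& preference_order,
                              size_t workspace_limit, bool require_reproducible) {
    for (const Algo* a : preference_order) {
        if (require_reproducible && !a->reproducible)
            continue;                       // skip non-deterministic algorithms when asked to
        if (a->available(workspace_limit))
            return a;                       // first hit in preference order wins
    }
    return nullptr;                         // caller falls back to the "no usable algo" error path
}

static bool always_fits(size_t) { return true; }

int main() {
    Algo cudnn_convbias{true, always_fits};
    std::vector<const Algo*> order{&cudnn_convbias};
    const Algo* chosen = pick_first_usable(order, 1 << 20, /*require_reproducible=*/true);
    return chosen ? 0 : 1;
}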
+ */
+#pragma once
+#include "../elemwise/opr_impl.h"
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace cuda {
+
+class ConvBiasForwardImpl : public ConvBiasForward {
+public:
+    using ConvBiasForward::ConvBiasForward;
+    void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
+              _megdnn_tensor_in bias, _megdnn_tensor_in z,
+              _megdnn_tensor_out dst, _megdnn_workspace workspace) override;
+    std::vector<Algorithm*> get_all_algorithms(
+            const TensorLayout& src, const TensorLayout& filter,
+            const TensorLayout& bias, const TensorLayout& z,
+            const TensorLayout& dst) override;
+    Algorithm* get_algorithm_heuristic(const TensorLayout& src,
+                                       const TensorLayout& filter,
+                                       const TensorLayout& bias,
+                                       const TensorLayout& z,
+                                       const TensorLayout& dst,
+                                       size_t workspace_limit_in_bytes,
+                                       bool reproducible) override;
+    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&) override;
+
+    const char* get_algorithm_set_name() const override;
+
+    class AlgoBase;
+    class AlgoCUDNNConvBiasActivation;
+    class AlgoChanwise;
+    class AlgoChanwiseSmall;
+    class AlgoChanwise8x8x32;
+    class AlgoCUDNNConv;
+    class AlgoInplaceMatmul;
+    class AlgoMatmul;
+    class AlgoMatmul8x8x32;
+    class Algo1x1;
+    class AlgoBatchedMatmul;
+    class AlgoGroupConvGeneral;
+    class AlgoQUInt4x4x32WMMA;
+    class AlgoInt8CHWN4DotProdImplicitGemm;
+    class AlgoInt8NCHW4DotProdImplicitGemm;
+    class AlgoInt8CHWN4IMMAImplicitGemm;
+    class AlgoInt8NCHW4IMMAImplicitGemm;
+    class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
+    class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
+
+    class AlgoPack;
+
+    static const AlgoPack& algo_pack() { return sm_algo_pack; }
+
+private:
+    static AlgoPack sm_algo_pack;
+};
+
+} // namespace cuda
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp b/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp
new file mode 100644
index 00000000..1210f7eb
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp
@@ -0,0 +1,189 @@
+/**
+ * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ + +#include "./algo.h" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +#include "./quint4x4x32_wmma/activation_u4.cuh" +#include "./quint4x4x32_wmma/reduce_with_scale_data.cuh" +#include "./quint4x4x32_wmma/reduce_with_scale_filter.cuh" +#include "./quint4x4x32_wmma/wmma_conv_integer_u4.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace activation_u4; + +#if CUDA_VERSION >= 10000 +bool ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::is_available( + const SizeArgs& args) const { + if (args.z_layout->ndim > 0) + return false; + + bool available = true; + auto&& filter_meta = args.filter_meta; + // FH, FW must be 3, 5, 7 + available &= (filter_meta.spatial[0] == 3 && filter_meta.spatial[1] == 3) || + (filter_meta.spatial[0] == 5 && filter_meta.spatial[1] == 5) || + (filter_meta.spatial[0] == 7 && filter_meta.spatial[1] == 7); + // stride must be 1 + available &= (filter_meta.stride[0] == 1 && filter_meta.stride[1] == 1); + // OW must be a multiple of 8 + available &= (args.dst_layout->operator[](3) % 8 == 0); + // only support dense conv + auto&& param = args.opr->param(); + using Param = param::ConvBias; + available &= (param.sparse == Param::Sparse::DENSE); + // only support cross correlation convolution + available &= (!args.filter_meta.should_flip); + // dilate should be 1 + available &= (filter_meta.dilation[0] == 1 && filter_meta.dilation[1] == 1); + // format should be NCHW8 + available &= (param.format == Param::Format::NCHW8); + // device support sm_75 + auto&& device_prop = current_device_prop(); + available &= (device_prop.major > 7 || + (device_prop.major == 7 && device_prop.minor >= 5)); + // nonlinmode should be RELU or Identity + available &= param.nonlineMode == Param::NonlineMode::RELU || + param.nonlineMode == Param::NonlineMode::IDENTITY; + // IC should be a multiple of 32 + available &= (args.src_layout->operator[](1) * 8) % 32 == 0; + return available; +} + +WorkspaceBundle ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::get_workspace_bundle( + dt_byte* raw_ptr, const SizeArgs& args) const { + // ws_size_zp_filter = OC + size_t N = args.src_layout->operator[](0); + size_t OC = args.filter_layout->operator[](0), + IC = args.filter_layout->operator[](1) * 8, + FH = args.filter_layout->operator[](2), + FW = args.filter_layout->operator[](3); + size_t OH = args.dst_layout->operator[](2), + OW = args.dst_layout->operator[](3); + + size_t ws_size_zp_filter = OC * sizeof(int32_t); + // for reduce filter + { + size_t A = OC, B = IC * FH * FW / 8, C = 1; + ws_size_zp_filter += _do_dispatch_reduce_workspace_in_bytes(A, B, C); + } + size_t ws_size_zp_data = N * OH * OW * sizeof(int32_t); + size_t ws_size_relayout_filter = get_workspace_in_bytes_do_conv(args); + if (ws_size_relayout_filter > 0) { + WorkspaceBundle ws{ + raw_ptr, + {ws_size_zp_filter, ws_size_zp_data, ws_size_relayout_filter}}; + return ws; + } + WorkspaceBundle ws{raw_ptr, {ws_size_zp_filter, ws_size_zp_data}}; + return ws; +} + +size_t ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +bool ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::use_kernel_fhxfw( + const SizeArgs& args) const { + return (args.filter_meta.spatial[0] == 3 && + args.filter_meta.spatial[1] == 3); +} + +size_t ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::get_workspace_in_bytes_do_conv( + const SizeArgs& args) const { + if (use_kernel_fhxfw(args)) + return 0_z; + size_t OC = args.filter_layout->operator[](0), 
+ IC = args.filter_layout->operator[](1) * 8, + FH = args.filter_layout->operator[](2), + FW = args.filter_layout->operator[](3); + return OC * IC * FH * FW / 2; +} + +void ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA::exec( + const ExecArgs& args) const { + auto&& handle = concrete_handle(args.opr->handle()); + auto&& ws_bundle = get_workspace_bundle(args.workspace.raw_ptr, args); + auto&& ws_zp_filter = ws_bundle.get_workspace(0); + auto&& ws_zp_data = ws_bundle.get_workspace(1); + size_t N = args.src_layout->operator[](0), + IC = args.src_layout->operator[](1) * 8, + IH = args.src_layout->operator[](2), + IW = args.src_layout->operator[](3), + OC = args.filter_layout->operator[](0), + FH = args.filter_meta.spatial[0], FW = args.filter_meta.spatial[1], + OH = args.dst_layout->operator[](2), + OW = args.dst_layout->operator[](3), + PH = args.filter_meta.padding[0], PW = args.filter_meta.padding[1], + SH = args.filter_meta.stride[0], SW = args.filter_meta.stride[1]; + int32_t zp_data = + args.src_layout->dtype.param().zero_point; + int32_t zp_filter = + args.filter_layout->dtype.param() + .zero_point; + int32_t zp_data_filter = zp_data * zp_filter * FH * FW * IC; + auto&& stream = cuda_stream(handle); + // zp filter + _do_dispatch_reduce_with_scale_filter_u4( + static_cast(args.filter_tensor->raw_ptr), -zp_data, OC, + FH * FW * IC / 8, ws_zp_filter.ptr(), stream); + // zp data + _do_dispatch_reduce_with_scale_data_u4( + ws_zp_data.ptr(), + static_cast(args.src_tensor->raw_ptr), N, IH, IW, OH, OW, + PH, PW, FH, FW, SH, SW, IC, -zp_filter, + static_cast(zp_data), stream); + + // do conv + if (use_kernel_fhxfw(args)) { + wmma_conv_integer_subbyte::_do_wmma_conv_integer_subbyte_fhxfw( + static_cast(args.src_tensor->raw_ptr), + static_cast(args.filter_tensor->raw_ptr), + args.dst_tensor->compatible_ptr(), N, IH, IW, OH, OW, + PH, PW, IC, OC, FH, FW, SH, SW, static_cast(zp_data), + stream); + } else { + auto&& ws_relayout_filter = ws_bundle.get_workspace(2); + wmma_conv_integer_subbyte::_do_wmma_conv_integer_subbyte_1xfw( + static_cast(args.src_tensor->raw_ptr), + static_cast(args.filter_tensor->raw_ptr), + args.dst_tensor->compatible_ptr(), + ws_relayout_filter.ptr(), N, IH, IW, OH, OW, PH, PW, + IC, OC, FH, FW, SH, SW, static_cast(zp_data), stream); + } + // do activation + int s0 = args.bias_layout->stride[0], s1 = args.bias_layout->stride[1], + s2 = args.bias_layout->stride[2], s3 = args.bias_layout->stride[3]; + s0 = args.bias_layout->shape[0] == 1 ? 0 : s0; + s1 = args.bias_layout->shape[1] == 1 ? 0 : s1; + s2 = args.bias_layout->shape[2] == 1 ? 0 : s2; + s3 = args.bias_layout->shape[3] == 1 ? 
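The two reductions and the zp_data_filter constant prepared here are the cross terms of the zero-point expansion sum((qx - zx)(qw - zw)) = sum(qx*qw) - zw*sum(qx) - zx*sum(qw) + K*zx*zw with K = FH*FW*IC, which the activation epilogue later adds back onto the raw u4 accumulators. A tiny integer check of that identity; the values are arbitrary.

#include <cstdio>
#include <cstdlib>

int main() {
    const int K = 8, zx = 3, zw = 5;   // K taps, data/filter zero points (arbitrary)
    int qx[K], qw[K];
    for (int i = 0; i < K; ++i) { qx[i] = rand() % 16; qw[i] = rand() % 16; }

    long direct = 0, raw = 0, sum_qx = 0, sum_qw = 0;
    for (int i = 0; i < K; ++i) {
        direct += long(qx[i] - zx) * (qw[i] - zw);  // convolution on dequantized values
        raw    += long(qx[i]) * qw[i];              // what the raw u4 WMMA kernel accumulates
        sum_qx += qx[i];
        sum_qw += qw[i];
    }
    long corrected = raw - long(zw) * sum_qx - long(zx) * sum_qw + long(K) * zx * zw;
    printf("direct=%ld corrected=%ld\n", direct, corrected);  // the two values are identical
    return 0;
}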
0 : s3; + activation_u4::BiasVisitor visitor{ + args.bias_tensor->compatible_ptr(), s0, s1, s2, s3}; + auto&& param = args.opr->param(); + if (param.nonlineMode == Param::NonlineMode::RELU) { + _do_dispatch_activation_u4( + args.dst_tensor->compatible_ptr(), visitor, + ws_zp_data.ptr(), ws_zp_filter.ptr(), + zp_data_filter, N, OC, OH, OW, stream); + } else if (param.nonlineMode == Param::NonlineMode::IDENTITY) { + _do_dispatch_activation_u4( + args.dst_tensor->compatible_ptr(), visitor, + ws_zp_data.ptr(), ws_zp_filter.ptr(), + zp_data_filter, N, OC, OH, OW, stream); + } +} +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cpp b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cpp new file mode 100644 index 00000000..cdca5682 --- /dev/null +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cpp @@ -0,0 +1,72 @@ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + */ + +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "src/cuda/utils.h" +#include "src/cuda/query_blocksize.cuh" + +namespace megdnn { +namespace cuda { +namespace activation_u4 { +/* + * \note: The following code copied from TensorFlow. Used for calculating the + * Cuda 3D launch config to ensure maximize occupancy we should use for a kernel + * launch. 
+ */ +void get_launch_config(const void* kern, int dimx, int dimy, int dimz, + dim3& blocks, dim3& grids) { + auto config = + query_launch_config_for_kernel(reinterpret_cast(kern)); + int block_size = config.block_size; + int grid_size = config.grid_size; + auto&& device_prop = current_device_prop(); + int x_thread_limit = device_prop.maxThreadsDim[0]; + int y_thread_limit = device_prop.maxThreadsDim[1]; + int z_thread_limit = device_prop.maxThreadsDim[2]; + int x_grid_limit = device_prop.maxGridSize[0]; + int y_grid_limit = device_prop.maxGridSize[1]; + int z_grid_limit = device_prop.maxGridSize[2]; +#define MIN3(a, b, c) std::min({(a), (b), (c)}) + uint32_t blkx = MIN3(dimx, block_size, x_thread_limit); + uint32_t blky = + MIN3(dimy, std::max(block_size / (int)(blkx), 1), y_thread_limit); + uint32_t blkz = + MIN3(dimz, std::max(block_size / ((int)blkx * (int)blky), 1), + z_thread_limit); + uint32_t gridx = MIN3(grid_size, DIVUP((int)dimx, (int)blkx), x_grid_limit); + uint32_t gridy = MIN3(DIVUP(grid_size, (int)gridx), DIVUP(dimy, (int)blky), + y_grid_limit); + uint32_t gridz = MIN3(DIVUP(grid_size, (int)(gridx * gridy)), + DIVUP(dimz, (int)blkz), z_grid_limit); +#undef MIN3 + + grids = dim3{gridx, gridy, gridz}; + blocks = dim3{blkx, blky, blkz}; +} +} // namespace activation_u4 +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cu b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cu new file mode 100644 index 00000000..500c653b --- /dev/null +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cu @@ -0,0 +1,119 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
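get_launch_config() clamps the requested 3D extent by the occupancy-derived block/grid sizes and the device limits. A stand-alone restatement of that clamping with hard-coded limits so it runs on the host; in the real code block_size/grid_size come from the occupancy query and the limits from cudaDeviceProp.

#include <algorithm>
#include <cstdio>

#define DIVUP(a, b) (((a) + (b) - 1) / (b))

int main() {
    int dimx = 56, dimy = 56, dimz = 128;   // problem extents, e.g. OW, OH, N*OC/8 (assumed)
    int block_size = 256, grid_size = 80;   // assumed occupancy query results
    int x_thread_limit = 1024, y_thread_limit = 1024, z_thread_limit = 64;
    int x_grid_limit = 2147483647, y_grid_limit = 65535, z_grid_limit = 65535;

    int blkx = std::min({dimx, block_size, x_thread_limit});
    int blky = std::min({dimy, std::max(block_size / blkx, 1), y_thread_limit});
    int blkz = std::min({dimz, std::max(block_size / (blkx * blky), 1), z_thread_limit});
    int gridx = std::min({grid_size, DIVUP(dimx, blkx), x_grid_limit});
    int gridy = std::min({DIVUP(grid_size, gridx), DIVUP(dimy, blky), y_grid_limit});
    int gridz = std::min({DIVUP(grid_size, gridx * gridy), DIVUP(dimz, blkz), z_grid_limit});

    printf("block=(%d,%d,%d) grid=(%d,%d,%d)\n", blkx, blky, blkz, gridx, gridy, gridz);
    return 0;
}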
+ * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include +#include "./activation_u4.cuh" + +namespace megdnn { +namespace cuda { +using namespace activation_u4; + +namespace { + +__host__ __device__ __forceinline__ int4 operator+(int4 lval, int4 rval) { + return make_int4(lval.x + rval.x, lval.y + rval.y, lval.z + rval.z, + lval.w + rval.w); +} + +template +__global__ void kern_activation_u4(int32_t* dst, const int32_t* zp_data, + const int32_t* zp_filter, + int32_t zp_data_filter, int batch_size, + int OC, int OH, int OW, + BiasVisitor visitor) { + const int ow = blockIdx.x * blockDim.x + threadIdx.x; + const int oh = blockIdx.y * blockDim.y + threadIdx.y; + const int bc = blockIdx.z * blockDim.z + threadIdx.z; + constexpr int subbytes_per_pixel = 8; + constexpr int load_width = 4; + const int oc_blks = OC / subbytes_per_pixel; + const int batch = bc / oc_blks; + const int oc_blk = bc % oc_blks; + + int32_t* dptr = dst + batch * OC * OH * OW + + oc_blk * OH * OW * subbytes_per_pixel + + oh * OW * subbytes_per_pixel + ow * subbytes_per_pixel; + if (batch >= batch_size || oh >= OH || ow >= OW) + return; + int32_t zp_data_val = zp_data[batch * OH * OW + oh * OW + ow]; + int32_t scalar = zp_data_val + zp_data_filter; + int4 scalar4 = make_int4(scalar, scalar, scalar, scalar); +#pragma unroll + for (int i = 0; i < subbytes_per_pixel / load_width; i++) { + // do 128 bit load + int4 zp_filter_val = *reinterpret_cast( + zp_filter + oc_blk * subbytes_per_pixel + i * load_width); + int4 bias_val = *reinterpret_cast( + visitor.ptr(batch, oc_blk, oh, ow, i * load_width)); + int4 dst_val = *(reinterpret_cast(dptr)); + int4 ret = dst_val + zp_filter_val + bias_val + scalar4; + *(reinterpret_cast(dptr)) = ActivationOp::apply(ret); + dptr += load_width; + } +} + +} // namespace + +template +void _do_dispatch_activation_u4(int32_t* dst, BiasVisitor visitor, + const int32_t* zp_data, + const int32_t* zp_filter, + int32_t zp_data_filter, int batch_size, int co, + int ho, int wo, cudaStream_t stream) { + void (*fptr)(int32_t*, const int32_t*, const int32_t*, int32_t, int, int OC, + int, int, BiasVisitor) = kern_activation_u4; + dim3 grids{0, 0, 0}; + dim3 blocks{0, 0, 0}; + get_launch_config(reinterpret_cast(fptr), wo, ho, + batch_size * co / 8, blocks, grids); + kern_activation_u4<<>>( + dst, zp_data, zp_filter, zp_data_filter, batch_size, co, ho, wo, + visitor); + after_kernel_launch(); +} + +#define INST(_op) \ + template void _do_dispatch_activation_u4<_op>( \ + int32_t * dst, BiasVisitor visitor, const int32_t* zp_data, \ + const int32_t* zp_filter, int32_t zp_data_filter, int batch_size, \ + int co, int ho, int wo, cudaStream_t stream); + +INST(ActivationRELU); +INST(ActivationIdentity); + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cuh b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cuh new file mode 100644 index 00000000..6b0749dd --- /dev/null +++ 
b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cuh @@ -0,0 +1,95 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/activation_u4.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace activation_u4 { + +void get_launch_config(const void* kern, int dimx, int dimy, int dimz, + dim3& blocks, dim3& grids); + +struct BiasVisitor { + const int32_t* bias_ptr; + int batch_stride; + int channel_stride; + int height_stride; + int width_stride; +#ifdef MEGDNN_CC_CUDA + __host__ __device__ __forceinline__ const int32_t* ptr(int batch, + int oc_blk, int oh, + int ow, + int oc_remain) { + return bias_ptr + batch * batch_stride + oc_blk * channel_stride + + oh * height_stride + ow * width_stride + oc_remain; + } +#endif +}; + +struct ActivationRELU { +#ifdef MEGDNN_CC_CUDA + __host__ __device__ __forceinline__ static int4 apply(int4 in) { + int4 ret; + ret.x = in.x <= 0 ? 0 : in.x; + ret.y = in.y <= 0 ? 0 : in.y; + ret.z = in.z <= 0 ? 0 : in.z; + ret.w = in.w <= 0 ? 
0 : in.w; + return ret; + } +#endif +}; + +struct ActivationIdentity { +#ifdef MEGDNN_CC_CUDA + __host__ __device__ __forceinline__ static int4 apply(int4 in) { + return in; + } +#endif +}; +} // namespace activation_u4 + +template +void _do_dispatch_activation_u4(int32_t* dst, + activation_u4::BiasVisitor visitor, + const int32_t* zp_data, + const int32_t* zp_filter, + int32_t zp_data_filter, int batch_size, int co, + int ho, int wo, cudaStream_t stream); + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu new file mode 100644 index 00000000..1ee6d9f8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu @@ -0,0 +1,696 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./reduce_with_scale_data.cuh" +#include "./wmma_conv_integer_u4.cuh" +#include "src/cuda/cub/util_ptx.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace wmma_conv_integer_subbyte; + +namespace { + +template +struct TileCounter { + MEGDNN_STATIC_ASSERT(thread_blk_x % WARP_SIZE == 0, + "thread block size in dim x not divided by warpSize"); + static const size_t spatial_tile_x = thread_blk_x * pixels_per_thread_x; + static const size_t spatial_tile_y = thread_blk_y * pixels_per_thread_y; + static const size_t global_load_tile_x = + (spatial_tile_x - 1) * ConvConfig::SW + ConvConfig::FW; + static const size_t global_load_tile_y = + (spatial_tile_y - 1) * ConvConfig::SH + ConvConfig::FH; + static const size_t reg_cache_x = + (global_load_tile_x + WARP_SIZE - 1) / WARP_SIZE; + static const size_t warps_per_block = + (thread_blk_x * thread_blk_y) / WARP_SIZE; + static const size_t reg_cache_y = + (global_load_tile_y + warps_per_block - 1) / warps_per_block; + static const size_t smem_stride = + global_load_tile_x + (global_load_tile_x % 2 == 0); +}; + +template +__global__ void reduce_in_spatial_block_and_along_input_channel_with_scale_u4( + int32_t* __restrict__ dst, const uint8_t* __restrict__ src, int IC, + int IH, int IW, int OH, int OW, int PH, int PW, int32_t scale, + int32_t zero) { + typedef TileCounter + TileCounter_; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int oh_start = bidy * TileCounter_::spatial_tile_y; + const int ow_start = bidx * TileCounter_::spatial_tile_x; + const int ih_base = oh_start * ConvConfig_::SH - PH; + const int iw_base = ow_start * ConvConfig_::SW - PW; + const uint8_t* __restrict__ sptr = + src + bidz * IC * IH * IW / 2 + (ih_base * IW + iw_base) * 4; + + __shared__ uint8_t smem[TileCounter_::global_load_tile_y] + [TileCounter_::smem_stride * 4]; + uint32_t reg_cache[TileCounter_::reg_cache_y][TileCounter_::reg_cache_x]; + int32_t acc[pixels_per_thread_y][pixels_per_thread_x]; + int32_t* __restrict__ dptr = + dst + bidz * OH * OW + ow_start + oh_start * OW; + + const int tid = tidy * thread_blk_x + tidx; + const int idx_in_warp = tid % WARP_SIZE; + const int warp_id = tid / WARP_SIZE; + +#pragma unroll + for (int i = 0; i < pixels_per_thread_y; ++i) { +#pragma unroll + for (int j = 0; j < pixels_per_thread_x; ++j) { + acc[i][j] = 0; + } + } + +#pragma unroll + for (int i = 0; i < TileCounter_::reg_cache_y; ++i) { +#pragma unroll + for (int j = 0; j < TileCounter_::reg_cache_x; ++j) { + int iw = idx_in_warp + j * WARP_SIZE; + int ih = warp_id + i * TileCounter_::warps_per_block; + if (ih_base + ih >= 0 && ih_base + ih < IH && iw_base + iw >= 0 && + iw_base + iw < IW) { + reg_cache[i][j] = *(const uint32_t*)(&sptr[(ih * IW + iw) * 4]); + } else { + reg_cache[i][j] = zero; + } + } + } + +#pragma unroll + for (int i = 0; i < TileCounter_::reg_cache_y; ++i) { +#pragma unroll + for (int j = 0; j < TileCounter_::reg_cache_x; ++j) { + int x = idx_in_warp + j * WARP_SIZE; + int y = warp_id + i * TileCounter_::warps_per_block; + if (y < TileCounter_::global_load_tile_y && + x < TileCounter_::global_load_tile_x) { + *(uint32_t*)(&smem[y][x * 4]) = reg_cache[i][j]; + } + } + } + + __syncthreads(); + + const int ic_blks = (IC + 7) / 8; +#pragma unroll + for (int c = 0; c < ic_blks; ++c) { + sptr += IH * IW * 4; + if (c < ic_blks - 1) { +#pragma unroll + for (int i = 0; i < TileCounter_::reg_cache_y; ++i) { 
+#pragma unroll + for (int j = 0; j < TileCounter_::reg_cache_x; ++j) { + int iw = idx_in_warp + j * WARP_SIZE; + int ih = warp_id + i * TileCounter_::warps_per_block; + if (ih_base + ih >= 0 && ih_base + ih < IH && + iw_base + iw >= 0 && iw_base + iw < IW) { + reg_cache[i][j] = + *(const uint32_t*)(&sptr[(ih * IW + iw) * 4]); + } else { + reg_cache[i][j] = zero; + } + } + } + } + +#pragma unroll + for (int i = 0; i < pixels_per_thread_y; ++i) { +#pragma unroll + for (int j = 0; j < pixels_per_thread_x; ++j) { + int x = (j * thread_blk_x + tidx) * ConvConfig_::SW; + int y = (i * thread_blk_y + tidy) * ConvConfig_::SH; +#pragma unroll + for (int fh = 0; fh < ConvConfig_::FH; ++fh) { +#pragma unroll + for (int fw = 0; fw < ConvConfig_::FW; ++fw) { + uint32_t sdata = + *(uint32_t*)(&smem[y + fh][(x + fw) * 4]); +#pragma unroll + for (int r = 0; r < 8; r++) { + uint8_t val = (sdata & 0xF); + acc[i][j] += val; + sdata >>= 4; + } + } + } + } + } + + if (c < ic_blks - 1) { + __syncthreads(); +#pragma unroll + for (int i = 0; i < TileCounter_::reg_cache_y; ++i) { +#pragma unroll + for (int j = 0; j < TileCounter_::reg_cache_x; ++j) { + int x = idx_in_warp + j * WARP_SIZE; + int y = warp_id + i * TileCounter_::warps_per_block; + if (y < TileCounter_::global_load_tile_y && + x < TileCounter_::global_load_tile_x) { + *(uint32_t*)(&smem[y][x * 4]) = reg_cache[i][j]; + } + } + } + __syncthreads(); + } + } + +#pragma unroll + for (int i = 0; i < pixels_per_thread_y; ++i) { +#pragma unroll + for (int j = 0; j < pixels_per_thread_x; ++j) { + int x = j * thread_blk_x + tidx; + int y = i * thread_blk_y + tidy; + if (oh_start + y < OH && ow_start + x < OW) { + dptr[y * OW + x] = acc[i][j] * scale; + } + } + } +} + +template +struct LargeChannelTileCounter { + static const size_t spatial_tile_x = thread_blk_x * pixels_per_thread_x; + static const size_t spatial_tile_y = pixels_per_thread_y; + static const size_t global_load_tile_x = + (spatial_tile_x - 1) * ConvConfig::SW + ConvConfig::FW; + static const size_t global_load_tile_y = + (spatial_tile_y - 1) * ConvConfig::SH + ConvConfig::FH; + static const size_t reg_cache_x = + (global_load_tile_x + WARP_SIZE - 1) / WARP_SIZE; + static const size_t warps_per_block = + (thread_blk_x * thread_blk_y) / WARP_SIZE; + static const size_t reg_cache_y = + (global_load_tile_y * thread_blk_y + warps_per_block - 1) / + warps_per_block; + static const size_t smem_stride = + global_load_tile_x + (global_load_tile_x % 2 == 0); + static const size_t reduce_dim_0 = thread_blk_y; + static const size_t reduce_dim_1 = pixels_per_thread_y; + static const size_t reduce_dim_2 = thread_blk_x * pixels_per_thread_x; +}; + +template +__global__ void +reduce_in_spatial_block_and_along_input_channel_with_scale_u4_large_channels( + int32_t* __restrict__ dst, const uint8_t* __restrict__ src, int IC, + int IH, int IW, int OH, int OW, int PH, int PW, int32_t scale, + int32_t zero) { + typedef LargeChannelTileCounter + TileCounter_; + + const int bidx = blockIdx.x; + const int bidz = blockIdx.z; + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int blocks_per_row = (OW + TileCounter_::spatial_tile_x - 1) / + TileCounter_::spatial_tile_x; + const int bidw = bidx % blocks_per_row; + const int bidh = bidx / blocks_per_row; + + const int oh_start = bidh * TileCounter_::spatial_tile_y; + const int ow_start = bidw * TileCounter_::spatial_tile_x; + const int ih_base = oh_start * ConvConfig_::SH - PH; + const int iw_base = ow_start * ConvConfig_::SW - PW; + const uint8_t* 
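The accumulation above walks eight 4-bit activations packed into one uint32_t, and out-of-bounds pixels are filled with the zero point replicated into every nibble. A host-side reference of both pieces, kept separate from the tiling logic so the arithmetic is easy to check.

#include <cstdint>
#include <cstdio>

static int32_t sum_packed_u4(uint32_t word) {
    int32_t acc = 0;
    for (int r = 0; r < 8; ++r) {      // eight 4-bit lanes per 32-bit word
        acc += word & 0xF;
        word >>= 4;
    }
    return acc;
}

static uint32_t replicate_zero_point(uint8_t zp) {
    zp = uint8_t((zp << 4) | zp);      // duplicate the nibble within a byte
    return (uint32_t(zp) << 24) | (uint32_t(zp) << 16) | (uint32_t(zp) << 8) | zp;
}

int main() {
    printf("%d\n", sum_packed_u4(0x12345678u));              // 1+2+...+8 = 36
    printf("%d\n", sum_packed_u4(replicate_zero_point(7)));  // 8 * 7 = 56, the padded-pixel sum
    return 0;
}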
__restrict__ sptr = + src + bidz * IC * IH * IW / 2 + (ih_base * IW + iw_base) * 4; + + __shared__ uint8_t smem[thread_blk_y][TileCounter_::global_load_tile_y] + [TileCounter_::smem_stride * 4]; + __shared__ int32_t + s_reduce[TileCounter_::reduce_dim_0][TileCounter_::reduce_dim_1] + [TileCounter_::reduce_dim_2 + 1]; + uint32_t reg_cache[TileCounter_::reg_cache_y][TileCounter_::reg_cache_x]; + int32_t acc[pixels_per_thread_y][pixels_per_thread_x]; + + int32_t* __restrict__ dptr = + dst + bidz * OH * OW + ow_start + oh_start * OW; + + const int tid = tidy * thread_blk_x + tidx; + const int idx_in_warp = tid % WARP_SIZE; + const int warp_id = tid / WARP_SIZE; + const int ic_blks = IC / 8; + +#pragma unroll + for (int i = 0; i < pixels_per_thread_y; ++i) { +#pragma unroll + for (int j = 0; j < pixels_per_thread_x; ++j) { + acc[i][j] = 0; + } + } + +#pragma unroll + for (int i = 0; i < TileCounter_::reg_cache_y; ++i) { +#pragma unroll + for (int j = 0; j < TileCounter_::reg_cache_x; ++j) { + int iw = idx_in_warp + j * WARP_SIZE; + int hc = warp_id + i * TileCounter_::warps_per_block; + int ih = hc % TileCounter_::global_load_tile_y; + int ic_blk = hc / TileCounter_::global_load_tile_y; + if (ih_base + ih >= 0 && ih_base + ih < IH && iw_base + iw >= 0 && + iw_base + iw < IW) { + reg_cache[i][j] = 0; + if (ic_blk < ic_blks) + reg_cache[i][j] = + *(const uint32_t*)(&sptr[(ic_blk * IH * IW + + ih * IW + iw) * + 4]); + } else { + reg_cache[i][j] = (ic_blk < ic_blks) ? zero : 0; + } + } + } + +#pragma unroll + for (int i = 0; i < TileCounter_::reg_cache_y; ++i) { +#pragma unroll + for (int j = 0; j < TileCounter_::reg_cache_x; ++j) { + int x = idx_in_warp + j * WARP_SIZE; + int hc = warp_id + i * TileCounter_::warps_per_block; + int ih = hc % TileCounter_::global_load_tile_y; + int ic_blk = hc / TileCounter_::global_load_tile_y; + if (ic_blk < thread_blk_y && x < TileCounter_::global_load_tile_x) { + *(uint32_t*)(&smem[ic_blk][ih][x * 4]) = reg_cache[i][j]; + } + } + } + + __syncthreads(); + + int blks = (ic_blks + thread_blk_y - 1) / thread_blk_y; +#pragma unroll + for (int c = 0; c < blks; ++c) { + sptr += IH * IW * thread_blk_y * 4; + if (c < blks - 1) { +#pragma unroll + for (int i = 0; i < TileCounter_::reg_cache_y; ++i) { +#pragma unroll + for (int j = 0; j < TileCounter_::reg_cache_x; ++j) { + int iw = idx_in_warp + j * WARP_SIZE; + int hc = warp_id + i * TileCounter_::warps_per_block; + int ih = hc % TileCounter_::global_load_tile_y; + int ic_blk = hc / TileCounter_::global_load_tile_y; + int g_ic_blk = ic_blk + c * thread_blk_y; + if (ih_base + ih >= 0 && ih_base + ih < IH && + iw_base + iw >= 0 && iw_base + iw < IW) { + reg_cache[i][j] = 0; + if (g_ic_blk < ic_blks) + reg_cache[i][j] = + *(const uint32_t*)(&sptr[(ic_blk * IH * IW + + ih * IW + iw) * + 4]); + } else { + reg_cache[i][j] = (g_ic_blk < ic_blks) ? 
zero : 0; + } + } + } + } + +#pragma unroll + for (int i = 0; i < pixels_per_thread_y; ++i) { +#pragma unroll + for (int j = 0; j < pixels_per_thread_x; ++j) { + int x = (j * thread_blk_x + tidx) * ConvConfig_::SW; + int y = i * ConvConfig_::SH; +#pragma unroll + for (int fh = 0; fh < ConvConfig_::FH; ++fh) { +#pragma unroll + for (int fw = 0; fw < ConvConfig_::FW; ++fw) { + uint32_t sdata = + *(uint32_t*)(&smem[tidy][y + fh][(x + fw) * 4]); +#pragma unroll + for (int r = 0; r < 8; r++) { + uint8_t val = (sdata & 0xF); + acc[i][j] += val; + sdata >>= 4; + } + } + } + } + } + + if (c < blks - 1) { + __syncthreads(); +#pragma unroll + for (int i = 0; i < TileCounter_::reg_cache_y; ++i) { +#pragma unroll + for (int j = 0; j < TileCounter_::reg_cache_x; ++j) { + int x = idx_in_warp + j * WARP_SIZE; + int hc = warp_id + i * TileCounter_::warps_per_block; + int ih = hc % TileCounter_::global_load_tile_y; + int ic_blk = hc / TileCounter_::global_load_tile_y; + if (ic_blk < thread_blk_y && + x < TileCounter_::global_load_tile_x) { + *(uint32_t*)(&smem[ic_blk][ih][x * 4]) = + reg_cache[i][j]; + } + } + } + __syncthreads(); + } + } + +#pragma unroll + for (int i = 0; i < pixels_per_thread_y; ++i) { +#pragma unroll + for (int j = 0; j < pixels_per_thread_x; ++j) { + s_reduce[tidy][i][tidx + j * thread_blk_x] = acc[i][j]; + } + } + + const int nr_ty_per_warp = WARP_SIZE / thread_blk_x; +#pragma unroll + for (int k = (thread_blk_y >> 1); k; k >>= 1) { + if (k >= nr_ty_per_warp) { + __syncthreads(); + } else { + cub::WARP_SYNC(0xffffffff); + } + if (tidy < k) { +#pragma unroll + for (int i = 0; i < pixels_per_thread_y; ++i) { +#pragma unroll + for (int j = 0; j < pixels_per_thread_x; ++j) { + s_reduce[tidy][i][tidx + j * thread_blk_x] += + s_reduce[tidy + k][i][tidx + j * thread_blk_x]; + } + } + } + } + + if (tidy == 0) { +#pragma unroll + for (int i = 0; i < pixels_per_thread_y; ++i) { +#pragma unroll + for (int j = 0; j < pixels_per_thread_x; ++j) { + int x = j * thread_blk_x + tidx; + int y = i; + if (oh_start + y < OH && ow_start + x < OW) { + dptr[y * OW + x] = + s_reduce[0][i][tidx + j * thread_blk_x] * scale; + } + } + } + } +} + +} // namespace + +void megdnn::cuda::_do_dispatch_reduce_with_scale_data_u4( + int32_t* dst, const uint8_t* src, int batch_size, int ih, int iw, + int oh, int ow, int ph, int pw, int fh, int fw, int sh, int sw, int ic, + int32_t scale, uint8_t zp_data, cudaStream_t stream) { + zp_data = (zp_data << 4) | zp_data; + int32_t zero = (zp_data << 24) | (zp_data << 16) | (zp_data << 8) | zp_data; + if (fh == 3 && fw == 3 && sh == 1 && sw == 1) { + typedef ConvConfig<3, 3, 1, 1> ConvConfig_; + if (ic <= 32 && iw >= 128) { + constexpr size_t thread_blk_x_ = WARP_SIZE; + constexpr size_t thread_blk_y_ = 2; + constexpr size_t pixels_per_thread_x_ = 4; + constexpr size_t pixels_per_thread_y_ = 2; + + typedef TileCounter + TileCounter_; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (ow + TileCounter_::spatial_tile_x - 1) / + TileCounter_::spatial_tile_x; + int blocks_per_col = (oh + TileCounter_::spatial_tile_y - 1) / + TileCounter_::spatial_tile_y; + blockDim.x = thread_blk_x_; + blockDim.y = thread_blk_y_; + gridDim.x = blocks_per_row; + gridDim.y = blocks_per_col; + gridDim.z = batch_size; + + reduce_in_spatial_block_and_along_input_channel_with_scale_u4< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + <<>>(dst, src, ic, ih, iw, oh, + ow, ph, pw, scale, zero); + } else { + if (iw <= 32) { + constexpr size_t thread_blk_x_ = 
WARP_SIZE / 2; + constexpr size_t thread_blk_y_ = 8; + constexpr size_t pixels_per_thread_x_ = 1; + constexpr size_t pixels_per_thread_y_ = 4; + + typedef LargeChannelTileCounter< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + TileCounter_; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (ow + TileCounter_::spatial_tile_x - 1) / + TileCounter_::spatial_tile_x; + int blocks_per_col = (oh + TileCounter_::spatial_tile_y - 1) / + TileCounter_::spatial_tile_y; + blockDim.x = thread_blk_x_; + blockDim.y = thread_blk_y_; + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = 1; + gridDim.z = batch_size; + + reduce_in_spatial_block_and_along_input_channel_with_scale_u4_large_channels< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + <<>>(dst, src, ic, ih, iw, + oh, ow, ph, pw, + scale, zero); + } else { + constexpr size_t thread_blk_x_ = WARP_SIZE / 2; + constexpr size_t thread_blk_y_ = 4; + constexpr size_t pixels_per_thread_x_ = 4; + constexpr size_t pixels_per_thread_y_ = 4; + + typedef LargeChannelTileCounter< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + TileCounter_; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (ow + TileCounter_::spatial_tile_x - 1) / + TileCounter_::spatial_tile_x; + int blocks_per_col = (oh + TileCounter_::spatial_tile_y - 1) / + TileCounter_::spatial_tile_y; + blockDim.x = thread_blk_x_; + blockDim.y = thread_blk_y_; + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = 1; + gridDim.z = batch_size; + + reduce_in_spatial_block_and_along_input_channel_with_scale_u4_large_channels< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + <<>>(dst, src, ic, ih, iw, + oh, ow, ph, pw, + scale, zero); + } + } + } else if (fh == 5 && fw == 5 && sh == 1 && sw == 1) { + typedef ConvConfig<5, 5, 1, 1> ConvConfig_; + if (ic <= 32 && iw >= 128) { + constexpr size_t thread_blk_x_ = WARP_SIZE; + constexpr size_t thread_blk_y_ = 2; + constexpr size_t pixels_per_thread_x_ = 4; + constexpr size_t pixels_per_thread_y_ = 2; + + typedef TileCounter + TileCounter_; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (ow + TileCounter_::spatial_tile_x - 1) / + TileCounter_::spatial_tile_x; + int blocks_per_col = (oh + TileCounter_::spatial_tile_y - 1) / + TileCounter_::spatial_tile_y; + blockDim.x = thread_blk_x_; + blockDim.y = thread_blk_y_; + gridDim.x = blocks_per_row; + gridDim.y = blocks_per_col; + gridDim.z = batch_size; + + reduce_in_spatial_block_and_along_input_channel_with_scale_u4< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + <<>>(dst, src, ic, ih, iw, oh, + ow, ph, pw, scale, zero); + } else { + if (iw <= 32) { + constexpr size_t thread_blk_x_ = WARP_SIZE / 2; + constexpr size_t thread_blk_y_ = 8; + constexpr size_t pixels_per_thread_x_ = 1; + constexpr size_t pixels_per_thread_y_ = 4; + + typedef LargeChannelTileCounter< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + TileCounter_; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (ow + TileCounter_::spatial_tile_x - 1) / + TileCounter_::spatial_tile_x; + int blocks_per_col = (oh + TileCounter_::spatial_tile_y - 1) / + TileCounter_::spatial_tile_y; + blockDim.x = thread_blk_x_; + blockDim.y = thread_blk_y_; + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = 1; + gridDim.z = batch_size; + + 
reduce_in_spatial_block_and_along_input_channel_with_scale_u4_large_channels< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + <<>>(dst, src, ic, ih, iw, + oh, ow, ph, pw, + scale, zero); + + } else { + constexpr size_t thread_blk_x_ = WARP_SIZE / 2; + constexpr size_t thread_blk_y_ = 4; + constexpr size_t pixels_per_thread_x_ = 4; + constexpr size_t pixels_per_thread_y_ = 4; + + typedef LargeChannelTileCounter< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + TileCounter_; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (ow + TileCounter_::spatial_tile_x - 1) / + TileCounter_::spatial_tile_x; + int blocks_per_col = (oh + TileCounter_::spatial_tile_y - 1) / + TileCounter_::spatial_tile_y; + blockDim.x = thread_blk_x_; + blockDim.y = thread_blk_y_; + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = 1; + gridDim.z = batch_size; + + reduce_in_spatial_block_and_along_input_channel_with_scale_u4_large_channels< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + <<>>(dst, src, ic, ih, iw, + oh, ow, ph, pw, + scale, zero); + } + } + } else if (fh == 7 && fw == 7 && sh == 1 && sw == 1) { + typedef ConvConfig<7, 7, 1, 1> ConvConfig_; + if (ic <= 32 && iw >= 128) { + constexpr size_t thread_blk_x_ = WARP_SIZE; + constexpr size_t thread_blk_y_ = 2; + constexpr size_t pixels_per_thread_x_ = 4; + constexpr size_t pixels_per_thread_y_ = 2; + + typedef TileCounter + TileCounter_; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (ow + TileCounter_::spatial_tile_x - 1) / + TileCounter_::spatial_tile_x; + int blocks_per_col = (oh + TileCounter_::spatial_tile_y - 1) / + TileCounter_::spatial_tile_y; + blockDim.x = thread_blk_x_; + blockDim.y = thread_blk_y_; + gridDim.x = blocks_per_row; + gridDim.y = blocks_per_col; + gridDim.z = batch_size; + + reduce_in_spatial_block_and_along_input_channel_with_scale_u4< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + <<>>(dst, src, ic, ih, iw, oh, + ow, ph, pw, scale, zero); + } else { + constexpr size_t thread_blk_x_ = WARP_SIZE / 2; + constexpr size_t thread_blk_y_ = 8; + constexpr size_t pixels_per_thread_x_ = 1; + constexpr size_t pixels_per_thread_y_ = 4; + + typedef LargeChannelTileCounter + TileCounter_; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (ow + TileCounter_::spatial_tile_x - 1) / + TileCounter_::spatial_tile_x; + int blocks_per_col = (oh + TileCounter_::spatial_tile_y - 1) / + TileCounter_::spatial_tile_y; + blockDim.x = thread_blk_x_; + blockDim.y = thread_blk_y_; + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = 1; + gridDim.z = batch_size; + + reduce_in_spatial_block_and_along_input_channel_with_scale_u4_large_channels< + ConvConfig_, thread_blk_x_, thread_blk_y_, + pixels_per_thread_x_, pixels_per_thread_y_> + <<>>(dst, src, ic, ih, iw, oh, + ow, ph, pw, scale, zero); + } + } + after_kernel_launch(); +} + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cuh b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cuh new file mode 100644 index 00000000..462f5af4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cuh @@ -0,0 +1,47 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +void _do_dispatch_reduce_with_scale_data_u4( + int32_t* dst, const uint8_t* src, int batch_size, int ih, int iw, + int oh, int ow, int ph, int pw, int fh, int fw, int sh, int sw, int ic, + int32_t scale, uint8_t zp_data, cudaStream_t stream); +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cu b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cu new file mode 100644 index 00000000..e307dfe5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cu @@ -0,0 +1,100 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./reduce_with_scale_filter.cuh" +#include "src/cuda/reduce_helper.cuh" + +using namespace megdnn; +using namespace cuda; + +namespace { + +struct ReduceWithScaleUInt4Op { + typedef int32_t wtype; + const uint8_t* src; + int32_t* dst; + int32_t scale; + static const wtype INIT = 0; + +#if MEGDNN_CC_CUDA + __host__ __device__ void write(uint32_t idx, wtype val) { + dst[idx] = val * scale; + } + + __host__ __device__ static wtype apply(wtype a, wtype b) { return a + b; } + + __device__ wtype read(uint32_t idx) { + constexpr uint32_t subbytes_per_pixel = 8; + const uint32_t* sptr = + (const uint32_t*)(src + subbytes_per_pixel * idx / 2); + uint32_t val = *sptr; + int32_t ret = 0; +#pragma unroll + for (int j = 0; j < 8; j++) { + uint8_t cur = (val & 0xF); + ret += cur; + val = (val >> 4); + } + return ret; + } +#endif +}; + +} // namespace + +void megdnn::cuda::_do_dispatch_reduce_with_scale_filter_u4( + const uint8_t* src, int32_t scale, uint32_t rows, uint32_t cols, + int32_t* dst, cudaStream_t stream) { + // rows = OC + // cols is measured in pixels, i.e. IC * FH * FW / 8, a pixel consists of 8 + // subbyte data, + ReduceWithScaleUInt4Op op; + op.src = src; + op.scale = scale; + op.dst = dst; + static_cast(op); + static_cast(stream); + static_cast(rows); + static_cast(cols); + run_reduce(dst + rows, rows, cols, 1, stream, + op); +} + +size_t megdnn::cuda::_do_dispatch_reduce_workspace_in_bytes(size_t A, size_t B, + size_t C) { + return get_reduce_workspace_in_bytes(A, B, C); +} + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cuh b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cuh new file mode 100644 index 00000000..6781da2f --- /dev/null +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cuh @@ -0,0 +1,48 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_filter.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +void _do_dispatch_reduce_with_scale_filter_u4(const uint8_t* src, int32_t scale, + uint32_t rows, uint32_t cols, + int32_t* dst, + cudaStream_t stream); +size_t _do_dispatch_reduce_workspace_in_bytes(size_t A, size_t B, size_t C); +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4.cuh b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4.cuh new file mode 100644 index 00000000..88c58db6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4.cuh @@ -0,0 +1,81 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once +#include +#if CUDA_VERSION >= 10000 +#include +#endif + +namespace megdnn { +namespace cuda { +namespace wmma_conv_integer_subbyte { + +constexpr size_t WARP_SIZE = 32; +constexpr size_t WMMA_M = 8; +constexpr size_t WMMA_N = 8; +constexpr size_t WMMA_K = 32; +constexpr size_t IC_BLK = WMMA_K / 8; +constexpr size_t SKEW = 32; + +template +struct ConvConfig { + static int const FH = FH_; + static int const FW = FW_; + static int const SH = SH_; + static int const SW = SW_; +}; + +void _do_wmma_conv_integer_subbyte_1xfw(const uint8_t* d_data, + const uint8_t* d_filter, int32_t* d_out, + uint8_t* workspace, int batch_size, + int hi, int wi, int ho, int wo, int ph, + int pw, int ci, int co, int fh, int fw, + int sh, int sw, uint8_t zp_data, + cudaStream_t stream); + +void _do_wmma_conv_integer_subbyte_fhxfw(const uint8_t* d_data, + const uint8_t* d_filter, + int32_t* d_out, int batch_size, int hi, + int wi, int ho, int wo, int ph, int pw, + int ci, int co, int fh, int fw, int sh, + int sw, uint8_t zp_data, + cudaStream_t stream); + +} // namespace wmma_conv_integer_subbyte +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4_1xfw.cu b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4_1xfw.cu new file mode 100644 index 00000000..cf9a68ab --- /dev/null +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4_1xfw.cu @@ -0,0 +1,677 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4_1xfw.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/utils.cuh" +#include "wmma_conv_integer_u4.cuh" + +#if __CUDA_ARCH__ >= 730 +using namespace nvcuda; +using namespace wmma::experimental::precision; +#endif + +using namespace megdnn; +using namespace cuda; +using namespace wmma_conv_integer_subbyte; + +namespace wmma_conv_integer_subbyte_1xfw { + +template +struct BlockConfig { + static int const WARPS_W = WARPS_W_; + static int const WARPS_OC = WARPS_OC_; + static int const OUT_CHANNELS_PER_WARP = OUT_CHANNELS_PER_WARP_; + static int const OH_PER_WARP = OH_PER_WARP_; + static int const IC_UNROLL_SIZE = IC_UNROLL_SIZE_; + static int const IC_BLKS = IC_BLK * IC_UNROLL_SIZE; + static int const WARPS_PER_BLOCK = WARPS_W * WARPS_OC; +}; + +template +struct DataCount { + static int const LANE_SIZE = + BlockConfig::WARPS_W * WMMA_M * ConvConfig::SW + ConvConfig::FW - 1; + static int const LANES_PER_SLICE = BlockConfig::OH_PER_WARP; + static int const LANES_PER_BLOCK = + LANES_PER_SLICE * IC_BLK * BlockConfig::IC_UNROLL_SIZE; + static int const LANES_PER_WARP = + (LANES_PER_BLOCK + BlockConfig::WARPS_PER_BLOCK - 1) / + BlockConfig::WARPS_PER_BLOCK; + static int const SMEM_SKEW = (BlockConfig::IC_UNROLL_SIZE % 2 == 0) * SKEW; + static int const SMEM_DATA_COL = (BlockConfig::IC_BLKS * 8 + SMEM_SKEW) / 2; + static int const SMEM_DATA_STRIDE = SMEM_DATA_COL * 2; + static int const SMEM_DATA_ROW = LANE_SIZE * LANES_PER_SLICE; +}; + +template +struct FilterCount { + static int const OUT_CHANNELS_PER_BLOCK = + WMMA_M * BlockConfig::WARPS_OC * BlockConfig::OUT_CHANNELS_PER_WARP; + static int const SMEM_FILTER_ROW = OUT_CHANNELS_PER_BLOCK; + static int const SMEM_SKEW = + ((ConvConfig::FW * BlockConfig::IC_UNROLL_SIZE) % 2 == 0) * SKEW; + static int const SMEM_FILTER_COL = + (BlockConfig::IC_BLKS * 
ConvConfig::FW * 8 + SMEM_SKEW) / 2; + static int const SMEM_FILTER_STRIDE = SMEM_FILTER_COL * 2; + static int const REG_FILTER_ROW = + (SMEM_FILTER_ROW + BlockConfig::WARPS_PER_BLOCK - 1) / + BlockConfig::WARPS_PER_BLOCK; + static int const REG_FILTER_COL = + (BlockConfig::IC_BLKS * ConvConfig::FW + WARP_SIZE - 1) / WARP_SIZE; +}; + +#if __CUDA_ARCH__ >= 730 +template +struct ConvDataGlobal2ShareMemVisitor { + typedef int32_t copy_t; + uint8_t* smem; + const uint8_t* g_ptr; + + int ci_stride, hi_stride; + int IH, IW; + int b_ih, b_iw; + copy_t zero; + int idx; + const int warp_x = threadIdx.x / WARP_SIZE; + const int warp_y = threadIdx.y; + const int tid_in_warp = threadIdx.x % WARP_SIZE; + const int warp_id = (warp_y * BlockConfig_::WARPS_W + warp_x); + + copy_t reg_cache[DataCount::LANES_PER_WARP]; + + __device__ ConvDataGlobal2ShareMemVisitor(uint8_t* smem, + const uint8_t* g_ptr, int IH, + int IW, int b_ih, int b_iw, + copy_t zero) + : smem{smem}, + g_ptr{g_ptr}, + IH{IH}, + IW{IW}, + b_ih{b_ih}, + b_iw{b_iw}, + zero{zero} { + ci_stride = 8 * IH * IW; + hi_stride = 8 * IW; + idx = 0; + } + + // not perfectly + __device__ __forceinline__ void copy() { + typedef DataCount DataCount_; + int col = (tid_in_warp << 3); + int b_ih_base = b_ih + (idx % ConvConfig_::FH); +#pragma unroll + for (int i = 0; i < DataCount_::LANES_PER_WARP; ++i) { + int row = i * BlockConfig_::WARPS_PER_BLOCK + warp_id; + int ci_idx = row / DataCount_::LANES_PER_SLICE; + int hi_idx = row - ci_idx * DataCount_::LANES_PER_SLICE; + if (idx % ConvConfig_::FH != 0 && + hi_idx < BlockConfig_::OH_PER_WARP - 1) { + int y = (hi_idx + + 1) * DataCount::LANE_SIZE + + tid_in_warp; + int x = ci_idx * 8; + if (tid_in_warp < DataCount_::LANE_SIZE) + reg_cache[i] = *(copy_t*)(get_smem_ptr(y, x)); + } else { + bool cond = ((b_iw + tid_in_warp) >= 0) && + ((b_iw + tid_in_warp) < IW) && + ((b_ih_base + hi_idx) >= 0) && + ((b_ih_base + hi_idx) < IH); + if (cond) { + copy_t val = *(copy_t*)(&g_ptr[(ci_idx * ci_stride + + hi_idx * hi_stride + col) / + 2]); + reg_cache[i] = val; + } else { + reg_cache[i] = zero; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; + i < DataCount::LANES_PER_WARP; ++i) { + if (tid_in_warp < DataCount::LANE_SIZE) { + int row = i * BlockConfig_::WARPS_PER_BLOCK + warp_id; + int ci_idx = + row / + DataCount::LANES_PER_SLICE; + int hi_idx = + row - ci_idx * DataCount::LANES_PER_SLICE; + int y = hi_idx * DataCount::LANE_SIZE + + tid_in_warp; + int x = ci_idx * 8; + *(copy_t*)(get_smem_ptr(y, x)) = reg_cache[i]; + } + } + } + + __device__ __forceinline__ uint8_t* get_smem_ptr(int y, int x) { + return &smem[(y * DataCount::SMEM_DATA_STRIDE + + x) / + 2]; + } + + __device__ __forceinline__ void inc_stage() { + idx++; + g_ptr += idx % ConvConfig_::FH == 0 + ? 
(BlockConfig_::IC_BLKS * ci_stride - + (ConvConfig_::FH - 1) * hi_stride) / + 2 + : hi_stride / 2; + } +}; + +template +struct ConvFilterGlobal2ShareMemVisitor { + uint8_t* smem; + const uint8_t* g_ptr; + + int co_stride, co_remain; + const int warp_x = threadIdx.x / WARP_SIZE; + const int warp_y = threadIdx.y; + const int tid_in_warp = threadIdx.x % WARP_SIZE; + const int warp_id = (warp_y * BlockConfig_::WARPS_W + warp_x); + + typedef int32_t copy_t; + copy_t reg_cache[FilterCount::REG_FILTER_ROW] + [FilterCount::REG_FILTER_COL]; + + __device__ ConvFilterGlobal2ShareMemVisitor(uint8_t* smem, + const uint8_t* g_ptr, + int co_stride, int co_remain) + : smem{smem}, + g_ptr{g_ptr}, + co_stride{co_stride}, + co_remain{co_remain} {} + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; + i < FilterCount::REG_FILTER_ROW; ++i) { +#pragma unroll + for (int j = 0; + j < FilterCount::REG_FILTER_COL; + ++j) { + int y = BlockConfig_::WARPS_PER_BLOCK * i + warp_id; + int x = WARP_SIZE * j + tid_in_warp; + bool valid = + (y < + FilterCount::OUT_CHANNELS_PER_BLOCK) && + (x < BlockConfig_::IC_BLKS * ConvConfig_::FW) && + (y < co_remain); + if (valid) { + copy_t val = *(copy_t*)(&g_ptr[y * co_stride + x * 4]); + reg_cache[i][j] = val; + } else { + reg_cache[i][j] = 0; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; + i < FilterCount::REG_FILTER_ROW; ++i) { +#pragma unroll + for (int j = 0; + j < FilterCount::REG_FILTER_COL; + ++j) { + int y = BlockConfig_::WARPS_PER_BLOCK * i + warp_id; + int x = WARP_SIZE * j + tid_in_warp; + bool bounds = + (y < + FilterCount::OUT_CHANNELS_PER_BLOCK) && + (x < BlockConfig_::IC_BLKS * ConvConfig_::FW); + copy_t val = reg_cache[i][j]; + if (bounds) + *(copy_t*)get_smem_ptr(y, x * 8) = val; + } + } + } + + __device__ __forceinline__ uint8_t* get_smem_ptr(int y, int x) { + return &smem[(y * FilterCount::SMEM_FILTER_STRIDE + + x) / + 2]; + } + + __device__ __forceinline__ void inc_stage() { + g_ptr += BlockConfig_::IC_BLKS * ConvConfig_::FW * 4; + } +}; + +template +__device__ inline void +calc(wmma::fragment + data_frag[OH_PER_WARP], + wmma::fragment + filter_frag[OUT_CHANNELS_PER_WARP], + wmma::fragment + acc_frag[OUT_CHANNELS_PER_WARP][OH_PER_WARP]) { +#pragma unroll + for (int i = 0; i < OUT_CHANNELS_PER_WARP; ++i) { +#pragma unroll + for (int j = 0; j < OH_PER_WARP; ++j) { + wmma::mma_sync(acc_frag[i][j], filter_frag[i], data_frag[j], + acc_frag[i][j]); + } + } +} + +template +struct enable_kernel_partial_spec; + +template +struct enable_kernel_partial_spec { + static __device__ inline void load_share_mem( + wmma::fragment + data_frag[BlockConfig_::OH_PER_WARP], + wmma::fragment + filter_frag[BlockConfig_::OUT_CHANNELS_PER_WARP], + ConvDataGlobal2ShareMemVisitor& + gbl2smem_data_visitor, + ConvFilterGlobal2ShareMemVisitor& + gbl2smem_filter_visitor, + int data_spatial_idx, int fw, int ic_blk) { + const int warp_y = threadIdx.y; + uint8_t* __restrict__ s_ptr_data = gbl2smem_data_visitor.get_smem_ptr( + data_spatial_idx, ic_blk * WMMA_K); + uint8_t* __restrict__ s_ptr_filter = + gbl2smem_filter_visitor.get_smem_ptr( + warp_y * WMMA_M, + fw * WMMA_K * BlockConfig_::IC_UNROLL_SIZE + + ic_blk * WMMA_K); + +#pragma unroll + for (int i = 0; i < BlockConfig_::OH_PER_WARP; ++i) { + wmma::load_matrix_sync( + data_frag[i], + s_ptr_data + + i * + DataCount::LANE_SIZE * + DataCount::SMEM_DATA_STRIDE / + 2, + DataCount::SMEM_DATA_STRIDE); + } +#pragma unroll + for (int j = 0; j < 
BlockConfig_::OUT_CHANNELS_PER_WARP; ++j) { + wmma::load_matrix_sync( + filter_frag[j], + s_ptr_filter + + j * WMMA_M * BlockConfig_::WARPS_OC * + FilterCount:: + SMEM_FILTER_STRIDE / + 2, + FilterCount::SMEM_FILTER_STRIDE); + } + } + + template + static __device__ void consume_slice( + ConvDataGlobal2ShareMemVisitor& + gbl2smem_data_visitor, + ConvFilterGlobal2ShareMemVisitor& + gbl2smem_filter_visitor, + wmma::fragment + data_frag[2][BlockConfig_::OH_PER_WARP], + wmma::fragment + filter_frag[2][BlockConfig_::OUT_CHANNELS_PER_WARP], + wmma::fragment + acc_frag[BlockConfig_::OUT_CHANNELS_PER_WARP] + [BlockConfig_::OH_PER_WARP]) { + if (!last_slice) { + gbl2smem_data_visitor.inc_stage(); + gbl2smem_filter_visitor.inc_stage(); + gbl2smem_data_visitor.copy(); + gbl2smem_filter_visitor.copy(); + } + + int data_spatial_idx_base = threadIdx.x / WARP_SIZE * WMMA_N; + int loop_count = 0; +#pragma unroll + for (; loop_count < BlockConfig_::IC_UNROLL_SIZE * ConvConfig_::FW - 1; + loop_count++) { + calc(data_frag[loop_count % 2], + filter_frag[loop_count % 2], + acc_frag); + + int fw = (loop_count + 1) / BlockConfig_::IC_UNROLL_SIZE; + int ic_blk = (loop_count + 1) % BlockConfig_::IC_UNROLL_SIZE; + int data_spatial_idx = data_spatial_idx_base + fw; + + load_share_mem(data_frag[(loop_count + 1) % 2], + filter_frag[(loop_count + 1) % 2], + gbl2smem_data_visitor, gbl2smem_filter_visitor, + data_spatial_idx, fw, ic_blk); + } + + calc( + data_frag[(loop_count % 2)], filter_frag[(loop_count % 2)], + acc_frag); + if (!last_slice) { + __syncthreads(); + gbl2smem_data_visitor.commit(); + gbl2smem_filter_visitor.commit(); + __syncthreads(); + load_share_mem(data_frag[0], filter_frag[0], gbl2smem_data_visitor, + gbl2smem_filter_visitor, data_spatial_idx_base, 0, + 0); + } + } +}; + +template +__global__ void convolution_template_device_u4( + const uint8_t* __restrict__ data, const uint8_t* __restrict__ filter, + int32_t* __restrict__ out, int N, int IH, int IW, int OH, int OW, + int PH, int PW, int IC, int OC, int32_t zero) { + typedef enable_kernel_partial_spec caller; + constexpr size_t IC_BLKS = BlockConfig_::IC_BLKS; + constexpr size_t OUT_CHANNELS_PER_BLOCK = + FilterCount::OUT_CHANNELS_PER_BLOCK; + + const int blocks_per_row = (OW + WMMA_N * BlockConfig_::WARPS_W - 1) / + (WMMA_N * BlockConfig_::WARPS_W); + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + const int b_oh = bidx / blocks_per_row * BlockConfig_::OH_PER_WARP; + const int b_ow = bidx % blocks_per_row * (WMMA_N * BlockConfig_::WARPS_W); + const int warp_x = threadIdx.x / WARP_SIZE; + const int warp_y = threadIdx.y; + + const int oc_start = bidy * OUT_CHANNELS_PER_BLOCK + warp_y * WMMA_M; + const int ow_start = b_ow + warp_x * WMMA_N; + const int b_ih = b_oh * ConvConfig_::SH - PH; + const int b_iw = b_ow * ConvConfig_::SW - PW; + + const uint8_t* __restrict__ g_ptr_data = + data + bidz * IC * IH * IW / 2 + (b_ih * IW + b_iw) * 4; + const uint8_t* __restrict__ g_ptr_filter = + filter + bidy * OUT_CHANNELS_PER_BLOCK * ConvConfig_::FH * + ConvConfig_::FW * IC / 2; + const int co_remain = OC - bidy * OUT_CHANNELS_PER_BLOCK; + int32_t* __restrict__ g_ptr_out = out + bidz * OC * OH * OW + + oc_start * OH * OW + + (b_oh * OW + ow_start) * WMMA_M; + + __shared__ uint8_t + smem_data[DataCount::SMEM_DATA_ROW] + [DataCount::SMEM_DATA_COL]; + __shared__ uint8_t smem_filter + [FilterCount::SMEM_FILTER_ROW] + [FilterCount::SMEM_FILTER_COL]; + + ConvDataGlobal2ShareMemVisitor + gbl2smem_data_visitor{smem_data[0], 
g_ptr_data, IH, IW, + b_ih, b_iw, zero}; + ConvFilterGlobal2ShareMemVisitor + gbl2smem_filter_visitor{smem_filter[0], g_ptr_filter, + IC / 2 * ConvConfig_::FH * ConvConfig_::FW, + co_remain}; + + wmma::fragment + acc_frag[BlockConfig_::OUT_CHANNELS_PER_WARP] + [BlockConfig_::OH_PER_WARP]; + wmma::fragment + data_frag[2][BlockConfig_::OH_PER_WARP]; + wmma::fragment + filter_frag[2][BlockConfig_::OUT_CHANNELS_PER_WARP]; + +#pragma unroll + for (int i = 0; i < BlockConfig_::OUT_CHANNELS_PER_WARP; ++i) { +#pragma unroll + for (int j = 0; j < BlockConfig_::OH_PER_WARP; ++j) { + wmma::fill_fragment(acc_frag[i][j], 0); + } + } + + gbl2smem_data_visitor.copy(); + gbl2smem_filter_visitor.copy(); + gbl2smem_data_visitor.commit(); + gbl2smem_filter_visitor.commit(); + + __syncthreads(); + + caller::load_share_mem(data_frag[0], filter_frag[0], gbl2smem_data_visitor, + gbl2smem_filter_visitor, warp_x * WMMA_N, 0, 0); + + int ic_blocks = (IC / 8 + IC_BLKS - 1) / IC_BLKS * ConvConfig_::FH - 1; +#pragma unroll + for (int ci_blk = 0; ci_blk < ic_blocks; ci_blk++) { + caller::consume_slice(gbl2smem_data_visitor, + gbl2smem_filter_visitor, data_frag, + filter_frag, acc_frag); + } + caller::consume_slice(gbl2smem_data_visitor, gbl2smem_filter_visitor, + data_frag, filter_frag, acc_frag); + + // store +#pragma unroll + for (int i = 0; i < BlockConfig_::OUT_CHANNELS_PER_WARP; ++i) { +#pragma unroll + for (int j = 0; j < BlockConfig_::OH_PER_WARP; ++j) { + if (b_oh + j < OH && + oc_start + i * BlockConfig_::WARPS_OC * WMMA_M < OC && + ow_start < OW) { + wmma::store_matrix_sync(&g_ptr_out[i * BlockConfig_::WARPS_OC * + WMMA_M * OH * OW + + j * OW * WMMA_M], + acc_frag[i][j], WMMA_M, + wmma::mem_col_major); + } + } + } +} +#else +template +__global__ void convolution_template_device_u4( + const uint8_t* __restrict__ /* data */, + const uint8_t* __restrict__ /* filter */, + int32_t* __restrict__ /* out */, int /* N */, int /* IH */, + int /* IW */, int /* OH */, int /* OW */, int /* PH */, int /* PW */, + int /* IC */, int /* OC */, int32_t /* zero */) {} +#endif + +__global__ void reorder_kernel(const uint32_t* __restrict__ src, + uint32_t* __restrict__ dst, int rows, int cols, + int fh, int fw, int ic_blks) { + const int tidx = blockIdx.x * blockDim.x + threadIdx.x; + const int tidy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t* __restrict__ sptr = src + tidy * cols + tidx; + uint32_t* __restrict__ dptr = dst + tidy * cols; + if (tidy < rows && tidx < cols) { + int spatial_idx = tidx % (fh * fw); + int kh = spatial_idx / fw; + int kw = spatial_idx % fw; + int ci_blk = tidx / (fh * fw); + int ci_inner_blk = ci_blk % ic_blks; + int ci_outer_blk = ci_blk / ic_blks; + int out_x = ci_outer_blk * ic_blks * fh * fw + kh * ic_blks * fw + + kw * ic_blks + ci_inner_blk; + dptr[out_x] = (*sptr); + } +} +} // namespace wmma_conv_integer_subbyte_1xfw + +using namespace wmma_conv_integer_subbyte_1xfw; + +void megdnn::cuda::wmma_conv_integer_subbyte:: + _do_wmma_conv_integer_subbyte_1xfw( + const uint8_t* d_data, const uint8_t* d_filter, int32_t* d_out, + uint8_t* workspace, int batch_size, int hi, int wi, int ho, + int wo, int ph, int pw, int ci, int co, int fh, int fw, int sh, + int sw, uint8_t zp_data, cudaStream_t stream) { + cuda_check(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); + cuda_check(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); + zp_data = (zp_data << 4) | zp_data; + int32_t zero = (zp_data << 24) | (zp_data << 16) | (zp_data << 8) | zp_data; + auto 
_do_dispatch_reorder_kernel = [&](int ic_blks) { + int tx = 32; + int ty = 16; + int bx = (ci * fh * fw / 8 + tx - 1) / tx; + int by = (co + ty - 1) / ty; + reorder_kernel<<>>( + reinterpret_cast(d_filter), + reinterpret_cast(workspace), co, ci * fh * fw / 8, + fh, fw, ic_blks); + }; + + if (fh == 3 && fw == 3 && sh == 1 && sw == 1) { + constexpr size_t warps_w = 2; + constexpr size_t warps_oc = 4; + constexpr size_t out_channels_per_warp = 4; + constexpr size_t oh_per_warp = 8; + constexpr size_t ic_unroll_size = 2; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (wo + WMMA_N * warps_w - 1) / (WMMA_N * warps_w); + int blocks_per_col = (ho + oh_per_warp - 1) / (oh_per_warp); + int blocks_per_out_channel = + (co + WMMA_M * warps_oc * out_channels_per_warp - 1) / + (WMMA_M * warps_oc * out_channels_per_warp); + + blockDim.x = WARP_SIZE * warps_w; + blockDim.y = warps_oc; + blockDim.z = 1; + + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = blocks_per_out_channel; + gridDim.z = batch_size; + + typedef BlockConfig + BlockConfig_; + _do_dispatch_reorder_kernel(BlockConfig_::IC_BLKS); + convolution_template_device_u4< + ConvConfig<3, 3, 1, 1>, + BlockConfig> + <<>>(d_data, workspace, d_out, + batch_size, hi, wi, ho, wo, + ph, pw, ci, co, zero); + + } else if (fh == 5 && fw == 5 && sh == 1 && sw == 1) { + constexpr size_t warps_w = 2; + constexpr size_t warps_oc = 4; + constexpr size_t out_channels_per_warp = 4; + constexpr size_t oh_per_warp = 8; + constexpr size_t ic_unroll_size = 1; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (wo + WMMA_N * warps_w - 1) / (WMMA_N * warps_w); + int blocks_per_col = (ho + oh_per_warp - 1) / (oh_per_warp); + int blocks_per_out_channel = + (co + WMMA_M * warps_oc * out_channels_per_warp - 1) / + (WMMA_M * warps_oc * out_channels_per_warp); + + blockDim.x = WARP_SIZE * warps_w; + blockDim.y = warps_oc; + blockDim.z = 1; + + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = blocks_per_out_channel; + gridDim.z = batch_size; + + typedef BlockConfig + BlockConfig_; + _do_dispatch_reorder_kernel(BlockConfig_::IC_BLKS); + convolution_template_device_u4< + ConvConfig<5, 5, 1, 1>, + BlockConfig> + <<>>(d_data, workspace, d_out, + batch_size, hi, wi, ho, wo, + ph, pw, ci, co, zero); + } else if (fh == 7 && fw == 7 && sh == 1 && sw == 1) { + constexpr size_t warps_w = 2; + constexpr size_t warps_oc = 4; + constexpr size_t out_channels_per_warp = 4; + constexpr size_t oh_per_warp = 8; + constexpr size_t ic_unroll_size = 1; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (wo + WMMA_N * warps_w - 1) / (WMMA_N * warps_w); + int blocks_per_col = (ho + oh_per_warp - 1) / (oh_per_warp); + int blocks_per_out_channel = + (co + WMMA_M * warps_oc * out_channels_per_warp - 1) / + (WMMA_M * warps_oc * out_channels_per_warp); + + blockDim.x = WARP_SIZE * warps_w; + blockDim.y = warps_oc; + blockDim.z = 1; + + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = blocks_per_out_channel; + gridDim.z = batch_size; + + typedef BlockConfig + BlockConfig_; + _do_dispatch_reorder_kernel(BlockConfig_::IC_BLKS); + convolution_template_device_u4< + ConvConfig<7, 7, 1, 1>, + BlockConfig> + <<>>(d_data, workspace, d_out, + batch_size, hi, wi, ho, wo, + ph, pw, ci, co, zero); + } + after_kernel_launch(); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4_fhxfw.cu b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4_fhxfw.cu new file mode 100644 index 00000000..22298260 --- 
/dev/null +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4_fhxfw.cu @@ -0,0 +1,694 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/conv_bias/quint4x4x32_wmma/wmma_conv_integer_u4_fhxfw.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include +#include "src/cuda/utils.cuh" +#include "wmma_conv_integer_u4.cuh" + +#if __CUDA_ARCH__ >= 730 +using namespace nvcuda; +using namespace wmma::experimental::precision; +#endif + +using namespace megdnn; +using namespace cuda; +using namespace wmma_conv_integer_subbyte; + +namespace wmma_conv_integer_subbyte_fhxfw { + +template +struct BlockConfig { + static int const WARPS_W = WARPS_W_; + static int const WARPS_OC = WARPS_OC_; + static int const OUT_CHANNELS_PER_WARP = OUT_CHANNELS_PER_WARP_; + static int const OH_PER_WARP = OH_PER_WARP_; + static int const IC_UNROLL_SIZE = IC_UNROLL_SIZE_; + static int const IC_BLKS = IC_BLK * IC_UNROLL_SIZE; + static int const WARPS_PER_BLOCK = WARPS_W * WARPS_OC; +}; + +template +struct DataCount { + static int const LANE_SIZE = + BlockConfig::WARPS_W * WMMA_M * ConvConfig::SW + ConvConfig::FW - 1; + static int const LANES_PER_SLICE = + BlockConfig::OH_PER_WARP * ConvConfig::SH + ConvConfig::FH - 1; + static int const LANES_PER_BLOCK = + LANES_PER_SLICE * IC_BLK * BlockConfig::IC_UNROLL_SIZE; + static int const LANES_PER_WARP = + (LANES_PER_BLOCK + BlockConfig::WARPS_PER_BLOCK - 1) / + BlockConfig::WARPS_PER_BLOCK; + static int const SMEM_SKEW = (BlockConfig::IC_UNROLL_SIZE % 2 == 0) * SKEW; + static int const SMEM_DATA_COL = + (IC_BLK * BlockConfig::IC_UNROLL_SIZE * 8 + SMEM_SKEW) / 2; + static int const SMEM_DATA_STRIDE = SMEM_DATA_COL * 2; + static int const SMEM_DATA_ROW = LANE_SIZE * LANES_PER_SLICE; +}; + +template +struct FilterCount { + static int const OUT_CHANNELS_PER_BLOCK = + WMMA_M * BlockConfig::WARPS_OC * BlockConfig::OUT_CHANNELS_PER_WARP; + static int const SMEM_FILTER_ROW = OUT_CHANNELS_PER_BLOCK; + static int const SMEM_SKEW = + ((ConvConfig::FH * ConvConfig::FW * BlockConfig::IC_UNROLL_SIZE) % + 2 == + 0) * + SKEW; + static int const SMEM_FILTER_COL = + (BlockConfig::IC_BLKS * ConvConfig::FH * ConvConfig::FW * 8 + + SMEM_SKEW) / + 2; + static int const SMEM_FILTER_STRIDE = SMEM_FILTER_COL * 2; + static int const REG_FILTER_ROW = + (SMEM_FILTER_ROW + BlockConfig::WARPS_PER_BLOCK - 1) / + BlockConfig::WARPS_PER_BLOCK; + static int const REG_FILTER_COL = + (BlockConfig::IC_BLKS * ConvConfig::FH * ConvConfig::FW + + WARP_SIZE - 1) / + WARP_SIZE; +}; + +#if __CUDA_ARCH__ >= 730 +template +struct ConvDataGlobal2ShareMemVisitor { + typedef int32_t copy_t; + uint8_t* smem; + const uint8_t* g_ptr; + + int ci_stride, hi_stride; + int b_ih, b_iw; + int IH, IW; + copy_t zero; + const int warp_x = threadIdx.x / WARP_SIZE; + const int warp_y = threadIdx.y; + const int tid_in_warp = threadIdx.x % WARP_SIZE; + const int warp_id = (warp_y * BlockConfig_::WARPS_W + warp_x); + + copy_t reg_cache[DataCount::LANES_PER_WARP]; + + __device__ ConvDataGlobal2ShareMemVisitor(uint8_t* smem, + const uint8_t* g_ptr, int IH, + int IW, int b_ih, int b_iw, + copy_t zero) + : smem{smem}, + g_ptr{g_ptr}, + b_ih{b_ih}, + b_iw{b_iw}, + IH{IH}, + IW{IW}, + zero{zero} { + ci_stride = 8 * IH * IW; + hi_stride = 8 * IW; + } + + // not perfectly + __device__ __forceinline__ void copy() { + int col = (tid_in_warp << 3); + // read input from global memory without boundary check +#pragma unroll + for (int i = 0; + i < DataCount::LANES_PER_WARP; ++i) { + int row = i * BlockConfig_::WARPS_PER_BLOCK + warp_id; + int ci_idx = + row / DataCount::LANES_PER_SLICE; + int hi_idx = + row - ci_idx * DataCount::LANES_PER_SLICE; + bool bounds = ((b_iw + tid_in_warp) >= 0) && + ((b_iw + tid_in_warp) < IW) && + ((b_ih + hi_idx) >= 0) && ((b_ih + hi_idx) < IH); + if (bounds) 
{ + copy_t val = *(copy_t*)(&g_ptr[(ci_idx * ci_stride + + hi_idx * hi_stride + col) / + 2]); + reg_cache[i] = val; + } else { + reg_cache[i] = zero; + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; + i < DataCount::LANES_PER_WARP; ++i) { + if (tid_in_warp < DataCount::LANE_SIZE) { + int row = i * BlockConfig_::WARPS_PER_BLOCK + warp_id; + int ci_idx = + row / + DataCount::LANES_PER_SLICE; + int hi_idx = + row - ci_idx * DataCount::LANES_PER_SLICE; + int y = hi_idx * DataCount::LANE_SIZE + + tid_in_warp; + int x = ci_idx * 8; + *(copy_t*)(get_smem_ptr(y, x)) = reg_cache[i]; + } + } + } + + __device__ __forceinline__ uint8_t* get_smem_ptr(int y, int x) { + return &smem[(y * DataCount::SMEM_DATA_STRIDE + + x) / + 2]; + } + + __device__ __forceinline__ void inc_stage() { + g_ptr += BlockConfig_::IC_BLKS * ci_stride / 2; + } +}; + +template +struct ConvFilterGlobal2ShareMemVisitor { + uint8_t* smem; + const uint8_t* g_ptr; + + int co_stride, co_remain; + int idx; + const int warp_x = threadIdx.x / WARP_SIZE; + const int warp_y = threadIdx.y; + const int tid_in_warp = threadIdx.x % WARP_SIZE; + const int warp_id = (warp_y * BlockConfig_::WARPS_W + warp_x); + + typedef int32_t copy_t; + copy_t reg_cache[FilterCount::REG_FILTER_ROW] + [FilterCount::REG_FILTER_COL]; + + __device__ ConvFilterGlobal2ShareMemVisitor(uint8_t* smem, + const uint8_t* g_ptr, + int co_stride, int co_remain, + int idx) + : smem{smem}, + g_ptr{g_ptr}, + co_stride{co_stride}, + co_remain{co_remain}, + idx{idx} {} + + __device__ __forceinline__ void copy() { + int ci_remain = + idx < BlockConfig_::IC_BLKS ? idx : BlockConfig_::IC_BLKS; +#pragma unroll + for (int i = 0; + i < FilterCount::REG_FILTER_ROW; ++i) { +#pragma unroll + for (int j = 0; + j < FilterCount::REG_FILTER_COL; + ++j) { + int y = BlockConfig_::WARPS_PER_BLOCK * i + warp_id; + int x = WARP_SIZE * j + tid_in_warp; + bool valid = + (x < ci_remain * ConvConfig_::FH * ConvConfig_::FW) && + (y < + FilterCount::OUT_CHANNELS_PER_BLOCK) && + (y < co_remain); + if (valid) { + copy_t val = *(copy_t*)(&g_ptr[y * co_stride + x * 4]); + reg_cache[i][j] = val; + } else { + reg_cache[i][j] = 0; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; + i < FilterCount::REG_FILTER_ROW; ++i) { +#pragma unroll + for (int j = 0; + j < FilterCount::REG_FILTER_COL; + ++j) { + int y = BlockConfig_::WARPS_PER_BLOCK * i + warp_id; + int x = WARP_SIZE * j + tid_in_warp; + int spatial_idx = x % (ConvConfig_::FH * ConvConfig_::FW); + int ci_blk = x / (ConvConfig_::FH * ConvConfig_::FW); + int ci_inner_blk = (ci_blk & 0x3); + int ci_outer_blk = (ci_blk >> 2); + int s_x = ci_outer_blk * IC_BLK * ConvConfig_::FH * + ConvConfig_::FW + + spatial_idx * IC_BLK + ci_inner_blk; + bool bounds = + (y < + FilterCount::OUT_CHANNELS_PER_BLOCK) && + (x < BlockConfig_::IC_BLKS * ConvConfig_::FH * + ConvConfig_::FW); + if (bounds) + *(copy_t*)get_smem_ptr(y, s_x * 8) = reg_cache[i][j]; + } + } + } + + __device__ __forceinline__ uint8_t* get_smem_ptr(int y, int x) { + return &smem[(y * FilterCount::SMEM_FILTER_STRIDE + + x) / + 2]; + } + + __device__ __forceinline__ void inc_stage() { + idx -= BlockConfig_::IC_BLKS; + g_ptr += BlockConfig_::IC_BLKS * ConvConfig_::FH * ConvConfig_::FW * 4; + } +}; + +template +__device__ inline void load_share_mem( + wmma::fragment + data_frag[BlockConfig_::OH_PER_WARP], + wmma::fragment + filter_frag[BlockConfig_::OUT_CHANNELS_PER_WARP], + ConvDataGlobal2ShareMemVisitor& + 
gbl2smem_data_visitor, + ConvFilterGlobal2ShareMemVisitor& + gbl2smem_filter_visitor, + int data_spatial_idx, int filter_spatial_idx, int ic_blk) { + const int warp_y = threadIdx.y; + uint8_t* __restrict__ s_ptr_data = gbl2smem_data_visitor.get_smem_ptr( + data_spatial_idx, ic_blk * WMMA_K); + uint8_t* __restrict__ s_ptr_filter = gbl2smem_filter_visitor.get_smem_ptr( + warp_y * WMMA_M, + ic_blk * WMMA_K * ConvConfig_::FH * ConvConfig_::FW + + filter_spatial_idx * WMMA_K); + +#pragma unroll + for (int i = 0; i < BlockConfig_::OH_PER_WARP; ++i) { + wmma::load_matrix_sync( + data_frag[i], + s_ptr_data + + i * DataCount::LANE_SIZE * + DataCount::SMEM_DATA_STRIDE / + 2, + DataCount::SMEM_DATA_STRIDE); + } +#pragma unroll + for (int j = 0; j < BlockConfig_::OUT_CHANNELS_PER_WARP; ++j) { + wmma::load_matrix_sync( + filter_frag[j], + s_ptr_filter + + j * WMMA_M * BlockConfig_::WARPS_OC * + FilterCount::SMEM_FILTER_STRIDE / + 2, + FilterCount::SMEM_FILTER_STRIDE); + } +} + +template +__device__ inline void +calc(wmma::fragment + data_frag[OH_PER_WARP], + wmma::fragment + filter_frag[OUT_CHANNELS_PER_WARP], + wmma::fragment + acc_frag[OUT_CHANNELS_PER_WARP][OH_PER_WARP]) { +#pragma unroll + for (int i = 0; i < OUT_CHANNELS_PER_WARP; ++i) { +#pragma unroll + for (int j = 0; j < OH_PER_WARP; ++j) { + wmma::mma_sync(acc_frag[i][j], filter_frag[i], data_frag[j], + acc_frag[i][j]); + } + } +} + +template +__device__ void consume_slice( + ConvDataGlobal2ShareMemVisitor& + gbl2smem_data_visitor, + ConvFilterGlobal2ShareMemVisitor& + gbl2smem_filter_visitor, + wmma::fragment + data_frag[2][BlockConfig_::OH_PER_WARP], + wmma::fragment + filter_frag[2][BlockConfig_::OUT_CHANNELS_PER_WARP], + wmma::fragment + acc_frag[BlockConfig_::OUT_CHANNELS_PER_WARP] + [BlockConfig_::OH_PER_WARP]) { + if (!last_slice) { + gbl2smem_data_visitor.inc_stage(); + gbl2smem_filter_visitor.inc_stage(); + gbl2smem_data_visitor.copy(); + gbl2smem_filter_visitor.copy(); + } + + int data_spatial_idx_base = threadIdx.x / WARP_SIZE * WMMA_N; + int loop_count = 0; +#pragma unroll + for (; loop_count < + BlockConfig_::IC_UNROLL_SIZE * ConvConfig_::FH * ConvConfig_::FW - 1; + loop_count++) { + calc( + data_frag[loop_count % 2], filter_frag[loop_count % 2], + acc_frag); + + int filter_spatial_idx = + (loop_count + 1) % (ConvConfig_::FH * ConvConfig_::FW); + int ic_blk = (loop_count + 1) / (ConvConfig_::FH * ConvConfig_::FW); + int fh = filter_spatial_idx / ConvConfig_::FW; + int fw = filter_spatial_idx % ConvConfig_::FW; + int data_spatial_idx = + data_spatial_idx_base + + fh * DataCount::LANE_SIZE + fw; + load_share_mem( + data_frag[(loop_count + 1) % 2], + filter_frag[(loop_count + 1) % 2], gbl2smem_data_visitor, + gbl2smem_filter_visitor, data_spatial_idx, filter_spatial_idx, + ic_blk); + } + + calc( + data_frag[(loop_count % 2)], filter_frag[(loop_count % 2)], + acc_frag); + if (!last_slice) { + __syncthreads(); + gbl2smem_data_visitor.commit(); + gbl2smem_filter_visitor.commit(); + __syncthreads(); + load_share_mem( + data_frag[0], filter_frag[0], gbl2smem_data_visitor, + gbl2smem_filter_visitor, data_spatial_idx_base, 0, 0); + } +} + +#if 0 +template +__device__ void consume_slice_no_reg_cache( + ConvDataGlobal2ShareMemVisitor& + gbl2smem_data_visitor, + ConvFilterGlobal2ShareMemVisitor& + gbl2smem_filter_visitor, + wmma::fragment + data_frag[BlockConfig_::OH_PER_WARP], + wmma::fragment + filter_frag[BlockConfig_::OUT_CHANNELS_PER_WARP], + wmma::fragment + acc_frag[BlockConfig_::OUT_CHANNELS_PER_WARP] + [BlockConfig_::OH_PER_WARP]) { 
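+    // Reference variant, disabled by the surrounding #if 0: it keeps a single
+    // set of data/filter fragments and issues load_share_mem() right before
+    // each calc(), i.e. the per-step schedule is
+    //     load(frag, step k); mma(frag);
+    // The enabled consume_slice() above instead double-buffers the fragments
+    // (data_frag[2] / filter_frag[2]) and prefetches step k + 1 while step k
+    // is being multiplied:
+    //     mma(frag[k % 2]); load(frag[(k + 1) % 2], step k + 1);
+    // Both variants walk the same FH * FW * IC_UNROLL_SIZE sequence of
+    // shared-memory tiles per input-channel slice.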
+ if (!last_slice) { + gbl2smem_data_visitor.inc_stage(); + gbl2smem_filter_visitor.inc_stage(); + gbl2smem_data_visitor.copy(); + gbl2smem_filter_visitor.copy(); + } + + int data_spatial_idx_base = threadIdx.x / WARP_SIZE * WMMA_N; + int loop_count = 0; +#pragma unroll + for (; loop_count < + BlockConfig_::IC_UNROLL_SIZE * ConvConfig_::FH * ConvConfig_::FW; + loop_count++) { + int filter_spatial_idx = + (loop_count + 0) % (ConvConfig_::FH * ConvConfig_::FW); + int ic_blk = (loop_count + 0) / (ConvConfig_::FH * ConvConfig_::FW); + int fh = filter_spatial_idx / ConvConfig_::FW; + int fw = filter_spatial_idx % ConvConfig_::FW; + int data_spatial_idx = + data_spatial_idx_base + + fh * DataCount::LANE_SIZE + fw; + + load_share_mem( + data_frag, filter_frag, gbl2smem_data_visitor, + gbl2smem_filter_visitor, data_spatial_idx, filter_spatial_idx, + ic_blk); + calc( + data_frag, filter_frag, acc_frag); + } + + if (!last_slice) { + __syncthreads(); + gbl2smem_data_visitor.commit(); + gbl2smem_filter_visitor.commit(); + __syncthreads(); + } +} +#endif + +template +__global__ void convolution_template_device_u4( + const uint8_t* __restrict__ data, const uint8_t* __restrict__ filter, + int32_t* __restrict__ out, int N, int IH, int IW, int OH, int OW, + int PH, int PW, int IC, int OC, int32_t zero) { + constexpr size_t IC_BLKS = BlockConfig_::IC_BLKS; + constexpr size_t OUT_CHANNELS_PER_BLOCK = + FilterCount::OUT_CHANNELS_PER_BLOCK; + + const int blocks_per_row = (OW + WMMA_N * BlockConfig_::WARPS_W - 1) / + (WMMA_N * BlockConfig_::WARPS_W); + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + const int b_oh = bidx / blocks_per_row * BlockConfig_::OH_PER_WARP; + const int b_ow = bidx % blocks_per_row * (WMMA_N * BlockConfig_::WARPS_W); + const int warp_x = threadIdx.x / WARP_SIZE; + const int warp_y = threadIdx.y; + + const int oc_start = bidy * OUT_CHANNELS_PER_BLOCK + warp_y * WMMA_M; + const int ow_start = b_ow + warp_x * WMMA_N; + const int b_ih = b_oh * ConvConfig_::SH - PH; + const int b_iw = b_ow * ConvConfig_::SW - PW; + + const uint8_t* __restrict__ g_ptr_data = + data + bidz * IC * IH * IW / 2 + (b_ih * IW + b_iw) * 8 / 2; + const uint8_t* __restrict__ g_ptr_filter = + filter + bidy * OUT_CHANNELS_PER_BLOCK * ConvConfig_::FH * + ConvConfig_::FW * IC / 2; + const int co_remain = OC - bidy * OUT_CHANNELS_PER_BLOCK; + int32_t* __restrict__ g_ptr_out = out + bidz * OC * OH * OW + + oc_start * OH * OW + + (b_oh * OW + ow_start) * WMMA_M; + const int icb = IC / 8; + + __shared__ uint8_t + smem_data[DataCount::SMEM_DATA_ROW] + [DataCount::SMEM_DATA_COL]; + __shared__ uint8_t smem_filter + [FilterCount::SMEM_FILTER_ROW] + [FilterCount::SMEM_FILTER_COL]; + + wmma::fragment + acc_frag[BlockConfig_::OUT_CHANNELS_PER_WARP] + [BlockConfig_::OH_PER_WARP]; + wmma::fragment + data_frag[2][BlockConfig_::OH_PER_WARP]; + wmma::fragment + filter_frag[2][BlockConfig_::OUT_CHANNELS_PER_WARP]; + + ConvDataGlobal2ShareMemVisitor + gbl2smem_data_visitor{smem_data[0], g_ptr_data, IH, IW, + b_ih, b_iw, zero}; + ConvFilterGlobal2ShareMemVisitor + gbl2smem_filter_visitor{smem_filter[0], g_ptr_filter, + IC / 2 * ConvConfig_::FH * ConvConfig_::FW, + co_remain, icb}; + +#pragma unroll + for (int i = 0; i < BlockConfig_::OUT_CHANNELS_PER_WARP; ++i) { +#pragma unroll + for (int j = 0; j < BlockConfig_::OH_PER_WARP; ++j) { + wmma::fill_fragment(acc_frag[i][j], 0); + } + } + + gbl2smem_data_visitor.copy(); + gbl2smem_filter_visitor.copy(); + gbl2smem_data_visitor.commit(); + 
gbl2smem_filter_visitor.commit(); + __syncthreads(); + + load_share_mem( + data_frag[0], filter_frag[0], gbl2smem_data_visitor, + gbl2smem_filter_visitor, warp_x * WMMA_N, 0, 0); + + int ic_blocks = (icb + IC_BLKS - 1) / IC_BLKS - 1; +#pragma unroll + for (int ci_blk = 0; ci_blk < ic_blocks; ci_blk++) { + consume_slice( + gbl2smem_data_visitor, gbl2smem_filter_visitor, data_frag, + filter_frag, acc_frag); + } + consume_slice( + gbl2smem_data_visitor, gbl2smem_filter_visitor, data_frag, + filter_frag, acc_frag); + + // store +#pragma unroll + for (int i = 0; i < BlockConfig_::OUT_CHANNELS_PER_WARP; ++i) { +#pragma unroll + for (int j = 0; j < BlockConfig_::OH_PER_WARP; ++j) { + if (b_oh + j < OH && + oc_start + i * BlockConfig_::WARPS_OC * WMMA_M < OC && + ow_start < OW) { + wmma::store_matrix_sync(&g_ptr_out[i * BlockConfig_::WARPS_OC * + WMMA_M * OH * OW + + j * OW * WMMA_M], + acc_frag[i][j], WMMA_M, + wmma::mem_col_major); + } + } + } +} +#else +template +__global__ void convolution_template_device_u4( + const uint8_t* __restrict__ /* data */, + const uint8_t* __restrict__ /* filter */, + int32_t* __restrict__ /* out */, int /* N */, int /* IH */, + int /* IW */, int /* OH */, int /* OW */, int /* PH */, int /* PW */, + int /* IC */, int /* OC */, int32_t /* zero */) {} +#endif +} // namespace wmma_conv_integer_subbyte_fhxfw + +using namespace wmma_conv_integer_subbyte_fhxfw; + +void megdnn::cuda::wmma_conv_integer_subbyte:: + _do_wmma_conv_integer_subbyte_fhxfw( + const uint8_t* d_data, const uint8_t* d_filter, int32_t* d_out, + int batch_size, int hi, int wi, int ho, int wo, int ph, int pw, + int ci, int co, int fh, int fw, int sh, int sw, uint8_t zp_data, + cudaStream_t stream) { + cuda_check(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); + cuda_check(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); + zp_data = (zp_data << 4) | zp_data; + int32_t zero = (zp_data << 24) | (zp_data << 16) | (zp_data << 8) | zp_data; + if (fh == 3 && fw == 3 && sh == 1 && sw == 1) { + constexpr size_t warps_w = 2; + constexpr size_t warps_oc = 4; + constexpr size_t out_channels_per_warp = 2; + constexpr size_t oh_per_warp = 8; + constexpr size_t ic_unroll_size = 2; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (wo + WMMA_N * warps_w - 1) / (WMMA_N * warps_w); + int blocks_per_col = (ho + oh_per_warp - 1) / (oh_per_warp); + int blocks_per_out_channel = + (co + WMMA_M * warps_oc * out_channels_per_warp - 1) / + (WMMA_M * warps_oc * out_channels_per_warp); + + blockDim.x = WARP_SIZE * warps_w; + blockDim.y = warps_oc; + blockDim.z = 1; + + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = blocks_per_out_channel; + gridDim.z = batch_size; + + convolution_template_device_u4< + ConvConfig<3, 3, 1, 1>, + BlockConfig> + <<>>(d_data, d_filter, d_out, + batch_size, hi, wi, ho, wo, + ph, pw, ci, co, zero); + } else if (fh == 5 && fw == 5 && sh == 1 && sw == 1) { + constexpr size_t warps_w = 2; + constexpr size_t warps_oc = 4; + constexpr size_t out_channels_per_warp = 2; + constexpr size_t oh_per_warp = 8; + constexpr size_t ic_unroll_size = 1; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (wo + WMMA_N * warps_w - 1) / (WMMA_N * warps_w); + int blocks_per_col = (ho + oh_per_warp - 1) / (oh_per_warp); + int blocks_per_out_channel = + (co + WMMA_M * warps_oc * out_channels_per_warp - 1) / + (WMMA_M * warps_oc * out_channels_per_warp); + + blockDim.x = WARP_SIZE * warps_w; + blockDim.y = warps_oc; + blockDim.z = 1; + + gridDim.x = blocks_per_row * blocks_per_col; 
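+        // Grid mapping (same as the 3x3 branch): gridDim.x enumerates the
+        // OH x OW spatial tiles, gridDim.y (below) the output-channel blocks,
+        // and gridDim.z the batch dimension.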
+ gridDim.y = blocks_per_out_channel; + gridDim.z = batch_size; + + convolution_template_device_u4< + ConvConfig<5, 5, 1, 1>, + BlockConfig> + <<>>(d_data, d_filter, d_out, + batch_size, hi, wi, ho, wo, + ph, pw, ci, co, zero); + } else if (fh == 7 && fw == 7 && sh == 1 && sw == 1) { + constexpr size_t warps_w = 2; + constexpr size_t warps_oc = 2; + constexpr size_t out_channels_per_warp = 2; + constexpr size_t oh_per_warp = 4; + constexpr size_t ic_unroll_size = 1; + + dim3 gridDim; + dim3 blockDim; + int blocks_per_row = (wo + WMMA_N * warps_w - 1) / (WMMA_N * warps_w); + int blocks_per_col = (ho + oh_per_warp - 1) / (oh_per_warp); + int blocks_per_out_channel = + (co + WMMA_M * warps_oc * out_channels_per_warp - 1) / + (WMMA_M * warps_oc * out_channels_per_warp); + + blockDim.x = WARP_SIZE * warps_w; + blockDim.y = warps_oc; + blockDim.z = 1; + + gridDim.x = blocks_per_row * blocks_per_col; + gridDim.y = blocks_per_out_channel; + gridDim.z = batch_size; + + convolution_template_device_u4< + ConvConfig<7, 7, 1, 1>, + BlockConfig> + <<>>(d_data, d_filter, d_out, + batch_size, hi, wi, ho, wo, + ph, pw, ci, co, zero); + } + after_kernel_launch(); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/algo.cpp b/dnn/src/cuda/convolution/backward_data/algo.cpp new file mode 100644 index 00000000..5ef94ebb --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/algo.cpp @@ -0,0 +1,111 @@ +/** + * \file dnn/src/cuda/convolution/backward_data/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
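+ *
+ * Registers all backward-data convolution algorithms into AlgoPack: the
+ * channel-wise kernels, the cuDNN algorithm wrappers and the MATMUL fallback,
+ * plus AlgoGroupConvGeneral wrappers that implement grouped convolution on
+ * top of them.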
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { + non_cudnn_algos.push_back(&chanwise); + non_cudnn_algos.push_back(&chanwise_small); + non_cudnn_algos.push_back(&matmul); + + all_algos.push_back(&chanwise); // prefer chanwise + all_algos.push_back(&chanwise_small); // prefer small chanwise + + fill_cudnn_algos(); + for (auto &&i: cudnn) { + all_algos.push_back(&i); + } + all_algos.push_back(&matmul); + + all_algos.reserve(all_algos.size() * 2); + + // add gconv algos by AlgoGroupConvGeneral + auto all_algos_data = all_algos.data(); + for (size_t i = 2; i < all_algos.size(); ++ i) { + gconv.push_back({all_algos[i]}); + } + for (size_t i = 2; i < all_algos.size(); ++ i) { + algo2gconv[all_algos[i]] = &gconv[i - 2]; + } + for (auto &&i: gconv) { + all_algos.push_back(&i); + } + megdnn_assert(all_algos_data == all_algos.data()); + + non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul +} + +ConvolutionBackwardDataImpl::AlgoCUDNN* +ConvolutionBackwardDataImpl::AlgoPack::cudnn_from_enum( + cudnnConvolutionBwdDataAlgo_t algo) { + for (auto &&i: cudnn) { + if (i.cudnn_enum() == algo) + return &i; + } + megdnn_throw(megdnn_mangle(ssprintf( + "can not find cudnn bwd_data algorithm %d", + static_cast(algo)))); +} + +ConvolutionBackwardDataImpl::AlgoPack ConvolutionBackwardDataImpl::sm_algo_pack; + +ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( + ConvolutionBackwardDataImpl *o, + const TensorLayout &filter, const TensorLayout &diff, + const TensorLayout &grad): + SizeArgs(o, o->check_layout_fwd(grad, filter, diff), diff, grad) +{ +} + +ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( + ConvolutionBackwardDataImpl *o, + const CanonizedFilterMeta &filter, const TensorLayout &diff, + const TensorLayout &grad): + handle{concrete_handle(o->handle())}, + filter_meta{filter}, + diff_layout{&diff}, + grad_layout{&grad}, + opr{o} +{ +} + +ConvolutionBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs( + ConvolutionBackwardDataImpl *opr, + _megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace): + SizeArgs(opr, filter.layout, diff.layout, grad.layout), + filter_tensor{&filter}, diff_tensor{&diff}, grad_tensor{&grad}, + workspace{workspace} +{ +} + +std::string ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::to_string() const { + auto &&fm = filter_meta; + MEGDNN_MARK_USED_VAR(fm); + return megdnn_mangle(ssprintf( + "filter=%u{%u,%u,%u,%u}, diff=%s, grad=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", + fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], + diff_layout->to_string().c_str(), + grad_layout->to_string().c_str(), + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + fm.dilation[0], fm.dilation[1], + !fm.should_flip, + diff_layout->dtype.name(), grad_layout->dtype.name())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/algo.h b/dnn/src/cuda/convolution/backward_data/algo.h new file mode 100644 index 00000000..0a97f17d --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/algo.h @@ -0,0 +1,226 @@ +/** + * \file dnn/src/cuda/convolution/backward_data/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
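+ *
+ * AlgoBase declared here is the common interface implemented by every
+ * backward-data algorithm: is_available(), get_workspace_in_bytes() and
+ * exec(), with SizeArgs/ExecArgs bundling the operator, layouts and tensors.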
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/cuda/convolution/helper.h" +#include + +namespace megdnn { +namespace cuda { + +/*! + * \brief base class for convolution algos + * + * All the algo impls should try to support non-contiguous batch dim, for group + * conv execution. + */ +class ConvolutionBackwardDataImpl::AlgoBase: public Algorithm { + protected: + ~AlgoBase() = default; + + public: + struct SizeArgs { + HandleImpl *handle; + CanonizedFilterMeta filter_meta; + const TensorLayout *diff_layout, *grad_layout; + ConvolutionBackwardDataImpl *opr; + + std::string to_string() const; + void init_desc(convolution::CUDNNBwdDataDescs &desc) const { + desc.set(filter_meta, *diff_layout, *grad_layout, opr->param()); + } + SizeArgs(ConvolutionBackwardDataImpl *opr, + const TensorLayout &filter, const TensorLayout &diff, + const TensorLayout &grad); + SizeArgs(ConvolutionBackwardDataImpl *opr, + const CanonizedFilterMeta &filter, const TensorLayout &diff, + const TensorLayout &grad); + + convolution::ForwardSizeArgs as_fwd_args() const { + return {handle, grad_layout, filter_meta, diff_layout}; + } + }; + struct ExecArgs: public SizeArgs { + const TensorND *filter_tensor, *diff_tensor, *grad_tensor; + Workspace workspace; + + ExecArgs(ConvolutionBackwardDataImpl *opr, + _megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs &args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs &args) const = 0; + virtual void exec(const ExecArgs &args) const = 0; + + bool is_available_wk(const SizeArgs &args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + + AlgoBase& check_workspace( + const SizeArgs &args, const Workspace &workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "conv bwd data algo %s: " + "required workspace %zu bytes, got %zu", + name(), req, workspace.size); + return *this; + } + + virtual bool is_cudnn() const { + return false; + } +}; + +class ConvolutionBackwardDataImpl::AlgoCUDNN final : public AlgoBase { + bool m_is_reproducible; + const char *m_name; + cudnnConvolutionBwdDataAlgo_t m_cudnn_enum; + + public: + + AlgoCUDNN(bool is_reproducible, const char *name, + cudnnConvolutionBwdDataAlgo_t cudnn_enum): + m_is_reproducible(is_reproducible), + m_name(name), + m_cudnn_enum(cudnn_enum) + {} + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + bool is_reproducible() const override { + return m_is_reproducible; + } + + const char* name() const override { + return m_name; + } + + cudnnConvolutionBwdDataAlgo_t cudnn_enum() const { + return m_cudnn_enum; + } + + bool is_cudnn() const override { + return true; + } +}; + +//! 
im2col and matmul, with dilation +class ConvolutionBackwardDataImpl::AlgoMatmul final: public AlgoBase { + template + static void exec_internal(const ExecArgs &args); + + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "MATMUL"; + } + bool is_reproducible() const override { + return true; + } +}; + +class ConvolutionBackwardDataImpl::AlgoChanwise final: public AlgoBase { + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "CHANNEL_WISE"; + } + bool is_reproducible() const override { + return true; + } +}; + +class ConvolutionBackwardDataImpl::AlgoChanwiseSmall final: public AlgoBase { + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "CHANNEL_WISE_SMALL"; + } + bool is_reproducible() const override { + return true; + } +}; + +//! implement group conv by another algo +class ConvolutionBackwardDataImpl::AlgoGroupConvGeneral final: public AlgoBase { + AlgoBase *m_impl; + std::string m_name; + + public: + AlgoGroupConvGeneral(AlgoBase *impl); + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return m_name.c_str(); + } + + bool is_reproducible() const override { + return m_impl->is_reproducible(); + } + + static void modify_size_args(SizeArgs &args, + TensorLayout &diff_pg, TensorLayout &grad_pg); +}; + +class ConvolutionBackwardDataImpl::AlgoPack { + // defined in cudnn.cpp + void fill_cudnn_algos(); + + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator = (const AlgoPack &) = delete; + + public: + AlgoPack(); + + std::vector cudnn; + AlgoMatmul matmul; + AlgoChanwise chanwise; + AlgoChanwiseSmall chanwise_small; + std::vector gconv; + std::unordered_map algo2gconv; + + std::vector + //! all algorithms + all_algos, + //! non-cudnn algos, used for heuristic if cudnn is not supported + non_cudnn_algos; + + AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdDataAlgo_t algo); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/chanwise.cpp b/dnn/src/cuda/convolution/backward_data/chanwise.cpp new file mode 100644 index 00000000..d2fc6249 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/chanwise.cpp @@ -0,0 +1,73 @@ +/** + * \file dnn/src/cuda/convolution/backward_data/chanwise.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
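+ *
+ * CHANNEL_WISE backward data: applicable to NCHW depthwise convolutions
+ * (icpg == 1) with float/half data, unit dilation and no filter flip;
+ * exec() dispatches to chanwise::run_bwd_data for the tensor dtype.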
+ */ + +#include "./algo.h" +#include "src/cuda/convolution/chanwise/kern.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +bool ConvolutionBackwardDataImpl::AlgoChanwise::is_available( + const SizeArgs& args) const { + auto&& fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCHW && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 && + fm.dilation[1] == 1 && !fm.should_flip; +} + +size_t ConvolutionBackwardDataImpl::AlgoChanwise::get_workspace_in_bytes( + const SizeArgs&) const { + return 0; +} + +void ConvolutionBackwardDataImpl::AlgoChanwise::exec( + const ExecArgs& args) const { + auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); + auto stream = cuda_stream(args.handle); + switch (args.diff_layout->dtype.enumv()) { + case DTypeEnum::Float32: + return chanwise::run_bwd_data(args.grad_tensor->ptr(), + args.diff_tensor->ptr(), + args.filter_tensor->ptr(), + kparam, stream); + + case DTypeEnum::Float16: +#if CUDA_VERSION >= 9000 + if (is_compute_capability_required(5, 3)) { + return chanwise::run_bwd_data( + static_cast<__half*>(args.grad_tensor->raw_ptr), + static_cast<__half*>(args.diff_tensor->raw_ptr), + static_cast<__half*>(args.filter_tensor->raw_ptr), + kparam, stream); + } else { + return chanwise::run_bwd_data( + args.grad_tensor->ptr(), + args.diff_tensor->ptr(), + args.filter_tensor->ptr(), kparam, stream); + } +#else + return chanwise::run_bwd_data(args.grad_tensor->ptr(), + args.diff_tensor->ptr(), + args.filter_tensor->ptr(), + kparam, stream); +#endif + + default: + break; + } + megdnn_assert_internal(0); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp b/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp new file mode 100644 index 00000000..562644be --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp @@ -0,0 +1,76 @@ +/** + * \file dnn/src/cuda/convolution/backward_data/chanwise_small.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
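+ *
+ * CHANNEL_WISE_SMALL backward data: a specialization of the channel-wise
+ * kernel for small inputs (src_h, src_w <= 32, unit stride, chl_mul == 1,
+ * input and output of equal spatial size); see is_available_small() below.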
+ */ + +#include "src/cuda/convolution/backward_data/algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution/chanwise/kern.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +namespace { +inline bool is_available_small(const chanwise::Param& param) { + return param.chl_mul == 1 && param.stride_h == 1 && param.stride_w == 1 && + param.src_h <= 32 && param.src_w <= 32 && + param.src_h == param.out_h && param.src_w == param.out_w && + param.pad_h < param.flt_h && param.pad_w < param.flt_w && + param.flt_h * param.flt_w <= (param.src_h + 1) / 2 * param.src_w; +} +} // anonymous namespace + +bool ConvolutionBackwardDataImpl::AlgoChanwiseSmall::is_available( + const SizeArgs &args) const { +#if CUDA_VERSION < 9000 + if (args.diff_layout->dtype.enumv() == DTypeEnum::Float16) + return false; +#endif + auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); + auto &&fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCHW && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + args.opr->param().compute_mode == Param::ComputeMode::DEFAULT && + fm.spatial_ndim == 2 && fm.icpg == 1 && + fm.dilation[0] == 1 && fm.dilation[1] == 1 && + !fm.should_flip && is_available_small(kparam); +} + +size_t ConvolutionBackwardDataImpl::AlgoChanwiseSmall::get_workspace_in_bytes( + const SizeArgs &) const { + return 0; +} + +void ConvolutionBackwardDataImpl::AlgoChanwiseSmall::exec( + const ExecArgs &args) const { + auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); + auto stream = cuda_stream(args.handle); + switch (args.grad_layout->dtype.enumv()) { + case DTypeEnum::Float32: + return chanwise::run_bwd_data_small(args.grad_tensor->ptr(), + args.diff_tensor->ptr(), + args.filter_tensor->ptr(), kparam, + stream); +#if CUDA_VERSION >= 9000 + case DTypeEnum::Float16: + return chanwise::run_bwd_data_small( + static_cast(args.grad_tensor->raw_ptr), + static_cast(args.diff_tensor->raw_ptr), + static_cast(args.filter_tensor->raw_ptr), kparam, + stream); +#endif + default: + break; + } + megdnn_assert_internal(0); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution/backward_data/cudnn.cpp b/dnn/src/cuda/convolution/backward_data/cudnn.cpp new file mode 100644 index 00000000..c70c1ca3 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/cudnn.cpp @@ -0,0 +1,132 @@ +/** + * \file dnn/src/cuda/convolution/backward_data/cudnn.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" + +#include "src/cuda/utils.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/convolution/helper.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +bool ConvolutionBackwardDataImpl::AlgoCUDNN::is_available( + const SizeArgs &args) const { + CUDNNBwdDataDescs D; + + if (!is_cudnn_supported(args.as_fwd_args())) + return false; + +#if CUDNN_VERSION >= 7500 + // As in cuda10.0 and cudnn7.5, algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with + // TensorCore operations produces incorrect result. So we disable + // this algo. Please remove the following code, when + // nvidia has fixed this issue. 
+ // incorrect case: + // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2, + // dtype=float16 + if (args.filter_meta.dtype == dtype::Float16()) { + const char* algo_1 = "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1"; + auto cmp_len = strlen(algo_1); + if (is_compute_capability_required(7, 0) && + strncmp(name(), algo_1, cmp_len) == 0) { + return false; + } + } +#endif + + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle->cudnn_handle(), + D.filter_desc.desc, + D.diff_desc.desc, + D.conv_desc.desc, + D.grad_desc.desc, + m_cudnn_enum, + &workspace_size); + return status == CUDNN_STATUS_SUCCESS; +} + +size_t ConvolutionBackwardDataImpl::AlgoCUDNN::get_workspace_in_bytes( + const SizeArgs &args) const { + CUDNNBwdDataDescs D; + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle->cudnn_handle(), + D.filter_desc.desc, + D.diff_desc.desc, + D.conv_desc.desc, + D.grad_desc.desc, + m_cudnn_enum, + &workspace_size); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv bwd_data get workspace failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); + return workspace_size; +} + +void ConvolutionBackwardDataImpl::AlgoCUDNN::exec( + const ExecArgs &args) const { + CUDNNBwdDataDescs D; + args.init_desc(D); + float alpha = 1.0f, beta = 0.0f; + auto status = cudnnConvolutionBackwardData(args.handle->cudnn_handle(), + &alpha, + D.filter_desc.desc, args.filter_tensor->raw_ptr, + D.diff_desc.desc, args.diff_tensor->raw_ptr, + D.conv_desc.desc, + m_cudnn_enum, + args.workspace.raw_ptr, + args.workspace.size, + &beta, + D.grad_desc.desc, + args.grad_tensor->raw_ptr); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv bwd_data failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); +} + +void ConvolutionBackwardDataImpl::AlgoPack::fill_cudnn_algos() { +#define V1(v) #v +#define V(v) V1(v) + +#define DEF_ALGO(NAME, REPROD) \ + cudnn.push_back({ \ + REPROD, #NAME \ + "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) \ + "." V(CUDNN_PATCHLEVEL), \ + NAME}) + + DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false); + DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true); + DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, true); + DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true); +#if CUDNN_MAJOR >= 5 + DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, true); +#if CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1 + DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED, true); +#endif +#endif + +#if !(CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1) +#pragma message "not latest cudnn" +#endif + +#undef DEF_ALGO + +#undef V +#undef V1 +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/group_conv.cpp b/dnn/src/cuda/convolution/backward_data/group_conv.cpp new file mode 100644 index 00000000..2e60eb98 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/group_conv.cpp @@ -0,0 +1,82 @@ +/** + * \file dnn/src/cuda/convolution/backward_data/group_conv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
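+ *
+ * Group convolution wrapper for backward data: modify_size_args() shrinks the
+ * layouts to a single group, and exec() runs the wrapped algorithm once per
+ * group while advancing the filter/diff/grad pointers by per-group strides.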
+ */ + +#include "./algo.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::modify_size_args( + ConvolutionBackwardDataImpl::AlgoBase::SizeArgs &args, + TensorLayout &diff_pg, TensorLayout &grad_pg) { + diff_pg = *args.diff_layout; + grad_pg = *args.grad_layout; + auto nr_grp = args.filter_meta.group; + args.filter_meta.group = 1; + diff_pg.shape[1] /= nr_grp; + grad_pg.shape[1] /= nr_grp; + args.diff_layout = &diff_pg; + args.grad_layout = &grad_pg; +} + +ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral( + AlgoBase *impl): + m_impl{impl} +{ + m_name = "group_conv:"; + m_name += impl->name(); +} + +bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available( + const SizeArgs &args) const { + auto sub_args = args; + TensorLayout diff_pg, grad_pg; + modify_size_args(sub_args, diff_pg, grad_pg); + return m_impl->is_available(sub_args); +} + +size_t ConvolutionBackwardDataImpl::AlgoGroupConvGeneral:: +get_workspace_in_bytes(const SizeArgs &args) const { + auto sub_args = args; + TensorLayout diff_pg, grad_pg; + modify_size_args(sub_args, diff_pg, grad_pg); + return m_impl->get_workspace_in_bytes(sub_args); +} + +void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec( + const ExecArgs &args) const { + auto sub_args = args; + TensorND tflt{*args.filter_tensor}, tdiff{*args.diff_tensor}, + tgrad{*args.grad_tensor}; + modify_size_args(sub_args, tdiff.layout, tgrad.layout); + sub_args.filter_tensor = &tflt; + sub_args.diff_tensor = &tdiff; + sub_args.grad_tensor = &tgrad; + auto grp = args.filter_meta.group; + + auto &&fm = args.filter_meta; + auto strd_flt = (fm.icpg * fm.ocpg * + fm.spatial[0] * fm.spatial[1] * tflt.layout.dtype.size()), + strd_diff = ( + tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), + strd_grad = ( + tgrad.layout.stride[1] * fm.icpg * tgrad.layout.dtype.size()); + for (uint32_t g = 0; g < grp; ++ g) { + m_impl->exec(sub_args); + incr_voidp(tflt.raw_ptr, strd_flt); + incr_voidp(tdiff.raw_ptr, strd_diff); + incr_voidp(tgrad.raw_ptr, strd_grad); + } +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution/backward_data/matmul.cpp b/dnn/src/cuda/convolution/backward_data/matmul.cpp new file mode 100644 index 00000000..1a873f1d --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/matmul.cpp @@ -0,0 +1,122 @@ +/** + * \file dnn/src/cuda/convolution/backward_data/matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
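+ *
+ * MATMUL backward data: relayouts diff into an (OC, OH*OW*N) matrix, computes
+ * col = filter^T * diff with a MatrixMul operator (transposeA), then runs
+ * col2im to scatter the columns back into the input gradient.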
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution/helper.h" +#include "src/cuda/convolution/im2col.cuh" +#include "src/cuda/matrix_mul/opr_impl.h" + +using namespace megdnn; +using namespace cuda; + +bool ConvolutionBackwardDataImpl::AlgoMatmul::is_available( + const SizeArgs &args) const { + auto &&fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCHW && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + fm.group == 1 && fm.spatial_ndim == 2; +} + +size_t ConvolutionBackwardDataImpl::AlgoMatmul::get_workspace_in_bytes( + const SizeArgs &args) const { + return matmul_get_workspace_bundle( + args.as_fwd_args()).total_size_in_bytes(); +} + +void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs &args) const { +#define cb(DType) \ + if (args.diff_layout->dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + exec_internal(args); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + + megdnn_assert_internal(0); +} + +template +void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal( + const ExecArgs &args) { + auto &&fm = args.filter_meta; + size_t N = args.grad_layout->shape[0], + IC = fm.icpg, + IH = args.grad_layout->shape[2], + IW = args.grad_layout->shape[3], + OC = fm.ocpg, + OH = args.diff_layout->shape[2], + OW = args.diff_layout->shape[3], + FH = fm.spatial[0], + FW = fm.spatial[1], + PH = fm.padding[0], + PW = fm.padding[1], + SH = fm.stride[0], + SW = fm.stride[1], + DH = fm.dilation[0], + DW = fm.dilation[1]; + auto stream = cuda_stream(args.handle); + auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args()); + wbundle.set(args.workspace.raw_ptr); + T *diff_t = static_cast(wbundle.get(0)); + T *col = static_cast(wbundle.get(1)); + { + // transpose diff + TensorLayout froml({N, OC*OH*OW}, typename DTypeTrait::dtype()), + tol(froml); + froml.stride[0] = args.diff_layout->stride[0]; + tol.stride[0] = 1; + tol.stride[1] = N; + TensorND from(args.diff_tensor->ptr(), froml), + to(diff_t, tol); + args.handle->relayout_opr()->exec(from, to); + } + { + // take gemm grad + TensorLayout Al({OC, IC*FH*FW}, typename DTypeTrait::dtype()), + Bl({IC*FH*FW, OH*OW*N}, typename DTypeTrait::dtype()), + Cl({OC, OH*OW*N}, typename DTypeTrait::dtype()); + TensorND A(args.filter_tensor->ptr(), Al), + B(col, Bl), + C(diff_t, Cl); + if (fm.should_flip) { + convolution::flip_filter(args.as_fwd_args(), + wbundle.get_workspace(2), A.raw_ptr); + } + auto&& matmul_opr = args.handle->create_operator(); + if (args.opr->param().compute_mode == + param::Convolution::ComputeMode::FLOAT32) { + matmul_opr->param().compute_mode = + param::MatrixMul::ComputeMode::FLOAT32; + } + matmul_opr->param().transposeA = true; + megdnn_assert(matmul_opr->get_workspace_in_bytes(A.layout, C.layout, + B.layout) == 0_z, + "Assume matmul opr in algo MATMUL doesn't need extra " + "workspace"); + matmul_opr->exec(A, C, B, Workspace()); + } + { + // col2im + convolution::col2im(col, args.grad_tensor->ptr(), + N, args.grad_layout->stride[0], + IC, IH, IW, + FH, FW, + OH, OW, + PH, PW, + SH, SW, + DH, DW, + stream); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_filter/algo.cpp b/dnn/src/cuda/convolution/backward_filter/algo.cpp new file mode 100644 index 00000000..fdffefa8 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_filter/algo.cpp @@ -0,0 +1,111 @@ +/** + * \file dnn/src/cuda/convolution/backward_filter/algo.cpp + * MegEngine is Licensed under the Apache 
License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() { + non_cudnn_algos.push_back(&chanwise); + non_cudnn_algos.push_back(&matmul); + + all_algos.push_back(&chanwise); // prefer chanwise + + fill_cudnn_algos(); + for (auto &&i: cudnn) { + all_algos.push_back(&i); + } + all_algos.push_back(&matmul); + + all_algos.reserve(all_algos.size() * 2); + + // add gconv algos by AlgoGroupConvGeneral + auto all_algos_data = all_algos.data(); + for (size_t i = 1; i < all_algos.size(); ++ i) { + gconv.push_back({all_algos[i]}); + } + for (size_t i = 1; i < all_algos.size(); ++ i) { + algo2gconv[all_algos[i]] = &gconv[i - 1]; + } + for (auto &&i: gconv) { + all_algos.push_back(&i); + } + megdnn_assert(all_algos_data == all_algos.data()); + + non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul +} + +ConvolutionBackwardFilterImpl::AlgoCUDNN* +ConvolutionBackwardFilterImpl::AlgoPack::cudnn_from_enum( + cudnnConvolutionBwdFilterAlgo_t algo) { + for (auto &&i: cudnn) { + if (i.cudnn_enum() == algo) + return &i; + } + megdnn_throw(megdnn_mangle(ssprintf( + "can not find cudnn bwd_filter algorithm %d", + static_cast(algo)))); +} + +ConvolutionBackwardFilterImpl::AlgoPack +ConvolutionBackwardFilterImpl::sm_algo_pack; + +ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( + ConvolutionBackwardFilterImpl *o, + const TensorLayout &src, const TensorLayout &diff, + const TensorLayout &grad): + SizeArgs(o, src, diff, o->check_layout_fwd(src, grad, diff)) +{ +} + +ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( + ConvolutionBackwardFilterImpl *o, + const TensorLayout &src, const TensorLayout &diff, + const CanonizedFilterMeta &grad): + handle{concrete_handle(o->handle())}, + src_layout{&src}, + diff_layout{&diff}, + grad_filter_meta{grad}, + opr{o} +{ +} + +ConvolutionBackwardFilterImpl::AlgoBase::ExecArgs::ExecArgs( + ConvolutionBackwardFilterImpl *opr, + _megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace): + SizeArgs(opr, src.layout, diff.layout, grad.layout), + src_tensor{&src}, diff_tensor{&diff}, grad_tensor{&grad}, + workspace{workspace} +{ +} + +std::string +ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::to_string() const { + auto &&fm = grad_filter_meta; + MEGDNN_MARK_USED_VAR(fm); + return megdnn_mangle(ssprintf( + "src=%s diff=%s grad_filter=%u{%u,%u,%u,%u}, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", + src_layout->to_string().c_str(), + diff_layout->to_string().c_str(), + fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + fm.dilation[0], fm.dilation[1], + !fm.should_flip, + src_layout->dtype.name(), diff_layout->dtype.name())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_filter/algo.h b/dnn/src/cuda/convolution/backward_filter/algo.h new file mode 100644 index 00000000..c1a25860 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_filter/algo.h @@ -0,0 +1,212 @@ +/** + * \file dnn/src/cuda/convolution/backward_filter/algo.h + * MegEngine is Licensed under the 
Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/cuda/convolution/helper.h" +#include + +namespace megdnn { +namespace cuda { + +/*! + * \brief base class for convolution algos + * + * All the algo impls should try to support non-contiguous batch dim, for group + * conv execution. + */ +class ConvolutionBackwardFilterImpl::AlgoBase: public Algorithm { + protected: + ~AlgoBase() = default; + + public: + struct SizeArgs { + HandleImpl *handle; + const TensorLayout *src_layout, *diff_layout; + CanonizedFilterMeta grad_filter_meta; + ConvolutionBackwardFilterImpl *opr; + + std::string to_string() const; + void init_desc(convolution::CUDNNBwdFilterDescs &desc) const { + desc.set(*src_layout, *diff_layout, grad_filter_meta, + opr->param()); + } + SizeArgs(ConvolutionBackwardFilterImpl *opr, + const TensorLayout &src, const TensorLayout &diff, + const TensorLayout &grad); + SizeArgs(ConvolutionBackwardFilterImpl *opr, + const TensorLayout &src, const TensorLayout &diff, + const CanonizedFilterMeta &grad); + + convolution::ForwardSizeArgs as_fwd_args() const { + return {handle, src_layout, grad_filter_meta, diff_layout}; + } + }; + struct ExecArgs: public SizeArgs { + const TensorND *src_tensor, *diff_tensor, *grad_tensor; + Workspace workspace; + + ExecArgs(ConvolutionBackwardFilterImpl *opr, + _megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs &args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs &args) const = 0; + virtual void exec(const ExecArgs &args) const = 0; + + bool is_available_wk(const SizeArgs &args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + + AlgoBase& check_workspace( + const SizeArgs &args, const Workspace &workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "conv bwd filter algo %s: " + "required workspace %zu bytes, got %zu", + name(), req, workspace.size); + return *this; + } + + virtual bool is_cudnn() const { + return false; + } +}; + +class ConvolutionBackwardFilterImpl::AlgoCUDNN final : public AlgoBase { + bool m_is_reproducible; + const char *m_name; + cudnnConvolutionBwdFilterAlgo_t m_cudnn_enum; + + public: + + AlgoCUDNN(bool is_reproducible, const char *name, + cudnnConvolutionBwdFilterAlgo_t cudnn_enum): + m_is_reproducible(is_reproducible), + m_name(name), + m_cudnn_enum(cudnn_enum) + {} + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + bool is_reproducible() const override { + return m_is_reproducible; + } + + const char* name() const override { + return m_name; + } + + cudnnConvolutionBwdFilterAlgo_t cudnn_enum() const { + return m_cudnn_enum; + } + + bool is_cudnn() const override { + return true; + } +}; + +//! 
im2col and matmul, with dilation +class ConvolutionBackwardFilterImpl::AlgoMatmul final: public AlgoBase { + template + static void exec_internal(const ExecArgs &args); + + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "MATMUL"; + } + bool is_reproducible() const override { + return true; + } +}; + +class ConvolutionBackwardFilterImpl::AlgoChanwise final: public AlgoBase { + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "CHANNEL_WISE"; + } + bool is_reproducible() const override { + return true; + } +}; + +//! implement group conv by another algo +class ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral final: public AlgoBase { + AlgoBase *m_impl; + std::string m_name; + + public: + AlgoGroupConvGeneral(AlgoBase *impl); + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return m_name.c_str(); + } + + bool is_reproducible() const override { + return m_impl->is_reproducible(); + } + + static void modify_size_args(SizeArgs &args, + TensorLayout &src_pg, TensorLayout &diff_pg); +}; + +class ConvolutionBackwardFilterImpl::AlgoPack { + // defined in cudnn.cpp + void fill_cudnn_algos(); + + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator = (const AlgoPack &) = delete; + + public: + AlgoPack(); + + std::vector cudnn; + AlgoMatmul matmul; + AlgoChanwise chanwise; + std::vector gconv; + std::unordered_map algo2gconv; + + std::vector + //! all algorithms + all_algos, + //! non-cudnn algos, used for heuristic if cudnn is not supported + non_cudnn_algos; + + AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdFilterAlgo_t algo); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_filter/chanwise.cpp b/dnn/src/cuda/convolution/backward_filter/chanwise.cpp new file mode 100644 index 00000000..52f590b1 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_filter/chanwise.cpp @@ -0,0 +1,73 @@ +/** + * \file dnn/src/cuda/convolution/backward_filter/chanwise.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
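+ *
+ * CHANNEL_WISE backward filter: same applicability as the backward-data
+ * variant (NCHW, float/half, icpg == 1, unit dilation, no filter flip);
+ * exec() dispatches to chanwise::run_bwd_filter for the tensor dtype.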
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution/chanwise/kern.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +bool ConvolutionBackwardFilterImpl::AlgoChanwise::is_available( + const SizeArgs &args) const { + auto &&fm = args.grad_filter_meta; + return fm.format == Param::Format::NCHW && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + fm.spatial_ndim == 2 && fm.icpg == 1 && + fm.dilation[0] == 1 && fm.dilation[1] == 1 && + !fm.should_flip; +} + +size_t ConvolutionBackwardFilterImpl::AlgoChanwise::get_workspace_in_bytes( + const SizeArgs &) const { + return 0; +} + +void ConvolutionBackwardFilterImpl::AlgoChanwise::exec( + const ExecArgs &args) const { + auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); + auto stream = cuda_stream(args.handle); + switch (args.diff_layout->dtype.enumv()) { + case DTypeEnum::Float32: + return chanwise::run_bwd_filter(args.grad_tensor->ptr(), + args.src_tensor->ptr(), + args.diff_tensor->ptr(), + kparam, stream); + case DTypeEnum::Float16: +#if CUDA_VERSION >= 9000 + if (is_compute_capability_required(5, 3)) { + return chanwise::run_bwd_filter( + static_cast<__half*>(args.grad_tensor->raw_ptr), + static_cast<__half*>(args.src_tensor->raw_ptr), + static_cast<__half*>(args.diff_tensor->raw_ptr), + kparam, stream); + } else { + return chanwise::run_bwd_filter( + args.grad_tensor->ptr(), + args.src_tensor->ptr(), + args.diff_tensor->ptr(), kparam, stream); + } +#else + return chanwise::run_bwd_filter(args.grad_tensor->ptr(), + args.src_tensor->ptr(), + args.diff_tensor->ptr(), + kparam, stream); +#endif + + default: + break; + } + megdnn_assert_internal(0); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution/backward_filter/cudnn.cpp b/dnn/src/cuda/convolution/backward_filter/cudnn.cpp new file mode 100644 index 00000000..17b31934 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_filter/cudnn.cpp @@ -0,0 +1,114 @@ +/** + * \file dnn/src/cuda/convolution/backward_filter/cudnn.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
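+ *
+ * cuDNN backward filter: each AlgoCUDNN instance wraps one
+ * cudnnConvolutionBwdFilterAlgo_t enum registered in fill_cudnn_algos();
+ * workspace queries and execution go through the corresponding
+ * cudnnConvolutionBackwardFilter* entry points.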
+ */ + +#include "./algo.h" + +#include "src/cuda/utils.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/convolution/helper.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available( + const SizeArgs &args) const { + CUDNNBwdFilterDescs D; + + if (!is_cudnn_supported(args.as_fwd_args())) + return false; + + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle->cudnn_handle(), + D.src_desc.desc, + D.diff_desc.desc, + D.conv_desc.desc, + D.grad_desc.desc, + m_cudnn_enum, + &workspace_size); + return status == CUDNN_STATUS_SUCCESS; +} + +size_t ConvolutionBackwardFilterImpl::AlgoCUDNN::get_workspace_in_bytes( + const SizeArgs &args) const { + CUDNNBwdFilterDescs D; + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle->cudnn_handle(), + D.src_desc.desc, + D.diff_desc.desc, + D.conv_desc.desc, + D.grad_desc.desc, + m_cudnn_enum, + &workspace_size); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv bwd_filter get workspace failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); + return workspace_size; +} + +void ConvolutionBackwardFilterImpl::AlgoCUDNN::exec( + const ExecArgs &args) const { + CUDNNBwdFilterDescs D; + args.init_desc(D); + float alpha = 1.0f, beta = 0.0f; + auto status = cudnnConvolutionBackwardFilter(args.handle->cudnn_handle(), + &alpha, + D.src_desc.desc, args.src_tensor->raw_ptr, + D.diff_desc.desc, args.diff_tensor->raw_ptr, + D.conv_desc.desc, + m_cudnn_enum, + args.workspace.raw_ptr, + args.workspace.size, + &beta, + D.grad_desc.desc, + args.grad_tensor->raw_ptr); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv bwd_data failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); +} + +void ConvolutionBackwardFilterImpl::AlgoPack::fill_cudnn_algos() { +#define V1(v) #v +#define V(v) V1(v) + +#define DEF_ALGO(NAME, REPROD) \ + cudnn.push_back({ \ + REPROD, #NAME \ + "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) \ + "." V(CUDNN_PATCHLEVEL), \ + NAME}) + + DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false); + DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, true); + DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false); +#if CUDNN_MAJOR >= 6 || (CUDNN_MAJOR >= 5 && CUDNN_MINOR >= 1) + DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED, true); +#if CUDNN_MAJOR >= 6 + DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, true); +#endif +#endif + +#if !(CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1) +#pragma message "not latest cudnn" +#endif + +#undef DEF_ALGO + +#undef V +#undef V1 +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_filter/group_conv.cpp b/dnn/src/cuda/convolution/backward_filter/group_conv.cpp new file mode 100644 index 00000000..164145fc --- /dev/null +++ b/dnn/src/cuda/convolution/backward_filter/group_conv.cpp @@ -0,0 +1,83 @@ +/** + * \file dnn/src/cuda/convolution/backward_filter/group_conv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
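+ *
+ * Group convolution wrapper for backward filter: layouts are reduced to a
+ * single group and the wrapped algorithm runs once per group, with the
+ * src/diff/grad pointers advanced by per-group strides.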
+ */ + +#include "./algo.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::modify_size_args( + ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs &args, + TensorLayout &src_pg, TensorLayout &diff_pg) { + src_pg = *args.src_layout; + diff_pg = *args.diff_layout; + auto nr_grp = args.grad_filter_meta.group; + args.grad_filter_meta.group = 1; + src_pg.shape[1] /= nr_grp; + diff_pg.shape[1] /= nr_grp; + args.src_layout = &src_pg; + args.diff_layout = &diff_pg; +} + +ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral( + AlgoBase *impl): + m_impl{impl} +{ + m_name = "group_conv:"; + m_name += impl->name(); +} + +bool ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::is_available( + const SizeArgs &args) const { + auto sub_args = args; + TensorLayout src_pg, diff_pg; + modify_size_args(sub_args, src_pg, diff_pg); + return m_impl->is_available(sub_args); +} + +size_t ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral:: +get_workspace_in_bytes(const SizeArgs &args) const { + auto sub_args = args; + TensorLayout src_pg, diff_pg; + modify_size_args(sub_args, src_pg, diff_pg); + return m_impl->get_workspace_in_bytes(sub_args); +} + +void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::exec( + const ExecArgs &args) const { + auto sub_args = args; + TensorND tsrc{*args.src_tensor}, tdiff{*args.diff_tensor}, + tgrad{*args.grad_tensor}; + modify_size_args(sub_args, tsrc.layout, tdiff.layout); + sub_args.src_tensor = &tsrc; + sub_args.diff_tensor = &tdiff; + sub_args.grad_tensor = &tgrad; + + auto &&fm = args.grad_filter_meta; + auto grp = fm.group; + + auto strd_src = ( + tsrc.layout.stride[1] * fm.icpg * tsrc.layout.dtype.size()), + strd_diff = ( + tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), + strd_grad = (fm.icpg * fm.ocpg * + fm.spatial[0] * fm.spatial[1] * tgrad.layout.dtype.size()); + for (uint32_t g = 0; g < grp; ++ g) { + m_impl->exec(sub_args); + incr_voidp(tsrc.raw_ptr, strd_src); + incr_voidp(tdiff.raw_ptr, strd_diff); + incr_voidp(tgrad.raw_ptr, strd_grad); + } +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution/backward_filter/matmul.cpp b/dnn/src/cuda/convolution/backward_filter/matmul.cpp new file mode 100644 index 00000000..7d454534 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_filter/matmul.cpp @@ -0,0 +1,130 @@ +/** + * \file dnn/src/cuda/convolution/backward_filter/matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
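+ *
+ * MATMUL backward filter: relayouts diff into an (OC, OH*OW*N) matrix, runs
+ * im2col on src, computes grad = diff * col^T with a MatrixMul operator
+ * (transposeB), and flips the resulting filter when should_flip is set.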
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution/helper.h" +#include "src/cuda/convolution/im2col.cuh" + +using namespace megdnn; +using namespace cuda; + +bool ConvolutionBackwardFilterImpl::AlgoMatmul::is_available( + const SizeArgs &args) const { + auto &&fm = args.grad_filter_meta; + return fm.format == Param::Format::NCHW && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + fm.group == 1 && fm.spatial_ndim == 2; +} + +size_t ConvolutionBackwardFilterImpl::AlgoMatmul::get_workspace_in_bytes( + const SizeArgs &args) const { + return matmul_get_workspace_bundle( + args.as_fwd_args()).total_size_in_bytes(); +} + +void ConvolutionBackwardFilterImpl::AlgoMatmul::exec( + const ExecArgs &args) const { +#define cb(DType) \ + if (args.diff_layout->dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + exec_internal(args); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + + megdnn_assert_internal(0); +} + +template +void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal( + const ExecArgs &args) { + auto &&fm = args.grad_filter_meta; + size_t N = args.src_layout->shape[0], + IC = fm.icpg, + IH = args.src_layout->shape[2], + IW = args.src_layout->shape[3], + OC = fm.ocpg, + OH = args.diff_layout->shape[2], + OW = args.diff_layout->shape[3], + FH = fm.spatial[0], + FW = fm.spatial[1], + PH = fm.padding[0], + PW = fm.padding[1], + SH = fm.stride[0], + SW = fm.stride[1], + DH = fm.dilation[0], + DW = fm.dilation[1]; + auto stream = cuda_stream(args.handle); + auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args()); + wbundle.set(args.workspace.raw_ptr); + T *diff_t = static_cast(wbundle.get(0)); + T *col = static_cast(wbundle.get(1)); + { + // transpose diff + TensorLayout froml({N, OC*OH*OW}, typename DTypeTrait::dtype()), + tol(froml); + froml.stride[0] = args.diff_layout->stride[0]; + tol.stride[0] = 1; + tol.stride[1] = N; + TensorND from(args.diff_tensor->ptr(), froml), + to(diff_t, tol); + args.handle->relayout_opr()->exec(from, to); + } + { + // im2col + convolution::im2col(args.src_tensor->ptr(), col, + N, args.src_tensor->layout.stride[0], + IC, IH, IW, + FH, FW, + OH, OW, + PH, PW, + SH, SW, + DH, DW, + stream); + } + { + // take gemm grad + TensorLayout Al({OC, IC*FH*FW}, typename DTypeTrait::dtype()), + Bl({IC*FH*FW, OH*OW*N}, typename DTypeTrait::dtype()), + Cl({OC, OH*OW*N}, typename DTypeTrait::dtype()); + TensorND A(args.grad_tensor->ptr(), Al), + B(col, Bl), + C(diff_t, Cl); + if (fm.should_flip) { + A.raw_ptr = wbundle.get(2); + } + auto&& matmul_opr = args.handle->create_operator(); + if (args.opr->param().compute_mode == + param::Convolution::ComputeMode::FLOAT32) { + matmul_opr->param().compute_mode = + param::MatrixMul::ComputeMode::FLOAT32; + } + matmul_opr->param().transposeB = true; + megdnn_assert(matmul_opr->get_workspace_in_bytes(C.layout, B.layout, + A.layout) == 0_z, + "Assume matmul opr in algo MATMUL doesn't need extra " + "workspace"); + matmul_opr->exec(C, B, A, Workspace()); + + if (fm.should_flip) { + convolution::flip_filter( + args.as_fwd_args(), + {static_cast(args.grad_tensor->raw_ptr), + wbundle.get_size(2)}, + A.raw_ptr + ); + } + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/chanwise/bwd_data.cu b/dnn/src/cuda/convolution/chanwise/bwd_data.cu new file mode 100644 index 00000000..b0d345a5 --- /dev/null +++ b/dnn/src/cuda/convolution/chanwise/bwd_data.cu @@ -0,0 +1,526 @@ +/** + * \file 
dnn/src/cuda/convolution/chanwise/bwd_data.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" +#include "./kern_helper.cuh" +#include "cuda_fp16.h" +#include "src/cuda/fp16_help.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; +using namespace chanwise; + +namespace { + +// grid idx is (inp_chl, worker_index) +// each y-slice of a block works on an (N, IH, IW) spatial image at given +// inp_chl +template +__global__ void kern_bwd_data_float(T* src_grad, const T* dst_grad, + const T* flt_tot, Param param) { + // extern __shared__ of dt_float16 does not work + extern __shared__ uint8_t flt_storage[]; + + T* const flt = reinterpret_cast(flt_storage); + + const uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x, + IH = param.src_h, IW = param.src_w, + CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, + FH = FH_SET ? FH_SET : param.flt_h, + FW = FW_SET ? FW_SET : param.flt_w, FSIZE = FH * FW, + PH = param.pad_h, PW = param.pad_w, + SH = SH_SET ? SH_SET : param.stride_h, + SW = SW_SET ? SW_SET : param.stride_w, OH = param.out_h, + OW = param.out_w, TOT_OUT = N * IH * IW; + + block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); + dst_grad += ic * CHL_MUL * OH * OW; + src_grad += ic * IH * IW; + + uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x, + nr_out_per_launch = blockDim.x * gridDim.y; + for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { + uint32_t out_idx = out_idx_, n, ih, iw; + out_idx = div_mod(out_idx, IW, iw); + out_idx = div_mod(out_idx, IH, ih); + n = out_idx; + + const T* dst_grad_base = dst_grad + n * (IC * CHL_MUL * OH * OW); + + T sum(0); + + // o >= max(0, floor_div((i+P-F+1), S)) + uint32_t ohmin = max(int32_t(ih + PH - FH + SH), 0) / SH, + owmin = max(int32_t(iw + PW - FW + SW), 0) / SW, + ohmax = min((ih + PH) / SH, OH - 1), + owmax = min((iw + PW) / SW, OW - 1); + if (SH_SET == 1 && SW_SET == 1 && FH_SET && FW_SET) { +#pragma unroll + for (uint32_t doh = 0; doh < FH; ++doh) { + uint32_t oh = ohmin + doh; + if (oh <= ohmax) { + uint32_t fh = ih - oh * SH + PH; +#pragma unroll + for (uint32_t dow = 0; dow < FW; ++dow) { + uint32_t ow = owmin + dow; + if (ow <= owmax) { + uint32_t fw = iw - ow * SW + PW; + const T* pd = dst_grad_base + oh * OW + ow; + const T* pf = flt + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++chl_mul) { + sum += *pd * *pf; + pd += OH * OW; + pf += FSIZE; + } + } + } + } + } + } else { + for (uint32_t oh = ohmin; oh <= ohmax; ++oh) { + uint32_t fh = ih - oh * SH + PH; + for (uint32_t ow = owmin; ow <= owmax; ++ow) { + uint32_t fw = iw - ow * SW + PW; + const T* pd = dst_grad_base + oh * OW + ow; + const T* pf = flt + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; ++chl_mul) { + sum += *pd * *pf; + pd += OH * OW; + pf += FSIZE; + } + } + } + } + + src_grad[(n * (IC * IH) + ih) * IW + iw] = sum; + } +} + +#if CUDA_VERSION >= 9000 +template +__global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, + const __half* flt_tot, Param param) { + extern __shared__ uint8_t flt_storage[]; + + __half* const flt = reinterpret_cast<__half*>(flt_storage); + + const 
uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x, + IH = param.src_h, IW = param.src_w, + CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, + FH = FH_SET ? FH_SET : param.flt_h, + FW = FW_SET ? FW_SET : param.flt_w, FSIZE = FH * FW, + PH = param.pad_h, PW = param.pad_w, + SH = SH_SET ? SH_SET : param.stride_h, + SW = SW_SET ? SW_SET : param.stride_w, OH = param.out_h, + OW = param.out_w, TOT_OUT = N * IH * IW; + + block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); + dst_grad += ic * CHL_MUL * OH * OW; + src_grad += ic * IH * IW; + + uint32_t out_idx_ = (blockIdx.y * blockDim.x + threadIdx.x) * 2, + nr_out_per_launch = (blockDim.x * gridDim.y) * 2; + for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { + if (out_idx_ % IW < IW - 1) { + uint32_t out_idx = out_idx_, n, ih, iw; + out_idx = div_mod(out_idx, IW, iw); + out_idx = div_mod(out_idx, IH, ih); + n = out_idx; + + const __half* dst_grad_base = + dst_grad + n * (IC * CHL_MUL * OH * OW); + + __half2 sum{0.0, 0.0}; + __half2 pd2{0.0, 0.0}; + __half2 pf2{0.0, 0.0}; + + uint32_t ohmin = max(int32_t(ih + PH - FH + SH), 0) / SH, + owmin_x = max(int32_t(iw + PW - FW + SW), 0) / SW, + owmin_y = max(int32_t(iw + 1 + PW - FW + SW), 0) / SW, + ohmax = min((ih + PH) / SH, OH - 1), + owmax_x = min((iw + PW) / SW, OW - 1), + owmax_y = min((iw + 1 + PW) / SW, OW - 1); + if (SH_SET == 1 && SW_SET == 1 && FH_SET && FW_SET) { +#pragma unroll + for (uint32_t doh = 0; doh < FH; ++doh) { + uint32_t oh = ohmin + doh; + if (oh <= ohmax) { + uint32_t fh = ih - oh + PH; + uint32_t owmin = owmin_x, owmax = owmax_y; + + const __half* pd = dst_grad_base + oh * OW; + const __half* pf = flt + fh * FW; + + if (FW == 3) { +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++chl_mul) { + __half2 flt0 = {0.0, *(pf)}, + flt1 = {*(pf), *(pf + 1)}, + flt2 = {*(pf + 1), *(pf + 2)}, + flt3 = {*(pf + 2), 0.0}; + uint32_t ow = owmin; + uint32_t fw = iw - ow + PW; + __half2 dst2 = {0.0, 0.0}; + if (static_cast(ow) < + static_cast(owmin_y)) { + dst2 = {*(pd + ow), 0.0}; + sum = fma2(dst2, flt3, sum); + ++ow; + --fw; + } + if (static_cast(owmax_x) < + static_cast(owmax)) { + dst2 = {0.0, *(pd + owmax)}; + sum = fma2(dst2, flt0, sum); + } + if (static_cast(fw) == 1) { + dst2 = {*(pd + ow), *(pd + ow)}; + sum = fma2(dst2, flt2, sum); + ++ow; + --fw; + } + if (static_cast(ow) <= + static_cast(owmax_x)) { + dst2 = {*(pd + ow), *(pd + ow)}; + sum = fma2(dst2, flt1, sum); + } + + pd += OH * OW; + pf += FSIZE; + } + } else if (FW == 5) { +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++chl_mul) { + __half2 flt0 = {0.0, *(pf)}, + flt1 = {*(pf), *(pf + 1)}, + flt2 = {*(pf + 1), *(pf + 2)}, + flt3 = {*(pf + 2), *(pf + 3)}, + flt4 = {*(pf + 3), *(pf + 4)}, + flt5 = {*(pf + 4), 0.0}; + uint32_t ow = owmin; + uint32_t fw = iw - ow + PW; + __half2 dst2 = {0.0, 0.0}; + if (static_cast(ow) < + static_cast(owmin_y)) { + dst2 = {*(pd + ow), 0.0}; + sum = fma2(dst2, flt5, sum); + ++ow; + --fw; + } + if (static_cast(owmax_x) < + static_cast(owmax)) { + dst2 = {0.0, *(pd + owmax)}; + sum = fma2(dst2, flt0, sum); + } + if (static_cast(fw) == 3) { + dst2 = {*(pd + ow), *(pd + ow)}; + sum = fma2(dst2, flt4, sum); + ++ow; + --fw; + } + if (static_cast(fw) == 2 && + static_cast(ow) <= + static_cast(owmax_x)) { + dst2 = {*(pd + ow), *(pd + ow)}; + sum = fma2(dst2, flt3, sum); + ++ow; + --fw; + } + if (static_cast(fw) == 1 && + static_cast(ow) <= + static_cast(owmax_x)) { + dst2 = {*(pd + ow), *(pd + ow)}; + sum = fma2(dst2, flt2, 
sum); + ++ow; + --fw; + } + if (static_cast(fw) == 0 && + static_cast(ow) <= + static_cast(owmax_x)) { + dst2 = {*(pd + ow), *(pd + ow)}; + sum = fma2(dst2, flt1, sum); + } + + pd += OH * OW; + pf += FSIZE; + } + } else { +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++chl_mul) { +#pragma unroll + for (uint32_t dow = 0; dow <= FW; ++dow) { + uint32_t ow = owmin + dow; + uint32_t fw = iw - ow + PW; + if (static_cast(ow) <= + static_cast(owmax)) { + pd2 = {*(pd + ow), *(pd + ow)}; + pf2 = {0.0, 0.0}; + if (static_cast(ow) >= + static_cast(owmin_y)) + pf2.y = *(pf + fw + 1); + if (static_cast(ow) <= + static_cast(owmax_x)) + pf2.x = *(pf + fw); + sum = fma2(pd2, pf2, sum); + } + } + pd += OH * OW; + pf += FSIZE; + } + } + } + } + } else { +#pragma unroll + for (uint32_t oh = ohmin; oh <= ohmax; ++oh) { + uint32_t fh = ih - oh * SH + PH; + + if (owmin_x < owmin_y) { + uint32_t fw = iw - owmin_x * SW + PW; + const __half* pd = dst_grad_base + oh * OW + owmin_x; + const __half* pf = flt + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++chl_mul) { + pd2.x = *pd; + pd2.y = 0.0; + pf2.x = *pf; + pf2.y = 0.0; + sum = fma2(pd2, pf2, sum); + pd += OH * OW; + pf += FSIZE; + } + } + + if (owmax_x < owmax_y) { + uint32_t fw = iw + 1 - owmax_y * SW + PW; + const __half* pd = dst_grad_base + oh * OW + owmax_y; + const __half* pf = flt + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++chl_mul) { + pd2.x = 0.0; + pd2.y = *pd; + pf2.x = 0.0; + pf2.y = *pf; + sum = fma2(pd2, pf2, sum); + pd += OH * OW; + pf += FSIZE; + } + } + + uint32_t ow = owmin_y; + uint32_t owmax = owmax_x; +#pragma unroll + for (; ow <= owmax; ++ow) { + uint32_t fw = iw - ow * SW + PW; + const __half* pd = dst_grad_base + oh * OW + ow; + const __half* pf = flt + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++chl_mul) { + pd2.x = *pd; + pd2.y = *pd; + pf2.x = *pf; + pf2.y = *(pf + 1); + sum = fma2(pd2, pf2, sum); + pd += OW * OH; + pf += FSIZE; + } + } + } + } + + src_grad[(n * (IC * IH) + ih) * IW + iw] = sum.x; + src_grad[(n * (IC * IH) + ih) * IW + iw + 1] = sum.y; + } else { + size_t offset = 0; + + for (offset = 0; offset < 2; ++offset) { + uint32_t out_idx = out_idx_ + offset, n, ih, iw; + out_idx = div_mod(out_idx, IW, iw); + out_idx = div_mod(out_idx, IH, ih); + n = out_idx; + + const __half* dst_grad_base = + dst_grad + n * (IC * CHL_MUL * OH * OW); + + __half sum(0); + + uint32_t ohmin = max(int32_t(ih + PH - FH + SH), 0) / SH, + owmin = max(int32_t(iw + PW - FW + SW), 0) / SW, + ohmax = min((ih + PH) / SH, OH - 1), + owmax = min((iw + PW) / SW, OW - 1); + if (SH_SET == 1 && SW_SET == 1 && FH_SET && FW_SET) { +#pragma unroll + for (uint32_t doh = 0; doh < FH; ++doh) { + uint32_t oh = ohmin + doh; + if (oh <= ohmax) { + uint32_t fh = ih - oh * SH + PH; +#pragma unroll + for (uint32_t dow = 0; dow < FW; ++dow) { + uint32_t ow = owmin + dow; + if (ow <= owmax) { + uint32_t fw = iw - ow * SW + PW; + const __half* pd = + dst_grad_base + oh * OW + ow; + const __half* pf = flt + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; + chl_mul < CHL_MUL; ++chl_mul) { + sum = fma(*pd, *pf, sum); + pd += OH * OW; + pf += FSIZE; + } + } + } + } + } + } else { +#pragma unroll + for (uint32_t oh = ohmin; oh <= ohmax; ++oh) { + uint32_t fh = ih - oh * SH + PH; +#pragma unroll + for (uint32_t ow = owmin; ow <= owmax; ++ow) { + uint32_t fw = iw - ow * SW + PW; + const __half* pd = dst_grad_base + oh * OW + ow; + const 
__half* pf = flt + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++chl_mul) { + sum = fma(*pd, *pf, sum); + pd += OH * OW; + pf += FSIZE; + } + } + } + } + + src_grad[(n * (IC * IH) + ih) * IW + iw] = sum; + + if (ih == IH - 1 && iw == IW - 1 && n == N - 1) + break; + } + } + } +} +#endif + +#define sh param.stride_h +#define sw param.stride_w +#define SET_STRIDE(func, type, chl_mul, fh, fw) \ + if (sh == 1 && sw == 1) { \ + kern_ptr = func; \ + } else if (sh == 2 && sw == 2) { \ + kern_ptr = func; \ + } else { \ + kern_ptr = func; \ + } + +#define GET_KERN(func, type) \ + if (param.chl_mul == 1) { \ + if (param.flt_h == 3 && param.flt_w == 3) { \ + SET_STRIDE(func, type, 1, 3, 3); \ + } else if (param.flt_h == 5 && param.flt_w == 5) { \ + SET_STRIDE(func, type, 1, 5, 5); \ + } else if (param.flt_h == 7 && param.flt_w == 7) { \ + SET_STRIDE(func, type, 1, 7, 7); \ + } else { \ + SET_STRIDE(func, type, 0, 0, 0); \ + } \ + } else { \ + SET_STRIDE(func, type, 0, 0, 0); \ + } + +template +void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); + +template <> +void (*get_kern(const Param& param))(float*, const float*, const float*, + const Param) { + void (*kern_ptr)(float*, const float*, const float*, Param); + GET_KERN(kern_bwd_data_float, float); + return kern_ptr; +} + +#if CUDA_VERSION >= 9000 +template <> +void (*get_kern<__half>(const Param& param))(__half*, const __half*, + const __half*, const Param) { + void (*kern_ptr)(__half*, const __half*, const __half*, Param); + GET_KERN(kern_bwd_data_hf, __half); + return kern_ptr; +} +#endif + +template <> +void (*get_kern(const Param& param))(dt_float16*, const dt_float16*, + const dt_float16*, + const Param) { + void (*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param); + GET_KERN(kern_bwd_data_float, dt_float16); + return kern_ptr; +} + +#undef sh +#undef sw +#undef SET_STRIDE +#undef GET_KERN +} // anonymous namespace + +namespace megdnn { +namespace cuda { +namespace convolution { +namespace chanwise { + +template +void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, + const Param& param, cudaStream_t stream) { + void (*kern)(T*, const T*, const T*, Param); + kern = get_kern(param); + + int nr_thread = query_blocksize_for_kernel(kern), + nr_out_dimx = param.src_h * param.src_w * param.batch; + dim3 nr_block(param.src_chl, + std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); + uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); + kern<<>>(src_grad, dst_grad, flt, + param); + after_kernel_launch(); +} + +template void run_bwd_data(float*, const float*, const float*, const Param&, + cudaStream_t); + +#if CUDA_VERSION >= 9000 +template void run_bwd_data(__half*, const __half*, const __half*, const Param&, + cudaStream_t); +#endif + +template void run_bwd_data(dt_float16*, const dt_float16*, const dt_float16*, + const Param&, cudaStream_t); + +} // namespace chanwise +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen + diff --git a/dnn/src/cuda/convolution/chanwise/bwd_filter.cu b/dnn/src/cuda/convolution/chanwise/bwd_filter.cu new file mode 100644 index 00000000..6a317b86 --- /dev/null +++ b/dnn/src/cuda/convolution/chanwise/bwd_filter.cu @@ -0,0 +1,452 @@ +/** + * \file dnn/src/cuda/convolution/chanwise/bwd_filter.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" +#include "./kern_helper.cuh" +#include "src/cuda/cub/util_ptx.cuh" +#include "cuda_fp16.h" +#include "src/cuda/fp16_help.cuh" + +const uint32_t WARP_SIZE = 32, BATCH_UNROLL = 4; + +using namespace megdnn; +using namespace cuda; +using namespace convolution; +using namespace chanwise; + +namespace { + +/*! + * \brief compute grad w.r.t. filter + * + * block dim: out_id * kern_id + * threads with the same out_id computes grad for corresponding kernel element + * \tparam nr_thpf number of threads for one element in the filter; must be + * power of 2; + */ +template +__global__ void kern_bwd_filter_float( + T* flt_grad, const T* src, const T* dst_grad, Param param) { + + const uint32_t + N = param.batch, IC = param.src_chl, IH = param.src_h, IW = param.src_w, + CHL_MUL = param.chl_mul, + FH = param.flt_h, FW = param.flt_w, + PH = param.pad_h, PW = param.pad_w, + SH = param.stride_h, SW = param.stride_w, + OH = param.out_h, OW = param.out_w, + SRC_BATCH_STRIDE = IC * IH * IW, + DST_BATCH_STRIDE = IC * CHL_MUL * OH * OW, + BLKDIM_X = blockDim.x / nr_thpf, + THREADID_X = threadIdx.x / nr_thpf, + OUT_IDX = blockIdx.x * BLKDIM_X + THREADID_X; + + uint32_t ic, chl_mul, fh, fw; + { + uint32_t i = OUT_IDX; + i = div_mod(i, FW, fw); + i = div_mod(i, FH, fh); + i = div_mod(i, CHL_MUL, chl_mul); + ic = i; + } + if (ic >= IC) { + return; + } + src += ic * IH * IW; + dst_grad += (ic * CHL_MUL + chl_mul) * OH * OW; + + const uint32_t + oh_lo = max(int32_t(PH - fh + SH - 1), 0) / SH, + oh_hi = min((IH - 1 + PH - fh) / SH + 1, OH), + ow_lo = max(int32_t(PW - fw + SW - 1), 0) / SW, + ow_hi = min((IW - 1 + PW - fw) / SW + 1, OW), + oblk_h = oh_hi - oh_lo, + oblk_w = ow_hi - ow_lo, + oblk_tot = oblk_h * oblk_w * ((N + BATCH_UNROLL - 1) / BATCH_UNROLL), + tid = threadIdx.x % nr_thpf; + + if (IH + PH < fh + 1 || oh_lo >= oh_hi || + IW + PW < fw + 1 || ow_lo >= ow_hi) { + if (!tid) + flt_grad[OUT_IDX] = 0; + return; + } + + T sum(0); + for (uint32_t oblk_idx = tid; oblk_idx < oblk_tot; oblk_idx += nr_thpf) { + uint32_t n, oh, ow; + n = div_mod(div_mod(oblk_idx, oblk_w, ow), oblk_h, oh) * BATCH_UNROLL; + oh += oh_lo; + ow += ow_lo; + uint32_t ih = oh * SH - PH + fh, + iw = ow * SW - PW + fw, + soff = ih * IW + iw + n * SRC_BATCH_STRIDE, + doff = oh * OW + ow + n * DST_BATCH_STRIDE; +#pragma unroll + for (uint32_t i = 0; i < BATCH_UNROLL; ++ i) { + if (!i || n + i < N) { + sum += src[soff] * dst_grad[doff]; + } + soff += SRC_BATCH_STRIDE; + doff += DST_BATCH_STRIDE; + } + } + + if (nr_thpf == 1) { + flt_grad[OUT_IDX] = sum; + } else { + // reduce all sums in a block + extern __shared__ uint8_t shared_storage[]; + volatile T* thread_sum = reinterpret_cast(shared_storage); + thread_sum += THREADID_X * nr_thpf; + thread_sum[tid] = sum; +#pragma unroll + for (uint32_t i = nr_thpf / 2; i; i >>= 1) { + bool cond = nr_thpf >= i * 2 && tid < i; + if (i >= WARP_SIZE) { + __syncthreads(); + } else { + cub::WARP_SYNC(0xffffffff); + } + if (cond) { + T v0 = thread_sum[tid], v1 = v0 + thread_sum[tid + i]; + thread_sum[tid] = v1; + } + } + + if (!tid) { + flt_grad[OUT_IDX] = thread_sum[0]; + } + } +} + +#if CUDA_VERSION >= 9000 +template +__global__ void kern_bwd_filter_hf( + __half* flt_grad, const __half* src, const __half* dst_grad, Param param) { + const uint32_t + N = 
param.batch, IC = param.src_chl, IH = param.src_h, IW = param.src_w, + CHL_MUL = param.chl_mul, + FH = param.flt_h, FW = param.flt_w, + PH = param.pad_h, PW = param.pad_w, + SH = param.stride_h, SW = param.stride_w, + OH = param.out_h, OW = param.out_w, + SRC_BATCH_STRIDE = IC * IH * IW, + DST_BATCH_STRIDE = IC * CHL_MUL * OH * OW, + BLKDIM_X = (blockDim.x / nr_thpf) * 2, + THREADID_X = (threadIdx.x / nr_thpf) * 2, + OUT_IDX = blockIdx.x * BLKDIM_X + THREADID_X, + LAST_IDX = FH * FW * CHL_MUL * IC, + tid = threadIdx.x % nr_thpf; + __half2 sum2{0.0, 0.0}; + + if (OUT_IDX % FW != FW - 1) { + uint32_t ic, chl_mul, fh, fw; + { + uint32_t i = OUT_IDX; + i = div_mod(i, FW, fw); + i = div_mod(i, FH, fh); + i = div_mod(i, CHL_MUL, chl_mul); + ic = i; + } + if (ic >= IC) { + return; + } + src += ic * IH * IW; + dst_grad += (ic * CHL_MUL + chl_mul) * OH * OW; + + const uint32_t + oh_lo = max(int32_t(PH - fh + SH - 1), 0) / SH, + oh_hi = min((IH - 1 + PH - fh) / SH + 1, OH), + ow_lox = max(int32_t(PW - fw + SW - 1), 0) / SW, + ow_loy = max(int32_t(PW - fw + SW - 2), 0) / SW, + ow_hix = min((IW - 1 + PW - fw) / SW + 1, OW), + ow_hiy = min((IW - 2 + PW - fw) / SW + 1, OW), + oblk_h = oh_hi - oh_lo, + oblk_wx = ow_hix - ow_lox, + oblk_wy = ow_hiy - ow_loy; + if (IH + PH < fh + 1 || oh_lo >= oh_hi || IW + PW < fw + 1) { + if (!tid) { + flt_grad[OUT_IDX] = 0; + flt_grad[OUT_IDX + 1] = 0; + } + return; + } + + if (ow_lox >= ow_hix) { + if (!tid) + flt_grad[OUT_IDX] = 0; + } + + if (IW + PW < fw + 2 || ow_loy >= ow_hiy) { + if (!tid) + flt_grad[OUT_IDX + 1] = 0; + if (ow_lox >= ow_hix) + return; + } + + sum2 = {0.0, 0.0}; + __half2 src2{0.0, 0.0}; + __half2 dst2{0.0, 0.0}; + + const uint32_t + oblk_w = max(ow_hix, ow_hiy) - min(ow_lox, ow_loy), + oblk_tot = oblk_h * oblk_w * ((N + BATCH_UNROLL - 1) / BATCH_UNROLL); + + for (uint32_t oblk_idx = tid; oblk_idx < oblk_tot; oblk_idx += nr_thpf) { + uint32_t n_x, n_y, oh, ow_x, ow_y; + n_x = div_mod(div_mod(oblk_idx, oblk_wx, ow_x), oblk_h, oh) * BATCH_UNROLL; + n_y = div_mod(div_mod(oblk_idx, oblk_wy, ow_y), oblk_h, oh) * BATCH_UNROLL; + oh += oh_lo; + ow_x += ow_lox; + ow_y += ow_loy; + uint32_t ih = oh * SH - PH + fh, + iw_x = ow_x * SW - PW + fw, + iw_y = ow_y * SW - PW + fw + 1, + soff_x = ih * IW + iw_x + n_x * SRC_BATCH_STRIDE, + soff_y = ih * IW + iw_y + n_y * SRC_BATCH_STRIDE, + doff_x = oh * OW + ow_x + n_x * DST_BATCH_STRIDE, + doff_y = oh * OW + ow_y + n_y * DST_BATCH_STRIDE; +#pragma unroll + for (uint32_t i = 0; i < BATCH_UNROLL; ++ i) { + if (!i || n_x + i < N || n_y + i < N) { + src2.x = 0.0; + src2.y = 0.0; + dst2.x = 0.0; + dst2.y = 0.0; + if (n_x + i < N && ow_x < ow_hix) { + src2.x = src[soff_x]; + dst2.x = dst_grad[doff_x]; + } + if (n_y + i < N && ow_y < ow_hiy) { + src2.y = src[soff_y]; + dst2.y = dst_grad[doff_y]; + } + sum2 = fma2(src2, dst2, sum2); + } + soff_x += SRC_BATCH_STRIDE; + soff_y += SRC_BATCH_STRIDE; + doff_x += DST_BATCH_STRIDE; + doff_y += DST_BATCH_STRIDE; + } + } + } else { + for (size_t offset = 0; offset < 2; ++ offset) { + uint32_t ic, chl_mul, fh, fw; + { + uint32_t i = OUT_IDX + offset; + i = div_mod(i, FW, fw); + i = div_mod(i, FH, fh); + i = div_mod(i, CHL_MUL, chl_mul); + ic = i; + } + if (ic >= IC) { + if (offset == 0) + return; + else + break; + } + const uint32_t + oh_lo = max(int32_t(PH - fh + SH - 1), 0) / SH, + oh_hi = min((IH - 1 + PH - fh) / SH + 1, OH), + ow_lo = max(int32_t(PW - fw + SW - 1), 0) / SW, + ow_hi = min((IW - 1 + PW - fw) / SW + 1, OW), + oblk_h = oh_hi - oh_lo, + oblk_w = ow_hi - ow_lo, + 
oblk_tot = oblk_h * oblk_w * ((N + BATCH_UNROLL - 1) / BATCH_UNROLL); + + if (IH + PH < fh + 1 || oh_lo >= oh_hi || + IW + PW < fw + 1 || ow_lo >= ow_hi) { + if (!tid) + flt_grad[OUT_IDX + offset] = 0; + continue; + } + + __half sum(0.0); + + for (uint32_t oblk_idx = tid; oblk_idx < oblk_tot; oblk_idx += nr_thpf) { + uint32_t n, oh, ow; + n = div_mod(div_mod(oblk_idx, oblk_w, ow), oblk_h, oh) * BATCH_UNROLL; + oh += oh_lo; + ow += ow_lo; + uint32_t ih = oh * SH - PH + fh, + iw = ow * SW - PW + fw, + soff = ic * IH * IW + ih * IW + iw + n * SRC_BATCH_STRIDE, + doff = (ic * CHL_MUL + chl_mul) * OH * OW + oh * OW + ow + n * DST_BATCH_STRIDE; +#pragma unroll + for (uint32_t i = 0; i < BATCH_UNROLL; ++ i) { + if (!i || n + i < N) { + sum = fma(src[soff], dst_grad[doff], sum); + } + soff += SRC_BATCH_STRIDE; + doff += DST_BATCH_STRIDE; + } + } + if (!offset) + sum2.x = sum; + if (offset) + sum2.y = sum; + } + } + + if (nr_thpf == 1) { + flt_grad[OUT_IDX] = sum2.x; + if (OUT_IDX != LAST_IDX) + flt_grad[OUT_IDX + 1] = sum2.y; + } else { + extern __shared__ uint8_t shared_storage[]; + __half2* thread_sum = reinterpret_cast<__half2*>(shared_storage); + thread_sum += THREADID_X * nr_thpf / 2; + thread_sum[tid] = sum2; +#pragma unroll + for (uint32_t i = nr_thpf / 2; i; i >>= 1) { + bool cond = nr_thpf >= i * 2 && tid < i; + if (i >= WARP_SIZE) { + __syncthreads(); + } else { + cub::WARP_SYNC(0xffffffff); + } + if (cond) { + __half2 one = {1.0, 1.0}; + __half2 v0 = thread_sum[tid], v1 = fma2(v0, one, thread_sum[tid + i]); + thread_sum[tid] = v1; + } + } + + if (!tid) { + flt_grad[OUT_IDX] = thread_sum[0].x; + if (OUT_IDX != LAST_IDX) + flt_grad[OUT_IDX + 1] = thread_sum[0].y; + } + } +} +#endif + +#define GET_KERN(func, type) \ + switch(_p) { \ + case 1<<10: kern_ptr = func; break; \ + case 1<<9: kern_ptr = func; break; \ + case 1<<8: kern_ptr = func; break; \ + case 1<<7: kern_ptr = func; break; \ + case 1<<6: kern_ptr = func; break; \ + case 1<<5: kern_ptr = func; break; \ + case 1<<4: kern_ptr = func; break; \ + case 1<<3: kern_ptr = func; break; \ + case 1<<2: kern_ptr = func; break; \ + case 1<<1: kern_ptr = func; break; \ + case 1<<0: kern_ptr = func; break; \ + } + +template +void (*get_kern(const uint32_t& _p))(T*, const T*, const T*, Param); + +template <> +void (*get_kern(const uint32_t& _p))(float*, const float*, const float*, Param) { + void (*kern_ptr)(float*, const float*, const float*, Param) = NULL; + GET_KERN(kern_bwd_filter_float, float); + return kern_ptr; +} + +#if CUDA_VERSION >= 9000 +template <> +void (*get_kern<__half>(const uint32_t& _p))(__half*, const __half*, const __half*, Param) { + void (*kern_ptr)(__half*, const __half*, const __half*, Param) = NULL; + GET_KERN(kern_bwd_filter_hf, __half); + return kern_ptr; +} +#endif + +template <> +void (*get_kern(const uint32_t& _p))(dt_float16*, const dt_float16*, + const dt_float16*, Param) { + void (*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param) = NULL; + GET_KERN(kern_bwd_filter_float, dt_float16); + return kern_ptr; +} + +#undef GET_KERN +} // anonymous namespace + + +namespace megdnn { +namespace cuda { +namespace convolution { +namespace chanwise { +template +void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, + const Param ¶m, cudaStream_t stream) { + void (*kern)(T*, const T*, const T*, Param) = NULL; + uint32_t + nr_thread = query_blocksize_for_kernel(get_kern(1024)), + nr_thpf = std::min(nr_thread, + std::max( + 1, + param.out_h * param.out_w * param.batch / + (BATCH_UNROLL * 16))); 
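+    // nr_thpf is the number of threads cooperating on one filter element;
+    // the heuristic above aims at roughly BATCH_UNROLL * 16 output positions
+    // per thread. Illustrative example only (assuming nr_thread >= 12): with
+    // out_h = out_w = 7 and batch = 16, 7 * 7 * 16 / (4 * 16) = 12, which the
+    // CK() chain below rounds down to 8, since the in-kernel tree reduction
+    // requires nr_thpf to be a power of 2.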
+ // find nearest power-of-2 of nr_thpf + do { +#define CK(_n) \ + if (nr_thpf >= _n) { \ + kern = get_kern(_n); \ + nr_thpf = _n; \ + break; \ + } + CK(1<<10); + CK(1<<9); + CK(1<<8); + CK(1<<7); + CK(1<<6); + CK(1<<5); + CK(1<<4); + CK(1<<3); + CK(1<<2); + CK(1<<1); + CK(1<<0); +#undef CK + } while(0); + + megdnn_assert(kern); + nr_thread = query_blocksize_for_kernel(kern); + + uint32_t nr_flt_per_blk = nr_thread / nr_thpf; + while (nr_flt_per_blk * nr_thpf % WARP_SIZE) + --nr_flt_per_blk; + megdnn_assert(nr_flt_per_blk); + + int nr_block = DIVUP( + param.flt_h * param.flt_w * param.src_chl * param.chl_mul, + nr_flt_per_blk); + nr_thread = nr_flt_per_blk * nr_thpf; + uint32_t shared = nr_thread * 2 * sizeof(T); + kern <<< nr_block, nr_thread, shared, stream >>> ( + filter_grad, src, dst_grad, param); + after_kernel_launch(); +} + +template void run_bwd_filter(float*, const float*, const float*, const Param&, + cudaStream_t); + +#if CUDA_VERSION >= 9000 +template void run_bwd_filter(__half*, const __half*, const __half*, const Param&, + cudaStream_t); +#endif + +template void run_bwd_filter(dt_float16*, const dt_float16*, const dt_float16*, + const Param&, cudaStream_t); + +} // namespace chanwise +} // namespace convolution +} // namespace cuda +} // namespace megdnn + + +// vim: syntax=cuda.doxygen + diff --git a/dnn/src/cuda/convolution/chanwise/bwd_small.cu b/dnn/src/cuda/convolution/chanwise/bwd_small.cu new file mode 100644 index 00000000..3713f42f --- /dev/null +++ b/dnn/src/cuda/convolution/chanwise/bwd_small.cu @@ -0,0 +1,318 @@ +/** + * Copyright 2015 The TensorFlow Authors. All Rights Reserved. + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * \file dnn/src/cuda/convolution/chanwise/bwd_small.cu + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * -------------------------------------------------------------------------- + */ + +#include "./kern.cuh" +#include "./kern_helper.cuh" +#include "cuda.h" +#include "cuda_fp16.h" +#include "src/cuda/convolution/chanwise/launch_config.cuh" +#include "src/cuda/fp16_help.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; +using namespace chanwise; + +namespace { + +enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD }; + +// CUDA kernel to compute the depthwise convolution forward pass in NCHW format, +// tailored for small images up to 32x32. Stride and depth multiplier must be 1. 
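+// ("Small" in practice means the whole spatial image is covered by a single
+// thread block: the launcher below uses blockDim = (src_w, (src_h + 1) / 2,
+// kBlockDepth) and __launch_bounds__ caps a block at 1024 threads. Stride 1
+// and depth multiplier 1 correspond to param.stride_h == param.stride_w == 1
+// and param.chl_mul == 1.)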
+// Padding must be 'SAME', which allows to reuse the index computation. Only +// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true. +// Tiles of the input and filter tensors are loaded into shared memory before +// performing the convolution. Each thread handles two elements per iteration, +// one each in the lower and upper half of a tile. +// Backprop input direction is the same as forward direction with the filter +// rotated by 180°. +template +__global__ void +#if __CUDA_ARCH__ >= 750 +__launch_bounds__(1024, 1) +#else +__launch_bounds__(1024, 2) +#endif + DepthwiseConv2dGPUKernelNCHWSmall(const Param param, const T* input, + const T* filter, T* output) { + // Holds block plus halo and filter data for blockDim.z depths. + extern __shared__ __align__(8) unsigned char shared_memory[]; + static_assert(sizeof(T) <= 8, "Insufficient alignment detected"); + T* const shared_data = reinterpret_cast(shared_memory); + + const int num_batches = static_cast(param.batch); + const int in_height = static_cast(param.src_h); + const int in_width = static_cast(param.src_w); + const int in_depth = static_cast(param.src_chl); + const int filter_height = kKnownFilterHeight < 0 + ? static_cast(param.flt_h) + : kKnownFilterHeight; + const int filter_width = kKnownFilterWidth < 0 + ? static_cast(param.flt_w) + : kKnownFilterWidth; + const int pad_height = static_cast(param.pad_h); + const int pad_width = static_cast(param.pad_w); + + // Fixed blockDim.z, tailored for maximum grid size for images of size + // 16x16. assert(blockDim.x == param.src_w); assert(blockDim.z == + // kBlockDepth); + const int block_height = blockDim.y; + + // These values are the same for all threads and could + // be precomputed on the CPU. + const int block_pixels = in_width * block_height; + const int block_size = block_pixels * kBlockDepth; + const int in_pixels = in_width * in_height; + const int in_increment = in_width - 1; + const int filter_pixels = filter_height * filter_width; + const int tile_width = in_width + filter_width - 1; + const int even_height = kKnownEvenHeight || (1 & ~in_height); + const int tile_height = in_height + filter_height - even_height; + const int tile_pixels = tile_width * tile_height; + const int tile_size = tile_pixels * kBlockDepth; + const int tile_offset = block_height * tile_width; + const int pad_offset = pad_height * tile_width + pad_width; + const int in_total_depth = in_depth * num_batches; + const int in_blocks = (in_total_depth + kBlockDepth - 1) / kBlockDepth; + + const int thread_col = threadIdx.x; + const int thread_row = threadIdx.y; + const int thread_depth = threadIdx.z; + + // Position in block. + const int thread_pix = thread_row * in_width + thread_col; + const int thread_idx = thread_depth * block_pixels + thread_pix; + + // Initialize tile, in particular the padding. + for (int i = thread_idx; i < tile_size; i += block_size) { + shared_data[i] = T(); + } + __syncthreads(); + + // Position in tensors. + const int tensor_idx = thread_depth * in_pixels + thread_pix; + + // Position in (padded) shared memory. + const int data_pix = thread_row * tile_width + thread_col; + const int data_idx = thread_depth * tile_pixels + data_pix; + + // Position in shared memory, offset by pad_height / pad_width. + const int tile_idx = data_idx + pad_offset; + + // Filter is always in HWCK format, irrespective of the input/output format. 
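+    // Each thread stages at most one filter element into shared memory:
+    // thread_idx is decomposed into (filter_pix, filter_channel), and threads
+    // with filter_pix >= filter_pixels get filter_write_offset == 0, which
+    // doubles as the "nothing to write" sentinel below (valid filter slots
+    // start at tile_size > 0).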
+ const int filter_pix = thread_idx / kBlockDepth; + const int filter_channel = thread_idx % kBlockDepth; + + const int max_channel = in_total_depth - thread_depth; + const int filter_write_offset = + filter_pix < filter_pixels ? tile_size + thread_idx : 0; + const int filter_read_offset = + tile_size + thread_depth + + (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockDepth); + const bool skip_second = + !kKnownEvenHeight && thread_row + (in_height & 1) == block_height; + + for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { + const int channel = b * kBlockDepth; + + const int inout_offset = channel * in_pixels + tensor_idx; + const bool channel_in_range = channel < max_channel; + + if (channel_in_range) { + const T* const in_ptr = inout_offset + input; + T* const tile_ptr = tile_idx + shared_data; + tile_ptr[0] = *in_ptr; + if (!skip_second) { + tile_ptr[tile_offset] = *(block_pixels + in_ptr); + } + } + + if (filter_write_offset != 0) { + const int filter_offset = + (channel + filter_channel) % in_depth * filter_pixels + + filter_pix; + shared_data[filter_write_offset] = *(filter_offset + filter); + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); + + if (channel_in_range) { + T2 sum = {0.0, 0.0}; + int shared_offset = data_idx; + const T* filter_ptr = filter_read_offset + shared_data; +#pragma unroll + for (int r = 0; r < filter_height; ++r) { +#pragma unroll + for (int c = 0; c < filter_width; ++c) { + if (kDirection == DIRECTION_BACKWARD) { + filter_ptr -= kBlockDepth; + } + const T2 filter_value = {*filter_ptr, *filter_ptr}; + const T* const tile_ptr = shared_offset + shared_data; + const T2 tile_value = {tile_ptr[0], tile_ptr[tile_offset]}; + sum = fma2(filter_value, tile_value, sum); + ++shared_offset; + if (kDirection == DIRECTION_FORWARD) { + filter_ptr += kBlockDepth; + } + } + shared_offset += in_increment; + } + T* const out_ptr = inout_offset + output; + out_ptr[0] = static_cast(sum.x); + if (!skip_second) { + out_ptr[block_pixels] = static_cast(sum.y); + } + } + + // Note: the condition to reach this is uniform across the entire block. 
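+        // This trailing barrier keeps fast threads from overwriting the
+        // shared tile / filter data of the current iteration while other
+        // threads are still reading it in the convolution loop above.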
+ __syncthreads(); + } +} + +template +void LaunchDepthwiseConv2dGPUSmall(const Param& param, const T* input, + const T* filter, T* output, + cudaStream_t stream) { + const int block_height = (param.src_h + 1) / 2; + dim3 block_dim; + int block_count; + void (*kernel)(const Param, const T*, const T*, T*); + block_dim = dim3(param.src_w, block_height, kBlockDepth); + block_count = + DIVUP(param.batch * param.src_chl * param.chl_mul, kBlockDepth) * + kBlockDepth; + kernel = DepthwiseConv2dGPUKernelNCHWSmall< + T, T2, kDirection, kKnownFilterWidth, kKnownFilterHeight, + kBlockDepth, kKnownEvenHeight>; + const int tile_width = param.src_w + param.flt_w - 1; + const int tile_height = block_height * 2 + param.flt_h - 1; + const int tile_pixels = tile_height * tile_width; + const int filter_pixels = param.flt_h * param.flt_w; + const int shared_memory_size = + kBlockDepth * (tile_pixels + filter_pixels) * sizeof(T); + const int num_outputs = param.out_h * param.out_w * block_count; + + block_count = GetFixedBlockSize(num_outputs, kernel, shared_memory_size, + block_dim.x * block_dim.y * block_dim.z); + kernel<<>>( + param, input, filter, output); + after_kernel_launch(); +} + +template +void LaunchDepthwiseConv2dGPUSmall(const Param& param, const T* input, + const T* filter, T* output, + cudaStream_t stream) { + if (param.src_h & 1) { + return LaunchDepthwiseConv2dGPUSmall< + T, T2, kDirection, kKnownFilterWidth, kKnownFilterHeight, + kBlockDepth, false>(param, input, filter, output, stream); + } else { + return LaunchDepthwiseConv2dGPUSmall< + T, T2, kDirection, kKnownFilterWidth, kKnownFilterHeight, + kBlockDepth, true>(param, input, filter, output, stream); + } +} + +template +void LaunchDepthwiseConv2dGPUSmall(const Param& param, const T* input, + const T* filter, T* output, + cudaStream_t stream) { + // Maximize (power of two) kBlockDepth while keeping a block within 1024 + // threads (2 pixels per thread). 
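+    // E.g. (illustrative only): a 32x32 image gives block_pixels = 16 * 32 =
+    // 512 > 256, so kBlockDepth = 2 and the block has 512 * 2 = 1024 threads;
+    // a 16x16 image gives block_pixels = 8 * 16 = 128, so kBlockDepth = 8 and
+    // again 128 * 8 = 1024 threads.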
+ const int block_pixels = (param.src_h + 1) / 2 * param.src_w; + if (block_pixels > 256) { + LaunchDepthwiseConv2dGPUSmall( + param, input, filter, output, stream); + } else if (block_pixels > 128) { + LaunchDepthwiseConv2dGPUSmall( + param, input, filter, output, stream); + } else { + LaunchDepthwiseConv2dGPUSmall( + param, input, filter, output, stream); + } +} + +} // anonymous namespace + +namespace megdnn { +namespace cuda { +namespace convolution { +namespace chanwise { + +// ===================================bwd data================================== +#define LAUNCH(type, type2) \ + if (param.flt_h == 3 && param.flt_w == 3) { \ + LaunchDepthwiseConv2dGPUSmall< \ + type, type2, DepthwiseConv2dDirection::DIRECTION_BACKWARD, 3, \ + 3>(param, dst_grad, flt, src_grad, stream); \ + } else { \ + LaunchDepthwiseConv2dGPUSmall< \ + type, type2, DepthwiseConv2dDirection::DIRECTION_BACKWARD, -1, \ + -1>(param, dst_grad, flt, src_grad, stream); \ + } + +template <> +void run_bwd_data_small(float* src_grad, const float* dst_grad, + const float* flt, const Param& param, + cudaStream_t stream) { + LAUNCH(float, float2); +} + +#if CUDA_VERSION >= 9000 +template <> +void run_bwd_data_small(__half* src_grad, const __half* dst_grad, + const __half* flt, const Param& param, + cudaStream_t stream) { + LAUNCH(__half, __half2); +} +#endif +#undef LAUNCH + + +} // namespace chanwise +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/chanwise/kern.cuh b/dnn/src/cuda/convolution/chanwise/kern.cuh new file mode 100644 index 00000000..af19ad80 --- /dev/null +++ b/dnn/src/cuda/convolution/chanwise/kern.cuh @@ -0,0 +1,77 @@ +/** + * \file dnn/src/cuda/convolution/chanwise/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "src/cuda/utils.cuh" + +#include +#include + +#if MEGDNN_CC_HOST +#include "src/cuda/convolution/helper.h" +#endif + +namespace megdnn { +namespace cuda { +namespace convolution { +namespace chanwise { + + struct Param { + uint32_t batch, src_chl, src_h, src_w, + chl_mul, flt_h, flt_w, + out_h, out_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w; +#if MEGDNN_CC_HOST + static Param from_fwd_args(const ForwardSizeArgs &args) { +#define U(v) static_cast(v) + auto &&src = args.src_layout->shape; + auto &&dst = args.dst_layout->shape; + auto &&fm = args.filter_meta; + size_t c_pos, hw_pos; + if (fm.format == param::Convolution::Format::NCHW) { + c_pos = 1; + hw_pos = 2; + } else { + c_pos = 3; + hw_pos = 1; + } + return { + U(src[0]), U(src[c_pos]), U(src[hw_pos]), U(src[hw_pos+1]), + U(fm.ocpg), U(fm.spatial[0]), U(fm.spatial[1]), + U(dst[hw_pos]), U(dst[hw_pos+1]), + U(fm.padding[0]), U(fm.padding[1]), + U(fm.stride[0]), U(fm.stride[1]), + U(fm.dilation[0]), U(fm.dilation[1]), + }; +#undef U + } +#endif + }; + + template + void run_bwd_data_small(T *src_grad, const T *dst_grad, const T *flt, + const Param ¶m, cudaStream_t stream); + + template + void run_bwd_data(T *src_grad, const T *dst_grad, const T *flt, + const Param ¶m, cudaStream_t stream); + + template + void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, + const Param ¶m, cudaStream_t stream); + +} // namespace chanwise +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution/chanwise/kern_helper.cuh b/dnn/src/cuda/convolution/chanwise/kern_helper.cuh new file mode 100644 index 00000000..da97fdad --- /dev/null +++ b/dnn/src/cuda/convolution/chanwise/kern_helper.cuh @@ -0,0 +1,55 @@ +/** + * \file dnn/src/cuda/convolution/chanwise/kern_helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "src/cuda/query_blocksize.cuh" +#include "src/cuda/utils.cuh" +#include "megdnn/dtype.h" + +#include +#include +#include + +namespace megdnn { +namespace cuda { +namespace convolution { +namespace chanwise { + + /*! + * \brief return a / b and set mod to a % b + */ + __device__ __forceinline__ uint32_t div_mod( + uint32_t a, uint32_t b, uint32_t &mod) { + uint32_t ret = a / b; + mod = a - ret * b; + return ret; + } + + /*! 
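+     * \note this helper ends with __syncthreads(), so it has to be reached
+     *       by every thread of the block (do not call it under a divergent
+     *       branch)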
+ * \brief copy a 2D matrix by all threads in a block + * \param rs row stride + */ + template + __device__ __forceinline__ void block_memcpy( + T *dst, const T *src, uint32_t size) { + for (uint32_t i = threadIdx.x; i < size; i += blockDim.x) { + dst[i] = src[i]; + } + __syncthreads(); + } + +} // namespace chanwise +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen + diff --git a/dnn/src/cuda/convolution/chanwise/launch_config.cpp b/dnn/src/cuda/convolution/chanwise/launch_config.cpp new file mode 100644 index 00000000..abc27999 --- /dev/null +++ b/dnn/src/cuda/convolution/chanwise/launch_config.cpp @@ -0,0 +1,33 @@ +/** + * \file dnn/src/cuda/convolution/chanwise/launch_config.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/convolution/chanwise/launch_config.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +int chanwise::GetFixedBlockSize1(int work_element_count, const void* func, + int dynamic_shared_memory_size, + int fixed_block_size) { + int block_count = 0; + + cuda_check(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &block_count, func, fixed_block_size, dynamic_shared_memory_size)); + block_count = std::min( + block_count * cuda::current_device_prop().multiProcessorCount, + DIVUP(work_element_count, fixed_block_size)); + + return block_count; +} + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/chanwise/launch_config.cuh b/dnn/src/cuda/convolution/chanwise/launch_config.cuh new file mode 100644 index 00000000..daca3a9e --- /dev/null +++ b/dnn/src/cuda/convolution/chanwise/launch_config.cuh @@ -0,0 +1,35 @@ +/** + * \file dnn/src/cuda/convolution/chanwise/launch_config.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +namespace chanwise { + +int GetFixedBlockSize1(int work_element_count, const void* func, + int dynamic_shared_memory_size, int fixed_block_size); + +template +int GetFixedBlockSize(int work_element_count, DeviceFunc func, + int dynamic_shared_memory_size, int fixed_block_size) { + return GetFixedBlockSize1(work_element_count, + reinterpret_cast(func), + dynamic_shared_memory_size, fixed_block_size); +} + +} // namespace chanwise +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/cudnn_heuristic.cpp b/dnn/src/cuda/convolution/cudnn_heuristic.cpp new file mode 100644 index 00000000..04065b76 --- /dev/null +++ b/dnn/src/cuda/convolution/cudnn_heuristic.cpp @@ -0,0 +1,235 @@ +/** + * \file dnn/src/cuda/convolution/cudnn_heuristic.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./cudnn_heuristic.h" +#include "megdnn.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +bool convolution::PerformanceModelBase::args_is_proper( + const TensorLayout* x_layout, + const ConvolutionBase::CanonizedFilterMeta& filter) { + bool available = (x_layout->dtype == dtype::Float32() && + filter.format == param::Convolution::Format::NCHW && + filter.should_flip == 0 && filter.stride[0] == 1 && + filter.stride[1] == 1 && filter.spatial_ndim == 2 && + filter.dilation[0] == 1 && filter.dilation[1] == 1); + return available; +} + +bool convolution::PerformanceModelBase::predict_time_success( + const TensorLayout* x_layout, const ConvolutionBase::CanonizedFilterMeta& filter, + const ConvolutionType& conv_type, float** mask_p, float** time_pred_p, + size_t* output_dim_p) { + size_t layer_num; + const size_t* layers_dim; + size_t input_params[9]; + const float* matrices; + const float* biases; + const float* alpha; + const float* beta; + float* hidden_units; + + if (!(args_is_proper(x_layout, filter))) { + return false; + } + + if (!convolution::heuristic_params_available( + cuda::current_device_prop().major, + cuda::current_device_prop().minor, &layer_num, &layers_dim, + &matrices, &biases, &alpha, &beta, conv_type, &hidden_units, + time_pred_p, mask_p)) { + return false; + } + + input_params[0] = x_layout->shape[0]; + input_params[1] = x_layout->shape[1]; + input_params[2] = x_layout->shape[2]; + input_params[3] = x_layout->shape[3]; + input_params[4] = filter.ocpg; + input_params[5] = filter.spatial[0]; + input_params[6] = filter.spatial[1]; + input_params[7] = filter.padding[0]; + input_params[8] = filter.padding[1]; + + predict_time(layer_num, layers_dim, input_params, matrices, biases, alpha, + beta, hidden_units, *time_pred_p); + + *output_dim_p = layers_dim[layer_num - 1]; + + return true; +} + +void convolution::PerformanceModelBase::predict_time( + const size_t layer_num, const size_t* layers_dim, + const size_t* input_params, const float* matrices, const float* biases, + const float* alpha, const float* beta, float* hidden_units, + float* time_pred) { + size_t layer_ind; + size_t i, j; + const float *matrix_entry = matrices, *bias_entry = biases; + float *prev_entry, *next_entry = hidden_units; + size_t shape; + + for (j = 0; j < layers_dim[1]; ++j) { + for (i = 0; i < layers_dim[0]; ++i) { + next_entry[j] += + matrix_entry[j * layers_dim[0] + i] * input_params[i]; + } + next_entry[j] += bias_entry[j]; + next_entry[j] = element_ReLU(next_entry[j]); + } + prev_entry = next_entry; + next_entry += layers_dim[1]; + matrix_entry += layers_dim[0] * layers_dim[1]; + bias_entry += layers_dim[1]; + + for (layer_ind = 1; layer_ind < layer_num - 2; ++layer_ind) { + for (j = 0; j < layers_dim[layer_ind + 1]; ++j) { + for (i = 0; i < layers_dim[layer_ind]; ++i) { + next_entry[j] += matrix_entry[j * layers_dim[layer_ind] + i] * + prev_entry[i]; + } + next_entry[j] += bias_entry[j]; + next_entry[j] = element_ReLU(next_entry[j]); + } + prev_entry = next_entry; + next_entry += layers_dim[layer_ind + 1]; + matrix_entry += layers_dim[layer_ind] * layers_dim[layer_ind + 1]; + bias_entry += layers_dim[layer_ind + 1]; + } + + for (j = 0; j < layers_dim[layer_num - 2]; ++j) { + for (i = 
0; i < layers_dim[layer_num - 1]; ++i) { + time_pred[j] += matrix_entry[j * layers_dim[i]] * input_params[i]; + } + time_pred[j] += bias_entry[j]; + } + + shape = input_params[0] * input_params[1] * input_params[4] * + (input_params[2] + input_params[7] * 2 - input_params[5] + 1) * + (input_params[3] + input_params[8] * 2 - input_params[6] + 1) * + input_params[5] * input_params[6]; + for (i = 0; i < layers_dim[layer_num - 1]; ++i) { + time_pred[i] = std::exp2f(time_pred[i] * beta[i]) * (shape / alpha[i]); + } +} + +/* backward filter */ +void convolution::PerformanceModelBackwardFilter::gen_mask_backward_filter( + float* mask, const size_t output_dim, + const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args, + const CUDNNBwdFilterDescs& D, + const size_t workspace_size_limit_in_bytes) { + size_t i; + size_t workspace_size; + for (i = 0; i < output_dim; ++i) { + mask[i] = -1.0f; + auto cudnnStat = cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle->cudnn_handle(), D.src_desc.desc, D.diff_desc.desc, + D.conv_desc.desc, D.grad_desc.desc, + static_cast(i), + &workspace_size); + if (cudnnStat == CUDNN_STATUS_SUCCESS && + workspace_size < workspace_size_limit_in_bytes) { + mask[i] = 1.0f; + } + } +} + +bool convolution::PerformanceModelBackwardFilter:: + get_algo_backward_filter_success( + const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args, + const CUDNNBwdFilterDescs& D, + const size_t workspace_limit_in_bytes, + cudnnConvolutionBwdFilterAlgo_t* algo) { + float* mask; + size_t output_dim; + float* time_pred; + + if (!predict_time_success(args.src_layout, args.grad_filter_meta, + ConvolutionType::BACKWARD_FILTER, &(mask), + &(time_pred), &(output_dim))) { + return false; + } + + gen_mask_backward_filter(mask, output_dim, args, D, + workspace_limit_in_bytes); + + size_t i, selected = 0; + for (i = 0; i < output_dim; ++i) { + if (mask[i] > 0 && time_pred[i] < time_pred[selected]) { + selected = i; + } + } + *algo = static_cast(selected); + + return mask[selected] > 0; +} + +/* backward data */ +void convolution::PerformanceModelBackwardData::gen_mask_backward_data( + float* mask, const size_t output_dim, + const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, + const CUDNNBwdDataDescs& D, + const size_t workspace_size_limit_in_bytes) { + size_t i; + size_t workspace_size; + for (i = 0; i < output_dim; ++i) { + mask[i] = -1.0f; + auto cudnnStat = cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle->cudnn_handle(), D.filter_desc.desc, + D.diff_desc.desc, D.conv_desc.desc, D.grad_desc.desc, + static_cast(i), &workspace_size); + if (cudnnStat == CUDNN_STATUS_SUCCESS && + workspace_size < workspace_size_limit_in_bytes) { + mask[i] = 1.0f; + } + } +} + +bool convolution::PerformanceModelBackwardData::get_algo_backward_data_success( + const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, + const CUDNNBwdDataDescs& D, const size_t workspace_limit_in_bytes, + cudnnConvolutionBwdDataAlgo_t* algo) { + float* mask; + size_t output_dim; + float* time_pred; + + if (!predict_time_success(args.grad_layout, args.filter_meta, + ConvolutionType::BACKWARD_DATA, &mask, &time_pred, + &output_dim)) { + return false; + } + + gen_mask_backward_data(mask, output_dim, args, D, workspace_limit_in_bytes); + + size_t i, selected = 0; + for (i = 0; i < output_dim; ++i) { + if (mask[i] > 0 && time_pred[i] < time_pred[selected]) { + selected = i; + } + } + + // special case: + // if the filter shape in cudnnConvolutionBackwardData is too asymmetric, + // the performance of algo1 
is dramatically reduced, + // we temporarily choose algo0. + if (args.filter_meta.spatial[0] / args.filter_meta.spatial[1] > 32 || + args.filter_meta.spatial[1] / args.filter_meta.spatial[0] > 32) { + selected = 0; + } + *algo = static_cast(selected); + + return mask[selected] > 0; +} diff --git a/dnn/src/cuda/convolution/cudnn_heuristic.h b/dnn/src/cuda/convolution/cudnn_heuristic.h new file mode 100644 index 00000000..54cfc742 --- /dev/null +++ b/dnn/src/cuda/convolution/cudnn_heuristic.h @@ -0,0 +1,86 @@ +/** + * \file dnn/src/cuda/convolution/cudnn_heuristic.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "src/cuda/convolution/backward_data/algo.h" +#include "src/cuda/convolution/backward_filter/algo.h" + +namespace megdnn { +namespace cuda { +namespace convolution { + +enum class ConvolutionType { + FORWARD = 0, + BACKWARD_FILTER = 1, + BACKWARD_DATA = 2 +}; + +bool heuristic_params_available( + int cuda_major, int cuda_minor, size_t* layer_num_p, + const size_t** layers_dim_p, const float** matrices_p, + const float** biases_p, const float** alpha_p, const float** beta_p, + const ConvolutionType& conv_type, float** hidden_units_p, + float** time_pred_p, float** mask_p); + +class PerformanceModelBase { +public: + static float element_ReLU(float element) { + return element > 0.0 ? element : 0.0; + } + static bool predict_time_success(const TensorLayout* x_layout, + const ConvolutionBase::CanonizedFilterMeta& filter, + const ConvolutionType& conv_type, + float** mask_p, float** time_pred_p, + size_t* output_dim_p); + +private: + static bool args_is_proper( + const TensorLayout* x_layout, + const ConvolutionBase::CanonizedFilterMeta& filter); + static void predict_time(const size_t layer_num, const size_t* layers_dim, + const size_t* input_params, const float* matrices, + const float* biases, const float* alpha, + const float* beta, float* hidden_units, + float* time_pred); +}; + +class PerformanceModelBackwardFilter : public PerformanceModelBase { +public: + static bool get_algo_backward_filter_success( + const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args, + const CUDNNBwdFilterDescs& D, const size_t workspace_limit_in_bytes, + cudnnConvolutionBwdFilterAlgo_t* algo); + +private: + static void gen_mask_backward_filter( + float* mask, const size_t output_dim, + const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args, + const CUDNNBwdFilterDescs& D, + const size_t workspace_limit_in_bytes); +}; + +class PerformanceModelBackwardData : public PerformanceModelBase { +public: + static bool get_algo_backward_data_success( + const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, + const CUDNNBwdDataDescs& D, const size_t workspace_limit_in_bytes, + cudnnConvolutionBwdDataAlgo_t* algo); + +private: + static void gen_mask_backward_data( + float* mask, const size_t output_dim, + const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, + const CUDNNBwdDataDescs& D, const size_t workspace_limit_in_bytes); +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/convolution/get_params.cpp b/dnn/src/cuda/convolution/get_params.cpp new file mode 100644 index 00000000..8697223e --- 
/dev/null +++ b/dnn/src/cuda/convolution/get_params.cpp @@ -0,0 +1,754 @@ +/** + * \file dnn/src/cuda/convolution/get_params.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/convolution/cudnn_heuristic.h" +#include "megdnn.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +bool convolution::heuristic_params_available( + int cuda_major, int cuda_minor, size_t* layer_num_p, + const size_t** layers_dim_p, const float** matrices_p, + const float** biases_p, const float** alpha_p, const float** beta_p, + const ConvolutionType& conv_type, float** hidden_units_p, + float** time_pred_p, float** mask_p) { + MEGDNN_MARK_USED_VAR(cuda_major); + MEGDNN_MARK_USED_VAR(cuda_minor); + MEGDNN_MARK_USED_VAR(layer_num_p); + MEGDNN_MARK_USED_VAR(layers_dim_p); + MEGDNN_MARK_USED_VAR(matrices_p); + MEGDNN_MARK_USED_VAR(biases_p); + MEGDNN_MARK_USED_VAR(alpha_p); + MEGDNN_MARK_USED_VAR(beta_p); + MEGDNN_MARK_USED_VAR(conv_type); + MEGDNN_MARK_USED_VAR(hidden_units_p); + MEGDNN_MARK_USED_VAR(time_pred_p); + MEGDNN_MARK_USED_VAR(mask_p); + +#if CUDNN_MAJOR == 6 && CUDNN_MINOR == 0 + + float cuda5_2_BACKWARD_FILTER_time_pred[7] = {0.0f}; + float cuda5_2_BACKWARD_FILTER_mask[7] = {0.0f}; + float cuda5_2_BACKWARD_FILTER_hidden_units[24] = {0.0f}; + const static size_t cuda5_2_BACKWARD_FILTER_layers_dim[4] = {9, 12, 12, 7}; + const static float cuda5_2_BACKWARD_FILTER_matrices[336] = { + 3.499478e-03, 1.353932e-02, -1.316529e-01, 1.006798e-01, + 1.249662e-02, -3.591197e-01, -4.299506e-01, -3.613592e-01, + -3.783917e-01, -4.249511e-01, 6.287370e-03, -2.861480e-03, + 3.128614e-03, 8.496360e-03, 5.568272e-01, 1.965293e-01, + -6.205962e-02, -1.999864e-01, 9.333656e-03, -6.377945e-02, + 6.122595e-02, 1.122032e-01, -1.683744e-02, -9.395520e-02, + -2.953549e-02, -2.772853e-02, -2.892097e-02, 3.200796e-03, + 5.553298e-03, 6.707606e-01, 3.111190e-01, -5.293804e-01, + -8.127835e-02, -5.839296e-02, 9.633666e-02, 5.957389e-02, + -7.131222e-02, 4.057650e-02, 4.311656e-02, -1.456163e-02, + 5.683148e-02, 6.175192e-02, 9.331264e-02, 9.957494e-02, + 5.202487e-02, 0.0, 0.0, -7.725500e-14, + -8.058319e-17, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.988015e-04, -1.530555e-01, + 3.629641e-03, -1.238047e-03, 1.692593e-02, 3.404703e-01, + 5.441420e-01, -3.275000e-01, -3.742920e-01, -1.714999e-01, + 1.979161e-02, 5.019676e-02, 1.406423e-02, -4.360787e-02, + -5.948093e-03, 1.522342e-01, 1.012455e-02, 5.666151e-02, + -7.033888e-05, 1.519375e-02, -2.360136e-02, -5.682724e-04, + -2.552732e-02, 2.329080e-01, 3.437024e-01, 4.054402e-01, + 3.379739e-01, 1.566344e-03, 3.172801e-02, -1.336258e-02, + 1.401075e-02, 2.876163e-02, 1.293039e+00, 7.118387e-01, + 2.966451e-01, 4.372724e-01, -2.286311e-02, -6.896693e-03, + 3.156468e-02, 3.829155e-02, -9.890525e-04, 1.836302e-02, + 2.394343e-02, 4.963258e-02, 4.368515e-02, 2.950634e-03, + 1.129842e-02, 7.078686e-01, 3.193808e-01, 9.759862e-03, + 2.906150e-01, 1.806232e-01, 1.396071e-01, 2.047469e-01, + -2.561368e-01, -3.322504e-01, 7.250011e-02, -3.389789e-02, + -1.372720e-02, 0.0, -1.690562e-01, -1.013354e-01, + -1.920926e-02, 1.018956e-01, 2.467915e-02, 4.451101e-02, + -4.139300e-02, -1.031867e-02, -5.686982e-03, 2.993172e-01, + 1.746564e-02, 
-3.393853e-20, 1.905611e-02, -5.220098e-02, + 4.550828e-02, 8.211702e-02, -2.850403e-03, -2.816908e-01, + 6.826700e-02, -1.102444e-02, 7.373374e-03, 9.173237e-03, + -6.144243e-03, 0.0, -1.675391e-02, 2.949211e-02, + -1.925736e-02, 2.259453e-02, 6.339108e-02, -1.233638e-01, + -1.239254e-02, -9.204817e-03, -6.979109e-02, -2.015045e-02, + -1.624232e-02, 0.0, 8.557694e-02, -2.066801e-02, + 2.876340e-01, -1.265177e-01, 7.225822e-03, 7.337274e-02, + -4.342360e-02, -1.974944e-01, -6.721890e-03, -4.495411e-02, + -3.655335e-02, 0.0, -4.551398e-01, 8.440251e-02, + -2.404170e-01, 1.250752e-01, 1.646416e-03, 9.063166e-02, + 2.506036e-02, 8.455078e-03, -1.908465e-02, 6.791655e-03, + 2.511951e-02, 0.0, 7.265597e-03, -1.285137e-03, + -3.404747e-04, 8.924944e-03, 4.234224e-03, -1.186513e-02, + 2.454471e-02, 9.120111e-04, 2.120904e-02, -5.555666e-03, + -1.493565e-02, 0.0, 2.764972e-03, -6.132948e-04, + 6.180623e-03, 3.238724e-03, -1.073131e-02, -1.342798e-04, + 8.969568e-02, 1.010931e-01, -1.038349e-02, -9.198243e-02, + 4.724314e-02, 0.0, 1.175188e-02, -6.051729e-02, + -2.525244e-03, -1.566657e-01, -1.447370e-02, 1.747005e-01, + 1.078679e-01, 2.556116e-01, 3.880575e-02, 9.777729e-03, + 1.078563e-01, 0.0, 4.525005e-01, 8.311278e-03, + 8.198996e-02, -2.884443e-01, -1.808732e-02, -3.114621e-02, + 1.732809e-02, 2.442103e-01, 3.329617e-02, 8.462872e-03, + 6.775563e-02, -7.453864e-19, 1.846050e-01, 2.739331e-02, + 1.029433e-01, -2.251960e-01, 3.331415e-02, -2.261097e-02, + 3.815529e-02, -5.755350e-02, -8.908589e-03, -4.526101e-02, + 1.555560e-02, 0.0, 2.347023e-02, -1.399980e-01, + -2.699343e-02, 2.168779e-02, 2.629133e-03, 3.232189e-02, + 3.693172e-02, -9.767429e-02, 2.461806e-02, 1.045579e-01, + 5.808600e-02, 0.0, -1.331031e-02, 3.555656e-03, + -9.530113e-02, -1.961061e-02, -1.579800e-02, -7.582582e-02, + -3.099381e-02, 9.698432e-01, 7.805698e-01, 1.542833e-01, + -1.025053e-01, -7.509316e-04, -1.675225e-02, -7.818724e-03, + -2.718012e-01, 8.506276e-01, 3.869322e-02, 2.732933e-02, + -4.932691e-02, 7.077541e-01, 4.385699e-01, 8.550947e-02, + -1.737943e-01, -1.007005e-02, 1.884576e-02, 6.328513e-02, + -2.711761e-01, 1.054725e+00, -1.001195e-02, 6.876359e-02, + -4.647969e-01, 7.618478e-01, 1.170148e+00, 5.507177e-02, + -1.284761e-01, 2.255174e-02, 5.041638e-03, 2.431494e-01, + -2.259419e-01, 6.318219e-01, 4.526694e-02, -1.068190e-01, + 9.181661e-05, 7.900977e-01, 5.499427e-01, 2.147153e-02, + -1.855706e-01, -6.816355e-03, 2.600182e-02, 7.784649e-02, + -2.902775e-01, 9.821153e-01, -1.705817e-02, 9.162355e-02, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 4.378970e-02, 7.106545e-01, 5.512720e-01, 1.076883e-01, + -3.036375e-01, 4.190212e-02, -1.192542e-02, 1.002918e-01, + -2.498885e-01, 6.789825e-01, -1.278644e-01, 8.962566e-02, + -4.231460e-02, 2.334089e-01, 3.083326e-03, 2.404322e-02, + -2.668908e-01, 3.057625e-03, -1.283901e-03, 1.349618e-02, + -4.993697e-02, 6.061308e-01, -9.689163e-02, 1.609056e-01}; + const static float cuda5_2_BACKWARD_FILTER_biases[31] = { + 3.927711e-02, 4.658543e-01, 3.737917e-02, -4.173907e-02, + 6.516264e-04, 0.0, 1.141180e+00, 5.656777e-03, + -1.466774e-01, -3.637813e-01, 3.348432e-02, -2.374912e-01, + 1.856181e-01, 1.458818e+00, 1.436140e+00, 1.708800e-01, + 3.663654e-02, 2.147604e-02, 5.249544e-02, 9.389526e-02, + -7.182905e-02, 2.513293e+00, -6.255527e-02, -1.452608e-01, + -7.379941e-01, -5.884537e-01, -6.324590e-01, -6.180407e-01, + 0.0, -1.712828e-01, -2.353933e-01}; + const static float cuda5_2_BACKWARD_FILTER_alpha[7] = { + 2.189385e+08, 1.987406e+08, 
6.368552e+07, 2.164986e+08, + 2.000000e+08, 3.611623e+08, 8.509315e+06}; + const static float cuda5_2_BACKWARD_FILTER_beta[7] = { + 1.558573e+00, 1.825239e+00, 1.782366e+00, 1.772095e+00, + 2.000000e+00, 1.856787e+00, 1.625270e+00}; + + float cuda5_2_BACKWARD_DATA_time_pred[6] = {0.0f}; + float cuda5_2_BACKWARD_DATA_mask[6] = {0.0f}; + float cuda5_2_BACKWARD_DATA_hidden_units[24] = {0.0f}; + const static size_t cuda5_2_BACKWARD_DATA_layers_dim[4] = {9, 12, 12, 6}; + const static float cuda5_2_BACKWARD_DATA_matrices[324] = { + 1.090385e-03, -9.525486e-02, 2.116694e-02, 8.324536e-03, + 2.443915e-03, -1.486993e-03, 1.996945e-01, -3.490458e-02, + -2.909729e-01, -4.403929e-01, 3.302580e-03, 8.758115e-03, + 2.016278e-03, 5.139519e-03, 6.631530e-01, 4.163170e-01, + -2.275565e-01, -1.927734e-01, 4.901680e-02, 3.499708e-02, + 4.430823e-02, -6.245822e-01, 2.489910e-02, 2.943479e-01, + 3.011928e-01, -6.154800e-02, -6.945755e-02, 2.156114e-02, + -2.706529e-02, 2.254039e-02, -2.130969e-01, -1.711698e-03, + 3.185264e-01, 3.669779e-01, 2.366176e-01, 2.016553e-01, + 1.742197e-04, 8.993217e-04, -3.757331e-01, -1.517802e-01, + 1.150989e-03, 4.397022e-01, 2.472478e-01, -5.120142e-01, + -5.310764e-01, -2.185705e-02, -1.019608e-02, -1.484592e-01, + -1.720972e-01, 3.073631e-02, 1.679189e-02, 9.030435e-03, + -4.171251e-03, -7.412981e-03, 3.670006e-02, 2.704583e-02, + 1.162922e-01, 8.629673e-02, -1.661878e-01, -1.722751e-01, + -2.494859e-01, 6.303188e-02, 2.379866e-03, -9.154570e-02, + -8.703206e-02, 3.478937e-02, 2.733189e-02, -6.598901e-02, + -2.212522e-02, -3.853705e-02, 2.827537e-02, 2.944724e-02, + 1.588451e-02, 2.663488e-02, 1.933236e-02, 3.978135e-02, + 1.509624e-02, 1.144023e+00, 7.680039e-01, 4.072323e-01, + 3.243737e-01, 4.177893e-02, 4.054888e-02, 1.758260e-01, + 1.351026e-01, 2.773634e-02, 8.728213e-02, 1.938261e-01, + -1.641249e-02, 7.889663e-02, 4.266707e-04, 6.022587e-04, + 6.884130e-04, 2.244700e-04, -3.188357e-01, 1.903596e-01, + 3.979538e-01, -2.875198e-01, -5.881550e-01, -1.732513e-02, + 2.107770e-02, -2.415357e-02, 5.184836e-02, 2.633666e-03, + -4.351313e-01, -3.523280e-01, -1.124865e-01, -5.509025e-02, + -2.874137e-03, -2.260433e-03, 5.087418e-03, 2.825674e-03, + 4.565214e-03, 1.520132e-03, -1.722531e-03, -1.287867e-04, + 1.223576e-03, -5.230475e-04, -2.300250e-03, -6.684309e-03, + -7.956048e-03, -3.028432e-03, 2.238011e-02, -1.166453e-02, + 6.994838e-02, 5.585106e-03, -9.814836e-03, -4.010206e-03, + -3.232308e-03, -1.020571e-02, -1.587651e-02, 6.942352e-02, + 6.370817e-01, 5.906755e-02, -3.062441e-03, 9.914325e-02, + 2.335527e-01, -4.718621e-03, -2.132248e-02, 3.841487e-02, + 7.563891e-02, -7.599686e-02, 1.408871e-01, 5.740594e-02, + 1.902002e-01, 2.145507e-01, 3.427162e-02, 3.367433e-02, + 2.967569e-01, 2.863470e-02, 3.392174e-02, 3.514072e-02, + -1.441963e-01, -4.797359e-02, -5.965770e-03, 1.214167e-01, + 0.0, 0.0, -4.498340e-06, -1.828862e-07, + 0.0, 1.093948e-12, -2.601859e-06, 0.0, + -9.811162e-09, -2.785148e-06, 0.0, -2.360134e-27, + -1.110723e-01, -1.570218e-01, -4.062234e-02, -7.606770e-02, + 5.144730e-01, 9.398572e-02, 1.906881e-01, 1.747961e-02, + 1.106279e-01, -1.254419e-01, 6.205062e-01, -5.617496e-02, + -1.629532e-01, -1.042091e-01, -1.413646e-01, 1.433934e-01, + 1.425548e-01, 2.505819e-02, 5.484238e-04, -9.254320e-02, + 1.448994e-01, 3.132954e-02, -1.425708e-01, -1.685494e-02, + -3.513211e-01, -1.992232e-01, -1.081804e-01, 4.960524e-02, + -5.546688e-01, -1.675645e-02, -3.610602e-02, 2.780567e-02, + 2.227647e-01, 4.038066e-02, -6.002745e-01, -1.275032e-01, + -1.026016e-01, 
-2.635376e-01, 2.059869e-02, -8.100250e-02, + 8.695480e-02, -4.293829e-02, -1.870224e-02, 7.269356e-02, + 3.979762e-02, 3.270284e-02, 1.190808e-01, -1.059370e-01, + 1.286611e-02, 3.927987e-02, 7.228687e-03, 2.264480e-02, + -1.119717e-01, 8.701903e-02, 2.064170e-02, 5.297894e-02, + 9.965703e-03, 1.206108e-02, -5.411500e-02, -5.476563e-02, + -1.837980e-01, -7.351980e-01, -1.781217e-01, 1.473823e-01, + -4.530039e-01, -3.604104e-02, 2.418269e-02, 2.903621e-02, + 4.367216e-01, -5.112789e-02, -3.706729e-01, -2.049569e-01, + -9.153855e-02, -1.008104e-01, -1.009935e-02, -1.033947e-01, + 5.495172e-02, 1.323372e-02, -5.191914e-02, -1.545710e-02, + 3.271207e-02, 1.939050e-02, -3.092350e-02, 7.518642e-02, + -5.528467e-03, 8.568556e-02, 1.924936e-02, 1.007434e+00, + -6.850208e-07, 5.599304e-01, 3.076834e-01, -4.312680e-01, + 7.534813e-02, -3.293671e-02, 5.830373e-03, -2.450454e-02, + -3.698347e-04, -8.712796e-03, 4.009782e-01, 1.215293e+00, + -5.273760e-07, 2.344936e-01, 1.927198e-01, -3.006327e-01, + -2.927265e-02, -8.696410e-03, -2.446414e-02, 1.890189e-02, + 3.553152e-03, -1.651816e-02, 2.438239e-01, 6.245783e-01, + 1.809883e-07, 3.264363e-01, 7.772639e-01, -2.954962e-01, + 2.704587e-02, -3.836469e-02, -4.457633e-01, 1.726713e-02, + 5.172309e-03, 1.289187e-02, 5.472647e-01, 6.243305e-01, + -4.123602e-08, 4.334479e-01, 7.573158e-02, -2.572208e-01, + 5.492910e-02, -9.502222e-03, -2.104075e-01, -3.131663e-02, + 2.312713e-03, 3.963990e-02, 4.713630e-01, 8.256559e-01, + -2.583514e-08, 4.528451e-01, 7.318445e-02, -2.987004e-01, + 8.577114e-02, -2.907754e-02, -5.389895e-02, 8.495960e-02, + -1.558219e-04, 3.880079e-02, 4.180317e-01, 5.884213e-01, + 3.963620e-07, 4.769594e-01, 3.800152e-01, -3.191836e-01, + -1.669163e-01, 8.362461e-04, -1.668053e-01, -9.146041e-02}; + const static float cuda5_2_BACKWARD_DATA_biases[30] = { + 1.238052e+00, 7.745910e-01, 3.356679e-01, -7.175566e-02, + 1.497247e+00, 3.300638e-03, 2.789130e-01, -8.312362e-02, + -7.829870e-02, -3.456568e-01, 1.328189e+00, -2.689771e-01, + 9.444705e-03, -1.149580e-01, 4.422197e-01, 2.072980e+00, + 0.0, 4.782698e-01, -1.116326e+00, 7.193607e-01, + 2.938375e-02, 1.465170e-02, 8.513468e-02, 6.830001e-02, + 4.035618e-01, 1.607704e-01, 9.502214e-01, 6.022118e-01, + 2.584324e-01, 7.981322e-01}; + const static float cuda5_2_BACKWARD_DATA_alpha[6] = { + 1.997689e+08, 3.799992e+08, 6.843723e+07, 1.140762e+08, + 5.562133e+08, 3.324116e+08}; + const static float cuda5_2_BACKWARD_DATA_beta[6] = { + 1.537834e+00, 1.587649e+00, 1.844705e+00, 1.671656e+00, + 1.672516e+00, 1.705950e+00}; + + float cuda5_2_FORWARD_time_pred[8] = {0.0f}; + float cuda5_2_FORWARD_mask[8] = {0.0f}; + float cuda5_2_FORWARD_hidden_units[24] = {0.0f}; + const static size_t cuda5_2_FORWARD_layers_dim[4] = {9, 12, 12, 8}; + const static float cuda5_2_FORWARD_matrices[348] = { + -9.209032e-02, -1.659105e-01, -5.965192e-02, -2.153863e-02, + 8.719379e-02, -3.499233e-02, 7.201853e-03, -1.419160e-02, + -1.818457e-04, -3.145495e-01, 1.526620e-03, -3.928741e-03, + -2.569693e-03, 3.410484e-03, 2.167806e-01, 1.747067e-01, + -2.598841e-01, -3.055519e-01, 5.274500e-04, -9.025287e-03, + -2.483256e-02, 4.541647e-02, 7.308841e-02, -4.819591e-01, + -4.753071e-01, -1.471946e-02, 5.257137e-03, 2.392092e-03, + -1.222254e-02, 1.609546e-02, -3.770980e-03, 1.646060e-02, + 1.753314e-02, 1.508273e-02, 9.316003e-03, -5.777596e-04, + -2.694935e-05, 1.604315e-03, -1.762570e-02, -4.887820e-01, + 4.957791e-03, 2.363977e-01, 3.638881e-01, -4.731908e-01, + -5.269557e-01, -1.159047e-03, 1.838379e-02, -1.427773e-01, + 
-1.495254e-01, 1.330812e-01, 3.283872e-01, 3.582126e-01, + -1.175109e-01, -1.454948e-01, 2.369200e-02, 1.493328e-02, + 3.108240e-02, 3.270133e-02, -6.615507e-01, 3.380858e-01, + 3.704230e-01, 8.769190e-02, -6.377754e-02, 4.325379e-02, + -2.027540e-03, -1.402376e-01, -9.008316e-02, -2.559709e-03, + -8.711295e-02, -9.627704e-02, -1.539383e-01, -1.632525e-01, + 3.015039e-02, 3.144164e-02, 6.656437e-02, 5.488716e-02, + 1.877632e-01, 5.748791e-01, 3.917130e-01, 2.071713e-01, + 2.771358e-01, -5.960735e-02, 1.106716e-02, 5.781374e-02, + 6.840285e-03, 2.902341e-02, -3.347534e-01, -1.212164e-01, + -8.089989e-02, -1.384973e-01, 1.251527e-02, -2.644526e-01, + 6.949010e-02, 2.681785e-02, 1.081700e-01, -3.502952e-02, + 3.512865e-01, -9.033766e-02, 2.017496e-02, 2.095562e-02, + 1.330583e-02, 2.582395e-02, -2.550245e-03, -1.596605e-03, + -4.966798e-01, -5.384876e-01, -3.006902e-01, -2.735094e-01, + 2.044184e-02, 3.490414e-01, 1.717040e-02, 6.914880e-03, + 1.496788e-02, -7.078647e-02, 6.652176e-02, 6.768194e-03, + -3.086404e-02, 1.317981e-01, -5.902661e-02, -8.681632e-02, + -6.622906e-02, 1.597742e-01, 3.700355e-03, 1.707309e-02, + -5.229016e-02, 2.836531e-02, 9.072421e-03, -1.104825e-01, + 1.009224e-02, -1.915519e-02, -2.592222e-02, -9.112109e-02, + -2.824950e-02, 5.274639e-01, 1.052709e-01, 1.325189e-02, + 3.486569e-01, 1.155336e-01, 7.854062e-02, 1.637263e-02, + -1.599528e-01, 1.090762e-01, 2.625560e-02, 8.724683e-02, + 3.858089e-02, -5.696925e-01, -2.280933e-01, -3.096054e-02, + -5.547203e-01, -6.229282e-02, -1.009606e-01, 5.365341e-02, + 1.673071e-01, -1.734997e-01, -2.949879e-02, -2.640804e-01, + 4.783161e-02, -4.411741e-01, -1.495569e-01, -1.043236e-02, + -2.952088e-01, -2.866718e-02, 4.253592e-02, 3.828135e-02, + 7.448777e-02, -2.757399e-02, -6.067163e-02, -2.007495e-01, + -3.468005e-02, -1.678551e-01, -2.086982e-02, -2.114448e-02, + -2.844830e-02, 3.823385e-03, 8.453450e-03, 1.447659e-03, + 5.760803e-02, 7.803936e-02, -7.363023e-02, -1.894736e-03, + 6.325649e-02, 1.527100e-02, -4.378622e-02, 3.171223e-03, + 8.858634e-01, 7.191087e-02, 2.045580e-01, -3.890414e-03, + -7.661989e-02, 2.667563e-02, -2.549908e-02, -9.384236e-02, + -4.146666e-02, 2.281848e-01, 7.052436e-02, 1.180828e-03, + 1.976338e-01, 1.647339e-02, -2.741527e-02, 1.641885e-02, + -1.197201e-01, -3.670282e-02, 1.672286e-01, 5.267144e-02, + 8.803396e-02, 4.463083e-01, -8.939818e-03, 4.523633e-03, + -1.554685e-01, -1.392173e-02, 4.290194e-03, -9.498623e-03, + -2.200229e-02, -1.022839e-01, 1.553784e-02, 4.006403e-02, + -8.901481e-02, 1.353742e-01, -6.176645e-02, 2.818892e-03, + 4.842044e-02, 1.031219e-02, 4.689164e-03, 2.677023e-01, + -1.331718e-02, 2.130043e-01, 7.004514e-03, -5.422973e-01, + 7.450043e-03, 4.017003e-01, -9.216257e-03, -2.551504e-02, + -2.416791e-01, -1.451814e-01, -1.796521e-01, -1.749250e-02, + 9.023457e-02, 9.444007e-02, -5.293583e-03, -1.027239e-01, + 1.017421e-02, 1.213706e-01, -3.460192e-02, 8.999067e-03, + -1.110771e-01, 2.168397e-01, -4.417743e-02, 8.891370e-02, + -1.271863e-01, -7.239018e-02, -1.346174e-02, 5.777563e-02, + 7.088694e-02, 6.467304e-02, 7.867605e-01, -2.014701e-01, + 1.461604e-01, -6.571004e-02, 6.528026e-01, 6.720600e-01, + 4.151264e-01, -6.271046e-03, -1.568682e-02, 2.438027e-01, + 6.112317e-02, 8.257028e-02, 8.817917e-01, -1.995129e-01, + 9.260281e-02, -6.511735e-02, 6.132895e-01, 5.789503e-01, + 3.354024e-01, 1.621681e-04, -1.380093e-02, 2.710598e-01, + 1.104726e-01, 5.625401e-02, 7.417016e-01, -2.523506e-01, + 1.436054e-01, -7.903862e-02, 5.858161e-01, 4.287509e-01, + 5.370684e-01, 
-9.449220e-02, -9.393471e-03, 3.037375e-01, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 4.773019e-01, -2.101818e-02, 2.934896e-01, -4.207794e-01, + -2.892000e-01, -1.383682e-02, 3.842597e-01, 5.408122e-01, + -1.901669e-01, -5.255229e-02, 3.103573e-01, 7.447528e-01, + 1.010295e-01, 5.580491e-03, 4.166604e-01, -2.997382e-01, + -3.115629e-01, -2.585651e-02, 5.481771e-01, 6.307158e-01, + 4.869811e-01, 6.668988e-01, -8.661555e-02, 6.073793e-01, + 6.002924e-03, 1.855917e-02, 5.628079e-01, -1.967446e-01, + -1.365761e-01, -3.095432e-02, 6.461580e-01, 7.712716e-01, + 4.082011e-01, 8.834770e-02, -1.021050e-01, 4.353123e-01, + 2.292985e-01, -6.493770e-02, 2.730630e-01, -3.267927e-01, + -3.408634e-01, -6.609171e-02, 5.608538e-01, 7.108021e-01, + 3.760323e-01, 3.335001e-01, 8.168215e-02, 2.858790e-01}; + const static float cuda5_2_FORWARD_biases[32] = { + -1.021053e-02, 1.398318e+00, -2.447664e-01, 2.701163e-02, + 1.148165e+00, 6.030037e-01, 2.089586e-01, 5.609234e-02, + -4.842668e-01, 1.262153e-01, 2.643087e-01, 3.539835e-01, + 4.792117e-01, 4.310244e-02, 1.993983e+00, 2.597207e-01, + -2.811204e-01, 7.933383e-02, 1.056050e+00, 1.234862e+00, + 7.894841e-01, 2.019784e-01, -1.216166e-01, 8.840314e-01, + -3.542692e-01, -3.693904e-01, -2.181383e-01, 0.0, + -2.216420e-01, -1.602890e-01, 8.500483e-03, 2.072607e-01}; + const static float cuda5_2_FORWARD_alpha[8] = { + 2.549612e+08, 3.579459e+08, 1.927015e+08, 2.000000e+08, + 3.222185e+07, 8.748824e+07, 6.676129e+08, 2.775480e+08}; + const static float cuda5_2_FORWARD_beta[8] = { + 1.463412e+00, 1.553222e+00, 1.515109e+00, 2.000000e+00, + 2.117807e+00, 1.622262e+00, 1.626601e+00, 1.669380e+00}; + + if (conv_type == ConvolutionType::BACKWARD_FILTER && cuda_major == 5 && + cuda_minor == 2) { + *layer_num_p = 4; + *hidden_units_p = cuda5_2_BACKWARD_FILTER_hidden_units; + *layers_dim_p = cuda5_2_BACKWARD_FILTER_layers_dim; + *matrices_p = cuda5_2_BACKWARD_FILTER_matrices; + *biases_p = cuda5_2_BACKWARD_FILTER_biases; + *alpha_p = cuda5_2_BACKWARD_FILTER_alpha; + *beta_p = cuda5_2_BACKWARD_FILTER_beta; + *time_pred_p = cuda5_2_BACKWARD_FILTER_time_pred; + *mask_p = cuda5_2_BACKWARD_FILTER_mask; + } else if (conv_type == ConvolutionType::BACKWARD_DATA && cuda_major == 5 && + cuda_minor == 2) { + *layer_num_p = 4; + *hidden_units_p = cuda5_2_BACKWARD_DATA_hidden_units; + *layers_dim_p = cuda5_2_BACKWARD_DATA_layers_dim; + *matrices_p = cuda5_2_BACKWARD_DATA_matrices; + *biases_p = cuda5_2_BACKWARD_DATA_biases; + *alpha_p = cuda5_2_BACKWARD_DATA_alpha; + *beta_p = cuda5_2_BACKWARD_DATA_beta; + *time_pred_p = cuda5_2_BACKWARD_DATA_time_pred; + *mask_p = cuda5_2_BACKWARD_DATA_mask; + } else if (conv_type == ConvolutionType::FORWARD && cuda_major == 5 && + cuda_minor == 2) { + *layer_num_p = 4; + *hidden_units_p = cuda5_2_FORWARD_hidden_units; + *layers_dim_p = cuda5_2_FORWARD_layers_dim; + *matrices_p = cuda5_2_FORWARD_matrices; + *biases_p = cuda5_2_FORWARD_biases; + *alpha_p = cuda5_2_FORWARD_alpha; + *beta_p = cuda5_2_FORWARD_beta; + *time_pred_p = cuda5_2_FORWARD_time_pred; + *mask_p = cuda5_2_FORWARD_mask; + } else { + return false; + } + return true; +#endif +#if CUDNN_MAJOR == 5 && CUDNN_MINOR == 1 + + float cuda5_2_FORWARD_time_pred[9] = {0.0f}; + float cuda5_2_FORWARD_mask[9] = {0.0f}; + float cuda5_2_FORWARD_hidden_units[24] = {0.0f}; + const static size_t cuda5_2_FORWARD_layers_dim[4] = {9, 12, 12, 9}; + const static float cuda5_2_FORWARD_matrices[360] = { + 3.087359e-03, -2.629997e-01, 9.492566e-02, 4.831330e-02, + 4.493726e-02, 
-3.714851e-04, 8.981445e-02, -4.888808e-02, + -7.350665e-02, -7.113249e-01, 2.111573e-02, 6.259846e-02, + 2.931650e-02, 1.313162e-01, 1.926165e-02, 3.785147e-01, + 1.765169e-01, 6.096475e-02, 4.104461e-03, 8.656193e-03, + 1.102456e-02, 7.944959e-03, 4.644261e-02, -5.927094e-01, + -6.180425e-01, -4.314502e-01, -4.073743e-01, 3.077646e-02, + -1.029431e-01, 5.112506e-02, -8.541957e-02, 2.589677e-02, + -5.164597e-02, 1.186986e-01, -4.672555e-02, -6.755380e-02, + -2.806628e-04, 1.056535e-02, -1.438679e-01, -1.122842e-01, + 5.779694e-02, 1.705828e-01, 3.862250e-01, -1.106681e-01, + -5.471609e-02, -2.316525e-02, -4.610147e-02, 2.021985e-03, + -5.761939e-03, 1.209045e-01, -7.279532e-02, 9.754839e-02, + -6.032932e-02, -1.589997e-02, 1.985070e-03, 2.788936e-03, + -2.104690e-01, -2.731634e-01, 1.189841e-02, 2.144678e-01, + 1.771111e-01, -3.730702e-01, -3.886393e-01, -4.719765e-06, + -2.289832e-22, 0.0, 0.0, -7.619362e-33, + 0.0, 0.0, 0.0, 0.0, + 1.652513e-02, 2.785243e-02, 6.713332e-02, 3.292293e-02, + -7.087571e-01, 2.954406e-01, 2.942279e-01, 2.148153e-01, + 9.042904e-02, 3.337476e-02, 5.262762e-02, 1.355991e-01, + 6.802084e-02, 3.188081e-01, 1.053071e+00, 5.648708e-01, + 3.254285e-01, 3.829584e-01, -3.902937e-02, 8.569189e-04, + -6.860779e-03, -1.342737e-02, 9.002463e-04, 2.672171e-01, + 1.833601e-02, -4.791870e-02, -4.673452e-01, -5.951233e-04, + 1.327156e-02, 4.884608e-04, -6.395956e-04, -1.247312e-02, + 2.616015e-03, 2.045540e-02, 1.826517e-02, 2.752957e-02, + 4.864566e-03, 1.974226e-01, 8.022508e-02, 8.533795e-02, + 7.867660e-02, 1.206522e-02, 1.408663e-01, 8.814420e-29, + 2.803104e-02, -1.190598e-01, 4.397753e-01, 2.351956e-03, + 2.934275e-02, 1.909389e-02, -1.119068e-01, -5.117084e-02, + 6.178805e-03, -1.955722e-03, -4.881141e-02, 0.0, + -5.396824e-02, 1.768444e-02, -1.764243e-01, -1.029730e-02, + 3.943393e-02, -1.397969e-02, 9.628724e-02, -4.312754e-02, + -1.602866e-01, -1.405657e-02, 1.331697e-01, 0.0, + -2.396953e-02, 1.866630e-02, 3.267511e-02, -6.928004e-03, + 7.034376e-02, -6.569391e-02, -1.199368e-01, 2.414189e-02, + 3.878685e-02, 1.612695e-02, -9.410737e-02, 2.452490e-33, + -3.085373e-02, 1.452446e-02, 5.175281e-02, -2.379139e-02, + -5.039049e-02, 1.873454e-02, 9.242059e-02, -1.805802e-02, + -4.347714e-02, -3.853900e-02, 1.008241e-01, 0.0, + -9.480388e-03, 2.023331e-02, -6.792901e-03, -8.394149e-03, + -7.546303e-02, 6.270129e-03, -3.894017e-01, -4.973264e-02, + -1.555514e-01, -1.105092e-02, -1.950841e-01, -1.148950e-25, + -2.661943e-02, 9.485362e-02, -4.270326e-01, 7.918665e-03, + -1.816450e-01, -4.379404e-02, -3.889270e-02, -1.432468e-02, + 1.501353e-02, -3.272457e-02, -1.477906e-01, 0.0, + -1.104928e-01, 3.061369e-02, -1.783103e-01, -4.144012e-03, + -1.341517e-02, -8.905338e-02, -2.880624e-01, -4.843873e-02, + -8.718476e-02, -4.244976e-02, -5.811334e-02, 8.169911e-07, + 3.018601e-01, -6.359625e-02, -6.384093e-02, -2.376516e-03, + 1.381678e-01, 5.480919e-03, -1.754923e-02, 1.902135e-02, + 1.838670e-01, 1.829514e-02, 9.986089e-01, 0.0, + -3.820317e-02, -8.010733e-02, 2.023727e-01, -8.899641e-03, + -6.265503e-02, 2.848809e-01, -6.972601e-02, 9.673467e-02, + -6.779978e-02, -1.749464e-02, -1.618047e-01, 0.0, + 5.618134e-03, -7.931516e-02, -7.710180e-01, -5.023658e-03, + 2.721053e-02, 2.372581e-03, 1.131147e-01, 3.923619e-02, + 1.188756e-01, 6.569220e-02, 3.954504e-02, 4.407177e-06, + 3.772899e-02, -7.408679e-02, 2.722764e-01, 9.289873e-03, + -1.720112e-08, -1.111527e-10, -3.223340e-33, 0.0, + 0.0, 0.0, 0.0, 0.0, + -3.947499e-10, -1.125618e-07, 0.0, 0.0, + 7.252669e-01, -2.573835e-02, 
-3.086479e-03, 1.373577e-02, + -2.595616e-02, -1.071919e-01, -1.039699e-01, 4.686809e-01, + 6.939601e-01, 5.092673e-02, 8.983605e-01, 7.748492e-12, + 7.637465e-01, -5.160391e-02, 4.367014e-03, 5.456513e-03, + -1.755392e-02, -1.141231e-01, -9.624086e-02, 4.324957e-01, + 7.202701e-01, 5.805269e-02, 8.917692e-01, 5.552060e-13, + 6.970178e-01, -1.570065e-01, 3.382218e-02, -2.513156e-02, + -1.520863e-02, -1.164639e-01, -1.687423e-01, 4.522114e-01, + 5.808989e-01, 5.248518e-02, 8.544105e-01, 9.402750e-15, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 4.326442e-01, -5.917080e-02, 2.801385e-01, -2.795843e-02, + 1.264143e-02, -3.693263e-01, -1.749216e-01, 2.439530e-01, + 5.274415e-01, 6.522411e-01, 2.642505e-01, -1.186306e-22, + 4.592337e-01, -3.818674e-02, 1.983223e-02, -3.099717e-02, + 3.941813e-02, -5.257453e-01, -3.692166e-02, 2.670639e-01, + 6.403314e-01, 5.740828e-01, 2.307071e-01, -6.111520e-19, + 5.923415e-01, -1.620244e-01, -6.315269e-03, 1.360147e-01, + 3.776298e-02, -2.748910e-01, -9.679949e-02, 3.612375e-01, + 6.582589e-01, 1.544350e-01, 8.423274e-01, 0.0, + 4.770435e-01, -3.441220e-02, 7.110235e-02, 1.750984e-01, + -1.088923e-01, -3.269669e-01, -3.097497e-01, 3.498318e-01, + 6.162855e-01, 5.070065e-01, 4.478149e-01, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0}; + const static float cuda5_2_FORWARD_biases[33] = { + 2.354680e-01, 4.575782e-01, 6.988282e-01, 2.040031e-01, + 8.584012e-01, 8.249553e-02, 1.267146e+00, 0.0, + 3.549752e-01, -4.857582e-01, 1.279055e+00, 6.212520e-03, + 1.735605e+00, 1.737882e-01, 9.513135e-02, 1.042232e-01, + 2.587379e-02, 1.125817e-01, 4.899196e-01, 8.571400e-01, + 1.188120e+00, 1.079335e+00, 1.945481e+00, 0.0, + -4.535237e-01, -4.646283e-01, -2.796752e-01, 0.0, + -1.881813e-01, 6.431429e-02, 1.600823e-01, 3.773381e-01, + 0.0}; + const static float cuda5_2_FORWARD_alpha[9] = { + 2.371974e+08, 3.625653e+08, 1.961586e+08, 2.000000e+08, + 2.259449e+07, 1.865459e+07, 6.657476e+08, 2.487226e+08, + 2.000000e+08}; + const static float cuda5_2_FORWARD_beta[9] = { + 1.575003e+00, 1.656241e+00, 1.577959e+00, 2.000000e+00, + 2.396584e+00, 2.221534e+00, 1.692119e+00, 1.879424e+00, + 2.000000e+00}; + + float cuda5_2_BACKWARD_DATA_time_pred[6] = {0.0f}; + float cuda5_2_BACKWARD_DATA_mask[6] = {0.0f}; + float cuda5_2_BACKWARD_DATA_hidden_units[24] = {0.0f}; + const static size_t cuda5_2_BACKWARD_DATA_layers_dim[4] = {9, 12, 12, 6}; + const static float cuda5_2_BACKWARD_DATA_matrices[324] = { + 8.340252e-04, -7.066309e-02, 6.012942e-03, -8.961015e-04, + 5.308781e-02, 8.890389e-03, -1.695608e-02, -2.008141e-01, + -2.327795e-01, 1.816323e-03, 1.741969e-03, -4.547063e-01, + -3.278293e-01, 3.194534e-03, 5.590135e-01, 5.038606e-01, + -6.899682e-01, -6.846661e-01, 1.296691e-02, 9.286657e-03, + 6.076815e-02, 9.537656e-03, -1.845960e-01, 2.334390e-01, + 6.584574e-02, -1.502425e-01, -1.464556e-01, 2.582188e-02, + -2.801069e-01, 2.606461e-01, 5.094615e-02, 9.973006e-03, + -2.273075e-01, 1.013311e-01, -2.977537e-01, -3.584019e-01, + 1.550467e-02, -2.365348e-02, -2.361028e-01, -4.535604e-01, + -1.099842e-01, 3.337491e-02, 3.386805e-02, 5.759778e-02, + 5.773445e-02, -6.057084e-03, -5.215100e-03, -2.488342e-02, + 4.550520e-01, -6.358563e-03, -4.111410e-01, -2.748287e-01, + 6.576765e-01, 6.735925e-01, 1.382121e-02, 1.599379e-02, + 2.175570e-01, 4.235858e-01, -4.743209e-03, 8.406488e-01, + 5.463328e-01, 5.315352e-01, 5.759005e-01, -3.956826e-01, + 1.770215e-03, 4.242290e-03, 5.961310e-03, 2.629623e-03, + 3.968062e-01, 2.857247e-01, 
-3.694852e-01, -4.826791e-01, + -1.361759e-01, 1.741970e-02, 2.067235e-01, -3.166322e-02, + 1.676094e-02, 1.222352e-01, 3.594849e-01, 5.646787e-02, + 9.237770e-02, 2.705673e-02, 3.022863e-02, 2.661669e-01, + 1.342065e-01, 9.685011e-02, -4.619106e-01, -4.885407e-01, + -1.207667e-01, -3.344076e-02, 1.247313e-03, 9.397045e-04, + 3.326222e-03, 2.384325e-03, -5.191239e-01, 3.588830e-01, + 5.642326e-01, -2.458584e-01, -6.050721e-01, -5.983715e-04, + -3.112906e-04, -8.002273e-02, 2.754113e-01, 1.347607e-01, + 2.869407e-01, 3.228108e-01, 2.589051e-01, 2.689373e-01, + 2.097373e-03, -1.213292e-03, 2.289704e-02, 2.260412e-02, + -4.001153e-03, -3.886382e-02, 1.744227e-02, 1.228004e-03, + 5.637321e-02, 5.326664e-03, 5.775909e-02, -7.129682e-02, + 2.957929e-02, -3.619472e-02, -7.687800e-02, 2.551496e-01, + 2.791522e-02, -1.290575e-01, 7.948833e-02, 9.349618e-02, + 4.568452e-03, -2.620651e-01, 9.037835e-03, 1.652229e-01, + -1.035363e-02, -4.924298e-01, -1.359403e-01, -2.509044e-02, + 6.072426e-02, -1.067680e-01, 9.075266e-02, -5.669300e-01, + -5.016208e-02, -4.982992e-03, -4.493951e-01, 2.403491e-02, + -5.795595e-03, 8.214971e-02, 1.994753e-03, 2.271867e-03, + 8.008438e-03, -1.517102e-01, -2.790549e-02, 7.735109e-02, + -1.794875e-02, 1.122736e-02, -4.320173e-02, -9.230874e-03, + -4.703557e-02, -3.043727e-02, -1.645634e-01, -6.124438e-02, + 2.416326e-01, -2.548371e-01, 2.711228e-01, 2.171408e-01, + -1.613229e-02, -1.133995e-01, -5.881115e-01, 1.196182e-01, + -1.574013e-02, -2.309249e-02, -9.163861e-02, -1.243609e-03, + 2.755058e-03, -8.981592e-02, 4.023712e-02, 1.447185e-01, + 1.773491e-02, -4.728686e-02, 4.132702e-02, 4.325303e-02, + 9.868489e-02, -2.594438e-01, 1.111406e-02, 5.278649e-02, + -5.842348e-02, 7.532353e-02, -3.890866e-02, 7.389170e-03, + -8.200553e-02, -2.977651e-04, 2.846818e-01, -2.641009e-02, + -3.923972e-06, 1.683590e-06, 4.231356e-06, -1.460619e-05, + 1.480699e-05, -4.800242e-05, -3.605007e-05, 4.642337e-06, + -1.237117e-05, -6.065346e-05, 1.122525e-07, -4.718931e-05, + -4.836941e-02, 2.925190e-02, 5.125062e-02, -8.673830e-02, + 4.049347e-02, -1.281789e-01, 4.054615e-02, -1.102404e-01, + 1.797214e-02, 8.068577e-03, 9.849558e-02, 2.462221e-02, + -3.952334e-02, 7.078841e-02, 5.095275e-03, -5.172743e-03, + 1.358633e-01, -4.528875e-01, 4.454420e-01, -5.941349e-01, + -8.203693e-02, -2.733144e-01, -4.668098e-01, 2.087940e-01, + 2.732850e-01, 1.967585e-01, -1.648116e-02, -4.675763e-02, + -2.471467e-02, -3.507713e-02, 1.268763e-01, -4.777270e-04, + -6.884494e-02, -4.142293e-02, 4.568305e-01, -1.171813e-01, + 4.104385e-02, 4.123072e-01, 1.201161e-01, 5.688429e-02, + -6.769225e-02, 1.879334e-01, -1.869847e-01, 2.116578e-01, + 1.023851e-01, -7.956885e-03, 3.125194e-02, -3.698255e-02, + -1.742767e-02, 8.019327e-02, -2.414790e-01, 1.692867e-01, + -1.363161e-01, -2.991336e-02, 1.571377e-01, -4.675832e-05, + 3.410926e-02, -2.423313e-02, 3.784683e-01, 8.980562e-01, + 1.445573e-02, 3.742977e-01, -1.449231e-01, 5.250753e-02, + -9.320556e-02, 1.881413e-01, 1.525415e-01, 1.516415e-05, + -2.865472e-02, -4.090607e-02, 1.368707e-01, 1.152067e+00, + 3.926153e-02, 3.892255e-01, -3.988812e-01, 2.768721e-01, + 1.682807e-01, -8.165011e-02, 2.984257e-01, -2.310482e-05, + -1.301168e-01, -3.295192e-01, 1.955211e-01, 6.782165e-01, + -1.859493e-02, 5.047321e-01, -3.545281e-01, 6.802614e-01, + -2.701511e-02, 5.938844e-02, 1.288360e-01, 6.412582e-05, + 6.354152e-02, -2.929806e-01, 1.172161e-01, 5.812020e-01, + -1.526828e-03, 4.311178e-01, -1.572772e-01, 3.847064e-01, + -1.406437e-01, -8.771673e-02, 1.723672e-01, 
-2.926565e-05, + 1.170990e-01, -1.168602e-01, 2.353766e-01, 8.977429e-01, + 1.029375e-02, 4.529134e-01, -3.884215e-01, 2.041353e-01, + -2.684749e-02, 9.474846e-02, 1.718571e-01, 9.999280e-06, + -9.272413e-02, -1.050809e-01, 2.637663e-01, 6.296775e-01}; + const static float cuda5_2_BACKWARD_DATA_biases[30] = { + 2.758991e-01, 9.040871e-01, 6.578859e-01, 3.464146e-01, + -1.074793e-01, -1.111640e+00, -4.436951e-03, 1.027522e+00, + 5.782945e-02, -6.986979e-02, 1.183250e+00, -9.289587e-02, + 2.339573e-03, 2.321955e-01, 6.579675e-01, 9.597613e-01, + 4.900812e-02, 1.206250e-01, 1.320550e-01, 1.839768e-17, + 1.678722e-01, -3.203184e-03, 7.736452e-01, 2.727852e+00, + 1.589646e-01, -3.824490e-02, 5.180550e-01, 7.756407e-01, + 4.521459e-01, 4.122442e-01}; + const static float cuda5_2_BACKWARD_DATA_alpha[6] = { + 1.933176e+08, 4.558126e+08, 6.040167e+07, 4.608431e+07, + 6.338093e+08, 3.281159e+08}; + const static float cuda5_2_BACKWARD_DATA_beta[6] = { + 1.608048e+00, 1.659768e+00, 1.943038e+00, 1.953083e+00, + 1.738348e+00, 1.891296e+00}; + + float cuda5_2_BACKWARD_FILTER_time_pred[6] = {0.0f}; + float cuda5_2_BACKWARD_FILTER_mask[6] = {0.0f}; + float cuda5_2_BACKWARD_FILTER_hidden_units[24] = {0.0f}; + const static size_t cuda5_2_BACKWARD_FILTER_layers_dim[4] = {9, 12, 12, 6}; + const static float cuda5_2_BACKWARD_FILTER_matrices[324] = { + 4.047185e-03, 3.388695e-04, 1.210363e-04, -6.148457e-06, + -3.252271e-03, 8.122424e-04, 1.075851e-03, 3.066259e-03, + 1.921126e-03, -1.042791e-04, -3.275821e-01, 4.278608e-03, + -2.106100e-01, 8.295547e-02, 2.430674e-01, -2.748593e-02, + -2.065240e-02, -1.395731e-02, -3.491511e-02, 3.520847e-03, + 1.790237e-02, 1.188376e-02, 5.372314e-02, 1.494784e-02, + 5.035055e-02, 6.581915e-02, 6.861494e-02, -2.199881e-03, + -2.281682e-02, -9.687833e-02, 3.909182e-03, 1.024575e-01, + 3.948949e-02, -4.566963e-02, -1.375550e-01, -6.794923e-02, + 6.135985e-04, -4.608163e-01, 2.404660e-01, 6.274750e-03, + 1.059302e-01, 1.676516e-01, -5.104349e-02, 9.925397e-02, + -1.470984e-02, 1.031084e-04, 4.374801e-02, -5.167035e-01, + -3.632444e-01, 8.170792e-02, 3.783056e-01, 3.212413e-01, + -4.803373e-01, -4.874209e-01, 2.615676e-04, 3.406848e-02, + 8.674651e-02, 3.508870e-03, -6.156053e-01, 3.270718e-01, + 3.457363e-01, 1.898023e-01, -1.473479e-01, -2.987293e-01, + 1.315816e-03, -5.991638e-03, 1.428707e-03, 1.580944e-03, + 6.320467e-01, 2.342933e-01, -7.387988e-01, -4.437208e-01, + -7.261886e-02, 5.008508e-03, 4.693171e-02, -5.879956e-02, + 1.677305e-02, 1.845511e-01, 3.830231e-01, 4.003870e-02, + 9.888364e-02, 7.434040e-04, 7.895462e-02, 2.310843e-01, + 1.044731e-02, 1.716935e-01, 1.390186e-01, -3.862206e-01, + -1.001334e-01, 1.338546e-02, -1.354914e-02, 5.464492e-02, + 3.437773e-03, -2.069449e-03, -3.513253e-02, 1.837639e-02, + -1.552736e-01, -1.349904e-02, -1.025307e-01, -4.804826e-06, + 3.284197e-02, 5.086832e-02, 5.690669e-03, 7.154379e-02, + 1.094594e+00, 1.068281e+00, 3.653902e-01, 3.107198e-01, + -7.299128e-03, -3.042033e-04, 5.593516e-03, 3.541658e-03, + 5.810616e-04, 8.030201e-03, -1.622678e-02, 1.400076e-04, + 2.819623e-03, 4.108455e-03, 5.561182e-03, 2.512096e-03, + -8.622734e-04, 5.333219e-02, 3.076694e-02, 1.795766e-01, + -2.318845e-02, -3.202521e-02, 3.119619e-01, -1.606582e-01, + -1.085588e-01, -9.067213e-02, -1.422861e-02, -3.444208e-02, + -1.635176e-04, -2.596654e-01, 1.995525e-02, 2.055750e-02, + 2.022944e-01, 4.327365e-01, -1.619481e-02, 1.125397e-01, + 7.984060e-03, -2.073076e-01, -1.761664e-02, -4.832107e-02, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 
0.0, 0.0, + 1.638518e-03, -1.793951e-02, 5.772194e-02, 2.851987e-02, + 6.163087e-02, 2.867437e-02, 5.545961e-02, -1.660824e-01, + 9.789789e-02, -1.159482e-01, 5.385513e-02, 6.836513e-02, + 5.594874e-04, -2.741018e-02, 4.838353e-02, 4.298405e-02, + 1.854298e-02, 3.633871e-02, 9.942706e-03, 3.490340e-01, + 8.440907e-02, 2.376168e-02, 4.866724e-02, -2.214078e-01, + -5.650432e-03, -8.008064e-04, 1.477945e-03, 9.983850e-04, + 2.346494e-04, 2.069148e-03, -4.035380e-03, -5.895875e-03, + -2.146410e-04, 8.988932e-04, 7.378523e-05, -3.107871e-05, + 3.014900e-03, -2.577113e-01, 8.653076e-03, -2.681585e-02, + -5.089819e-02, -2.550743e-02, -3.467115e-02, 3.631677e-01, + -5.167207e-02, 9.202915e-02, -2.041105e-02, -1.355488e-01, + -4.411176e-03, 1.459578e-01, -1.287185e-02, -5.766148e-03, + -1.725510e-01, 1.716040e-01, -1.324064e-01, -1.831788e-01, + -4.434610e-02, -7.823753e-02, -2.463202e-02, 2.183346e-02, + 5.483676e-04, -7.481821e-02, -8.179377e-03, -3.340281e-02, + -2.679154e-03, -3.484565e-02, -4.761697e-02, -7.778479e-01, + -9.353197e-02, -1.011887e-01, -3.653892e-02, 3.624209e-01, + -2.063141e-03, -1.785554e-03, 5.357111e-02, -4.105966e-02, + 4.269572e-02, -1.532830e-01, 2.175374e-02, 1.304753e-01, + 5.400207e-02, -4.020891e-02, -2.284152e-02, 1.153921e-01, + 2.909448e-03, -1.312913e-02, -1.562593e-01, -1.018874e-01, + 7.121818e-03, -1.468466e-01, 3.900497e-03, -2.249627e-02, + -5.684932e-02, 2.612863e-02, -1.410081e-01, 2.298795e-02, + 6.794739e-04, 7.064358e-01, 7.429705e-01, 0.0, + 3.578874e-01, 7.372183e-01, -2.632545e-04, -1.001730e-01, + 4.224807e-01, -1.673518e-01, 9.987204e-04, -7.437595e-02, + 4.765817e-05, 5.283366e-01, 5.804700e-01, 0.0, + 5.304079e-01, 8.826514e-01, 1.191588e-04, -2.403303e-02, + 8.384521e-02, -1.913135e-01, -2.046290e-04, -4.905949e-02, + -5.695952e-03, 4.907159e-01, 8.468218e-01, 0.0, + 3.835697e-01, 4.161280e-01, -1.292199e-03, 2.594048e-01, + 4.049456e-01, -4.400651e-01, 4.166223e-01, -1.978285e-01, + 2.546945e-04, 5.691357e-01, 7.418897e-01, 0.0, + 5.059269e-01, 8.695604e-01, -4.737849e-05, -1.666739e-02, + 1.190503e-01, -1.528916e-01, -1.769190e-04, -4.045478e-02, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 3.728615e-02, 3.964641e-01, 6.608990e-01, 0.0, + 6.230336e-01, 5.074117e-01, 8.405939e-03, -1.422498e-01, + 4.705996e-02, -2.407855e-01, -2.056813e-02, 2.624028e-01}; + const static float cuda5_2_BACKWARD_FILTER_biases[30] = { + 2.526327e-03, 1.731556e-02, 7.836947e-02, 6.594411e-02, + 1.693102e-01, 7.814206e-01, 6.354987e-01, 9.766987e-01, + 2.128775e-01, -4.894586e-01, -3.741650e-02, -1.046441e-01, + -2.802074e-02, 1.076976e+00, 1.484343e+00, 0.0, + 1.488592e+00, 2.316875e+00, -5.133961e-03, 3.100583e-01, + 6.346995e-01, 3.810246e-01, -2.523698e-01, 3.231826e-01, + -4.245956e-01, -4.564983e-01, 4.500998e-03, -5.841292e-01, + 0.0, -1.728347e-01}; + const static float cuda5_2_BACKWARD_FILTER_alpha[6] = { + 2.066506e+08, 2.177061e+08, 5.654493e+07, 2.368001e+08, + 2.000000e+08, 2.537848e+08}; + const static float cuda5_2_BACKWARD_FILTER_beta[6] = { + 1.610186e+00, 1.844894e+00, 1.895551e+00, 1.816587e+00, + 2.000000e+00, 2.252824e+00}; + + if (conv_type == ConvolutionType::FORWARD && cuda_major == 5 && + cuda_minor == 2) { + *layer_num_p = 4; + *hidden_units_p = cuda5_2_FORWARD_hidden_units; + *layers_dim_p = cuda5_2_FORWARD_layers_dim; + *matrices_p = cuda5_2_FORWARD_matrices; + *biases_p = cuda5_2_FORWARD_biases; + *alpha_p = cuda5_2_FORWARD_alpha; + *beta_p = cuda5_2_FORWARD_beta; + *time_pred_p = cuda5_2_FORWARD_time_pred; + 
*mask_p = cuda5_2_FORWARD_mask; + } else if (conv_type == ConvolutionType::BACKWARD_DATA && cuda_major == 5 && + cuda_minor == 2) { + *layer_num_p = 4; + *hidden_units_p = cuda5_2_BACKWARD_DATA_hidden_units; + *layers_dim_p = cuda5_2_BACKWARD_DATA_layers_dim; + *matrices_p = cuda5_2_BACKWARD_DATA_matrices; + *biases_p = cuda5_2_BACKWARD_DATA_biases; + *alpha_p = cuda5_2_BACKWARD_DATA_alpha; + *beta_p = cuda5_2_BACKWARD_DATA_beta; + *time_pred_p = cuda5_2_BACKWARD_DATA_time_pred; + *mask_p = cuda5_2_BACKWARD_DATA_mask; + } else if (conv_type == ConvolutionType::BACKWARD_FILTER && cuda_major == 5 && + cuda_minor == 2) { + *layer_num_p = 4; + *hidden_units_p = cuda5_2_BACKWARD_FILTER_hidden_units; + *layers_dim_p = cuda5_2_BACKWARD_FILTER_layers_dim; + *matrices_p = cuda5_2_BACKWARD_FILTER_matrices; + *biases_p = cuda5_2_BACKWARD_FILTER_biases; + *alpha_p = cuda5_2_BACKWARD_FILTER_alpha; + *beta_p = cuda5_2_BACKWARD_FILTER_beta; + *time_pred_p = cuda5_2_BACKWARD_FILTER_time_pred; + *mask_p = cuda5_2_BACKWARD_FILTER_mask; + } else { + return false; + } + return true; +#endif + + return false; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/helper.cpp b/dnn/src/cuda/convolution/helper.cpp new file mode 100644 index 00000000..807df29e --- /dev/null +++ b/dnn/src/cuda/convolution/helper.cpp @@ -0,0 +1,85 @@ +/** + * \file dnn/src/cuda/convolution/helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./helper.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) { + + // CUDNN_STATUS_EXECUTION_FAILED on Tegra K1, so disable CUDNN + // on Tegra K1. + if (args.handle->is_tegra_k1()) + return false; + + // TODO: We only support NCHW format now. It seems cuDNN provides support + // for NHWC as well. 
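+    // NCHW4 is only accepted when the output dtype is Int8/QuantizedS8; any
+    // other non-NCHW format is reported as unsupported.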
+ if (args.filter_meta.format == param::Convolution::Format::NCHW4) { + if (args.dst_layout->dtype.enumv() != DTypeEnum::Int8 && + args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) { + return false; + } + } else if (args.filter_meta.format != param::Convolution::Format::NCHW) { + return false; + } + auto& fm = args.filter_meta; + bool supported = true; + supported &= (fm.spatial_ndim == 2); +#if CUDNN_VERSION < 7000 + supported &= (fm.group == 1); +#endif +#if CUDNN_VERSION < 7500 + supported &= (fm.dilation[0] == 1 && fm.dilation[1] == 1); +#endif + return supported; +} + +WorkspaceBundle convolution::matmul_get_workspace_bundle( + const ForwardSizeArgs &args) { + auto dtype = args.src_layout->dtype; + auto &&fm = args.filter_meta; + megdnn_assert(fm.group == 1); + auto N = args.src_layout->shape[0]; + auto OC = fm.ocpg, + IC = fm.icpg, + FH = fm.spatial[0], + FW = fm.spatial[1]; + auto OH = args.dst_layout->shape[2], + OW = args.dst_layout->shape[3]; + SmallVector sizes{ + dtype.size() * args.dst_layout->total_nr_elems(), + dtype.size() * IC*FH*FW*OH*OW*N + }; + if (args.filter_meta.should_flip) { + sizes.push_back(dtype.size() * OC * IC * FH * FW); + } + return {nullptr, std::move(sizes)}; +} + +void convolution::flip_filter(const ForwardSizeArgs &args, + const Workspace &workspace, void *&raw_ptr) { + auto &&fm = args.filter_meta; + megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2); + auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; + auto dtype = fm.dtype; + megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW); + + TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}}, + dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout}; + dst.layout.stride[2] = -dst.layout.stride[2]; + dst.layout.stride[3] = -dst.layout.stride[3]; + args.handle->relayout_opr()->exec(src, dst); + raw_ptr = workspace.raw_ptr; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/helper.h b/dnn/src/cuda/convolution/helper.h new file mode 100644 index 00000000..e61449dc --- /dev/null +++ b/dnn/src/cuda/convolution/helper.h @@ -0,0 +1,99 @@ +/** + * \file dnn/src/cuda/convolution/helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "./opr_impl.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/handle.h" +#include "src/common/utils.h" +#include "src/common/algo_chooser.h" + +namespace megdnn { +namespace cuda { +namespace convolution { + using CanonizedFilterMeta = ConvolutionForward::CanonizedFilterMeta; + + //! conv size descriptor in the forward view + struct ForwardSizeArgs { + HandleImpl *handle; + const TensorLayout *src_layout; + CanonizedFilterMeta filter_meta; + const TensorLayout *dst_layout; + }; + + //! whether cudnn is supported for a filter meta + bool is_cudnn_supported(const ForwardSizeArgs &args); + + //! 
get workspace bundle for matmul algo + WorkspaceBundle matmul_get_workspace_bundle(const ForwardSizeArgs &args); + + struct CUDNNForwardDescs { + TensorDesc src_desc, dst_desc; + FilterDesc filter_desc; + ConvDesc conv_desc; + void set(const TensorLayout &src, + const CanonizedFilterMeta &filter, + const TensorLayout &dst, + const param::Convolution ¶m) + { + src_desc.set(src, param.format); + filter_desc.set(filter); + dst_desc.set(dst, param.format); + conv_desc.set(src.dtype, param, filter.group); + } + }; + + struct CUDNNBwdDataDescs { + TensorDesc diff_desc, grad_desc; + FilterDesc filter_desc; + ConvDesc conv_desc; + void set(const CanonizedFilterMeta &filter, + const TensorLayout &diff, + const TensorLayout &grad, + const param::Convolution ¶m) + { + filter_desc.set(filter); + diff_desc.set(diff, param.format); + grad_desc.set(grad, param.format); + conv_desc.set(filter.dtype, param, filter.group); + } + }; + + struct CUDNNBwdFilterDescs { + TensorDesc diff_desc, src_desc; + FilterDesc grad_desc; + ConvDesc conv_desc; + void set(const TensorLayout &src, + const TensorLayout &diff, + const CanonizedFilterMeta &grad, + const param::Convolution ¶m) + { + src_desc.set(src, param.format); + diff_desc.set(diff, param.format); + grad_desc.set(grad); + conv_desc.set(src.dtype, param, grad.group); + } + }; + + /*! + * \brief flip conv filter + * + * Flip conv filter pointed by \p raw_ptr, store result in workspace, and + * change \p raw_ptr to workspace. + */ + void flip_filter(const ForwardSizeArgs &args, + const Workspace &workspace, void *&raw_ptr); + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/im2col.cu b/dnn/src/cuda/convolution/im2col.cu new file mode 100644 index 00000000..fcabab69 --- /dev/null +++ b/dnn/src/cuda/convolution/im2col.cu @@ -0,0 +1,168 @@ +/** + * \file dnn/src/cuda/convolution/im2col.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./im2col.cuh" +#include "src/cuda/utils.cuh" +#include "megdnn/dtype.h" + +using namespace megdnn; +using namespace cuda; + +namespace { + +template +__global__ void im2col_kernel(const T *im, T *col, + uint32_t N, uint32_t INP_BS, + uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t FH, uint32_t FW, + uint32_t OH, uint32_t OW, + uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, + uint32_t DH, uint32_t DW) +{ + uint32_t n = threadIdx.x + blockIdx.y * blockDim.x; + uint32_t ow = threadIdx.y + blockIdx.z * blockDim.y; + uint32_t oh = blockIdx.x % OH; + uint32_t fw = blockIdx.x / OH % FW; + uint32_t fh = blockIdx.x / OH / FW % FH; + uint32_t ic = blockIdx.x / OH / FW / FH; + if (n < N && ow < OW) { + uint32_t didx = blockIdx.x * OW*N + ow*N + n; + uint32_t ih = -PH + oh*SH + fh*DH; + uint32_t iw = -PW + ow*SW + fw*DW; + col[didx] = (ih < IH && iw < IW ? 
+ im[n*INP_BS + ic*IH*IW + ih*IW + iw] : T(0.0f)); + } +} + +template +__global__ void col2im_kernel(const T *col, T *im, + uint32_t N, uint32_t INP_BS, + uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t FH, uint32_t FW, + uint32_t OH, uint32_t OW, + uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, + uint32_t DH, uint32_t DW) +{ + uint32_t iw = threadIdx.x + blockIdx.y * blockDim.x; + uint32_t ih = threadIdx.y + blockIdx.z * blockDim.y; + uint32_t ic = blockIdx.x % IC; + uint32_t n = blockIdx.x / IC; + if (iw < IW && ih < IH) { + T res(0); + // ih = -ph + oh*sh + fh*dh + // ih + ph - fh*dh == oh*sh + for (uint32_t fh = 0; fh < FH; ++fh) { + uint32_t anchorh = ih + PH - fh*DH; + if (anchorh < OH*SH && anchorh % SH == 0) { + uint32_t oh = anchorh / SH; + for (uint32_t fw = 0; fw < FW; ++fw) { + uint32_t anchorw = iw + PW - fw*DW; + if (anchorw < OW*SW && anchorw % SW == 0) { + uint32_t ow = anchorw / SW; + res += col[ic*FH*FW*OH*OW*N + + fh*FW*OH*OW*N + + fw*OH*OW*N + + oh*OW*N + + ow*N + + n]; + } + } + } + } + im[n*INP_BS + ic*IH*IW + ih*IW + iw] = res; + } +} + +} // anonymous namespace + +template +void convolution::im2col(const T *im, T *col, + size_t N, size_t INP_BS, + size_t IC, size_t IH, size_t IW, + size_t FH, size_t FW, + size_t OH, size_t OW, + size_t PH, size_t PW, + size_t SH, size_t SW, + size_t DH, size_t DW, + cudaStream_t stream) +{ + dim3 threads(NR_THREADS_X, NR_THREADS_Y); + // dim3 blocks(DIVUP(N, NR_THREADS_X), DIVUP(OW, NR_THREADS_Y), IC*FH*FW*OH); + // IC*FH*FW*OH can be larger than 65536; shuffling blocks dimensions to + // put IC*FH*FW*OH to the first dimension. + dim3 blocks(IC*FH*FW*OH, DIVUP(N, NR_THREADS_X), DIVUP(OW, NR_THREADS_Y)); + im2col_kernel<<>>(im, col, + N, INP_BS, + IC, IH, IW, FH, FW, OH, OW, + PH, PW, SH, SW, DH, DW); + after_kernel_launch(); +} + +template +void convolution::col2im(const T *col, T *im, + size_t N, size_t INP_BS, + size_t IC, size_t IH, size_t IW, + size_t FH, size_t FW, + size_t OH, size_t OW, + size_t PH, size_t PW, + size_t SH, size_t SW, + size_t DH, size_t DW, + cudaStream_t stream) +{ + dim3 threads(NR_THREADS_X, NR_THREADS_Y); + // (x, y, z) is shuffled to (y, z, x) to bypass CUDA launch shape limitation. 
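+    // gridDim.y/z are capped at 65535 while gridDim.x allows far larger values,
+    // so the potentially huge N*IC product goes to blockIdx.x (as in im2col above).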
+ // dim3 blocks(DIVUP(IW, NR_THREADS_X), DIVUP(IH, NR_THREADS_Y), N*IC); + dim3 blocks(N*IC, DIVUP(IW, NR_THREADS_X), DIVUP(IH, NR_THREADS_Y)); + col2im_kernel<<>>(col, im, + N, INP_BS, + IC, IH, IW, FH, FW, OH, OW, + PH, PW, SH, SW, DH, DW); + after_kernel_launch(); +} + + +namespace megdnn { +namespace cuda { +namespace convolution { + +#define DO_INST(T) \ +template void im2col(const T *im, T *col, \ + size_t N, size_t INP_BS, \ + size_t IC, size_t IH, size_t IW, \ + size_t FH, size_t FW, \ + size_t OH, size_t OW, \ + size_t PH, size_t PW, \ + size_t SH, size_t SW, \ + size_t DH, size_t DW, \ + cudaStream_t stream); \ +template void col2im(const T *col, T *im, \ + size_t N, size_t INP_BS, \ + size_t IC, size_t IH, size_t IW, \ + size_t FH, size_t FW, \ + size_t OH, size_t OW, \ + size_t PH, size_t PW, \ + size_t SH, size_t SW, \ + size_t DH, size_t DW, \ + cudaStream_t stream); + +#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST); + +#undef DO_INST +#undef INST + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/im2col.cuh b/dnn/src/cuda/convolution/im2col.cuh new file mode 100644 index 00000000..7a7c9e0b --- /dev/null +++ b/dnn/src/cuda/convolution/im2col.cuh @@ -0,0 +1,47 @@ +/** + * \file dnn/src/cuda/convolution/im2col.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include +#include + +namespace megdnn { +namespace cuda { +namespace convolution { + +//! col is of shape (ic*fh*fw, oh*ow*n) +template +void im2col(const T *im, T *col, + size_t N, size_t INP_BS, + size_t IC, size_t IH, size_t IW, + size_t FH, size_t FW, + size_t OH, size_t OW, + size_t PH, size_t PW, + size_t SH, size_t SW, + size_t DH, size_t DW, // dilation + cudaStream_t stream); + +template +void col2im(const T *col, T *im, + size_t N, size_t INP_BS, + size_t IC, size_t IH, size_t IW, + size_t FH, size_t FW, + size_t OH, size_t OW, + size_t PH, size_t PW, + size_t SH, size_t SW, + size_t DH, size_t DW, // dilation + cudaStream_t stream); + +} // namespace dilated_convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/opr_impl.cpp b/dnn/src/cuda/convolution/opr_impl.cpp new file mode 100644 index 00000000..3558dfaa --- /dev/null +++ b/dnn/src/cuda/convolution/opr_impl.cpp @@ -0,0 +1,376 @@ +/** + * \file dnn/src/cuda/convolution/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/cuda/convolution/opr_impl.h" +#include "src/cuda/convolution/helper.h" +#include "src/cuda/convolution/backward_data/algo.h" +#include "src/cuda/convolution/backward_filter/algo.h" +#include "src/cuda/conv_bias/opr_impl.h" + +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution; + +#define TO_STRING2(v) #v +#define TO_STRING(v) TO_STRING2(v) +#define CUDNN_VERSION_STR TO_STRING(CUDNN_MAJOR) "." \ + TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL) + +/* ============== ConvolutionForwardImpl ============== */ +ConvolutionForwardImpl::ConvBiasExtraData +ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& dst) { + auto conv_param = param(); + ConvBiasExtraData ret = {this->handle()->create_operator(), + TensorLayout(dst.dtype), TensorLayout(dst.dtype)}; + ret.convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY, + conv_param.mode, + conv_param.sparse, + conv_param.format, + conv_param.pad_h, + conv_param.pad_w, + conv_param.stride_h, + conv_param.stride_w, + conv_param.dilate_h, + conv_param.dilate_w, + 0, + conv_param.compute_mode}; + ret.convbias_opr->execution_policy() = {this->execution_policy().algorithm}; + return ret; +} + +ConvolutionForwardImpl::Algorithm* +ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) { + auto extra_data = conv_bias_extra_data(dst); + return static_cast(extra_data.convbias_opr.get()) + ->get_algorithm_heuristic(src, filter, extra_data.bias_layout, + extra_data.z_layout, dst, + workspace_limit_in_bytes, reproducible); +} + +std::vector +ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) { + auto extra_data = conv_bias_extra_data(dst); + return static_cast(extra_data.convbias_opr.get()) + ->get_all_algorithms(src, filter, extra_data.bias_layout, + extra_data.z_layout, dst); +} + +size_t ConvolutionForwardImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) { + auto extra_data = conv_bias_extra_data(dst); + return static_cast(extra_data.convbias_opr.get()) + ->get_workspace_in_bytes(src, filter, extra_data.bias_layout, + extra_data.z_layout, dst); +} + +void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + auto extra_data = conv_bias_extra_data(dst.layout); + TensorND bias(nullptr, extra_data.bias_layout); + TensorND z(nullptr, extra_data.z_layout); + return static_cast(extra_data.convbias_opr.get()) + ->exec(src, filter, bias, z, dst, workspace); +} + +const char* ConvolutionForwardImpl::get_algorithm_set_name() const { + return "CUDACONV0+CUDNN" CUDNN_VERSION_STR; +} + +/* ============== ConvolutionBackwardDataImpl ============== */ + +void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); + auto algo = get_algorithm(this, args.filter_meta, diff.layout, grad.layout); + algo->check_workspace(args, workspace).exec(args); +} + +std::vector +ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad) { + return megdnn::get_all_algorithms( + {this, filter, diff, grad}); +} + 
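+// Heuristic choice for backward data, in the order tried below:
+//  1. grouped conv: prefer the dedicated chanwise implementation when it fits the
+//     workspace limit (and is deterministic if reproducibility is requested);
+//  2. ask cuDNN for a usable algorithm;
+//  3. grouped conv again: retry cuDNN on the per-group problem and map the result
+//     through algo2gconv;
+//  4. otherwise fall back to the non-cuDNN algorithm list.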
+ConvolutionBackwardDataImpl::Algorithm* +ConvolutionBackwardDataImpl::get_algorithm_heuristic( + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + auto fm = check_layout_fwd(grad, filter, diff); + return get_algorithm_heuristic(fm, diff, grad, workspace_limit_in_bytes, + reproducible); +} + +ConvolutionBackwardDataImpl::Algorithm* +ConvolutionBackwardDataImpl::get_algorithm_heuristic( + const CanonizedFilterMeta& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, filter, diff, grad); + + if (args.filter_meta.group > 1 && + sm_algo_pack.chanwise.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + // prefer special chanwise impl + return &sm_algo_pack.chanwise; + } + + auto get_cudnn_algo = + [this, &args, workspace_limit_in_bytes, + reproducible]() -> ConvolutionBackwardDataImpl::AlgoBase* { + auto cudnn_handle = cuda::cudnn_handle(this->handle()); + CUDNNBwdDataDescs desc; + args.init_desc(desc); + + //disable, segfault in megbrain, need further investigate. +#if 0 + bool is_heuristic_success= convolution:: + PerformanceModelBackwardData::get_algo_backward_data_success( + args, desc, workspace_limit_in_bytes, &algo); + if (is_heuristic_success) { + return sm_algo_pack.cudnn_from_enum(algo); + } +#endif +#if CUDNN_MAJOR >= 7 + int max_count = 0; + cudnn_check(cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + cudnn_handle, &max_count)); + SmallVector algo_perf(max_count); + int ret_count = 0; + cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7( + cudnn_handle, desc.filter_desc.desc, desc.diff_desc.desc, + desc.conv_desc.desc, desc.grad_desc.desc, max_count, &ret_count, + algo_perf.data())); + for (int i = 0; i < ret_count; ++i) { + if (algo_perf[i].memory > workspace_limit_in_bytes) + continue; + if (reproducible) { + if (algo_perf[i].determinism == CUDNN_DETERMINISTIC) { + return reinterpret_cast( + sm_algo_pack.cudnn_from_enum(algo_perf[i].algo)); + } + } else { + return reinterpret_cast( + sm_algo_pack.cudnn_from_enum(algo_perf[i].algo)); + } + } + return nullptr; +#else + cudnnConvolutionBwdDataAlgo_t algo; + cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm( + cudnn_handle, desc.filter_desc.desc, desc.diff_desc.desc, + desc.conv_desc.desc, desc.grad_desc.desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_in_bytes, &algo)); + auto&& cast_algo = + reinterpret_cast(sm_algo_pack.cudnn_from_enum(algo)); + return reinterpret_cast( + megdnn::get_reproducible_algo( + cast_algo, reproducible)); +#endif + }; + + if (is_cudnn_supported(args.as_fwd_args())) { + if (auto algo = get_cudnn_algo()) + return algo; + } + + if (args.filter_meta.group > 1) { + auto orig_args = args; + TensorLayout a, b; + AlgoGroupConvGeneral::modify_size_args(args, a, b); + if (is_cudnn_supported(args.as_fwd_args())) { + if (auto algo = get_cudnn_algo()) + return sm_algo_pack.algo2gconv.at(algo); + } + args = orig_args; + } + + if (reproducible) { + return megdnn::get_reproducible_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv bwd_data"); + } else { + return megdnn::get_usable_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv bwd_data"); + } +} + +size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( + const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad) { + 
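+    // delegate to the algorithm selected for these layouts and report its requirement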
AlgoBase::SizeArgs args(this, filter, diff, grad); + return get_algorithm(this, args.filter_meta, diff, grad)-> + get_workspace_in_bytes(args); +} + +const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const { + return "CUDACONV0+CUDNN" CUDNN_VERSION_STR; +} + +/* ============== ConvolutionBackwardFilterImpl ============== */ + +void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, src, diff, grad, workspace); + auto algo = get_algorithm(this, src.layout, diff.layout, + args.grad_filter_meta); + algo->check_workspace(args, workspace).exec(args); +} + +std::vector +ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad) { + return megdnn::get_all_algorithms( + {this, src, diff, grad}); +} + +ConvolutionBackwardFilterImpl::Algorithm* +ConvolutionBackwardFilterImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + auto fm = check_layout_fwd(src, grad, diff); + return get_algorithm_heuristic(src, diff, fm, workspace_limit_in_bytes, + reproducible); +} + +ConvolutionBackwardFilterImpl::Algorithm* +ConvolutionBackwardFilterImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& diff, + const CanonizedFilterMeta& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, src, diff, grad); + + if (args.grad_filter_meta.group > 1 && + sm_algo_pack.chanwise.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + // prefer special chanwise impl + return &sm_algo_pack.chanwise; + } + + auto get_cudnn_algo = + [this, &args, workspace_limit_in_bytes, + reproducible]() -> ConvolutionBackwardFilterImpl::AlgoBase* { + auto cudnn_handle = cuda::cudnn_handle(this->handle()); + CUDNNBwdFilterDescs desc; + args.init_desc(desc); + + //disable, segfault in megbrain, need further investigate. 
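+        // presumably the MLP-based performance model whose parameters are defined in
+        // get_params.cpp; kept behind #if 0 until the segfault above is tracked down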
+#if 0 + auto is_heuristic_success = + convolution::PerformanceModelBackwardFilter:: + get_algo_backward_filter_success( + args, desc, workspace_limit_in_bytes, &algo); + if (is_heuristic_success) { + return sm_algo_pack.cudnn_from_enum(algo); + } +#endif +#if CUDNN_MAJOR >= 7 + int max_count = 0; + cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + cudnn_handle, &max_count)); + SmallVector algo_perf(max_count); + int ret_count = 0; + cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm_v7( + cudnn_handle, desc.src_desc.desc, desc.diff_desc.desc, + desc.conv_desc.desc, desc.grad_desc.desc, max_count, &ret_count, + algo_perf.data())); + for (int i = 0; i < ret_count; ++i) { + if (algo_perf[i].memory > workspace_limit_in_bytes) + continue; + if (reproducible) { + if (algo_perf[i].determinism == CUDNN_DETERMINISTIC) { + return reinterpret_cast( + sm_algo_pack.cudnn_from_enum(algo_perf[i].algo)); + } + } else { + return reinterpret_cast( + sm_algo_pack.cudnn_from_enum(algo_perf[i].algo)); + } + } + return nullptr; +#else + cudnnConvolutionBwdFilterAlgo_t algo; + cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm( + cudnn_handle, desc.src_desc.desc, desc.diff_desc.desc, + desc.conv_desc.desc, desc.grad_desc.desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_in_bytes, &algo)); + auto&& cast_algo = + reinterpret_cast(sm_algo_pack.cudnn_from_enum(algo)); + return reinterpret_cast( + megdnn::get_reproducible_algo( + cast_algo, reproducible)); +#endif + }; + + if (is_cudnn_supported(args.as_fwd_args())) { + if (auto algo = get_cudnn_algo()) + return algo; + } + + if (args.grad_filter_meta.group > 1) { + auto orig_args = args; + TensorLayout a, b; + AlgoGroupConvGeneral::modify_size_args(args, a, b); + if (is_cudnn_supported(args.as_fwd_args())) { + if (auto algo = get_cudnn_algo()) + return sm_algo_pack.algo2gconv.at(algo); + } + args = orig_args; + } + + if (reproducible) { + return megdnn::get_reproducible_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv bwd_filter"); + } else { + return megdnn::get_usable_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv bwd_filter"); + } +} + +size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( + const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad) { + AlgoBase::SizeArgs args(this, src, diff, grad); + return get_algorithm(this, src, diff, args.grad_filter_meta)-> + get_workspace_in_bytes(args); +} + +const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const { + return "CUDACONV0+CUDNN" CUDNN_VERSION_STR; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/opr_impl.h b/dnn/src/cuda/convolution/opr_impl.h new file mode 100644 index 00000000..393bd9d5 --- /dev/null +++ b/dnn/src/cuda/convolution/opr_impl.h @@ -0,0 +1,134 @@ +/** + * \file dnn/src/cuda/convolution/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "megdnn/oprs/nn.h" + +namespace megdnn { +namespace cuda { + +class ConvolutionForwardImpl: public ConvolutionForward { + public: + using ConvolutionForward::ConvolutionForward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + std::vector get_all_algorithms(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) override; + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) override; + const char* get_algorithm_set_name() const override; + + protected: + struct ConvBiasExtraData{ + std::unique_ptr convbias_opr; + TensorLayout bias_layout; + TensorLayout z_layout; + }; + private: + ConvBiasExtraData conv_bias_extra_data(const TensorLayout&); +}; + +class ConvolutionBackwardDataImpl: public ConvolutionBackwardData { + public: + using ConvolutionBackwardData::ConvolutionBackwardData; + void exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + std::vector get_all_algorithms(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible) override; + Algorithm* get_algorithm_heuristic(const CanonizedFilterMeta& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible); + size_t get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoCUDNN; + class AlgoMatmul; + class AlgoChanwise; + class AlgoChanwiseSmall; + class AlgoGroupConvGeneral; + + class AlgoPack; + + static const AlgoPack& algo_pack() { + return sm_algo_pack; + } + + private: + static AlgoPack sm_algo_pack; +}; + +class ConvolutionBackwardFilterImpl: public ConvolutionBackwardFilter { + public: + using ConvolutionBackwardFilter::ConvolutionBackwardFilter; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + std::vector get_all_algorithms(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& diff, + const CanonizedFilterMeta& grad, + size_t workspace_limit_in_bytes, + bool reproducible); + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoCUDNN; + class AlgoMatmul; + class AlgoChanwise; + class AlgoGroupConvGeneral; + + class AlgoPack; + + static const AlgoPack& algo_pack() { + return sm_algo_pack; + } + + private: + static AlgoPack sm_algo_pack; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git 
a/dnn/src/cuda/convolution3d/backward_data/algo.cpp b/dnn/src/cuda/convolution3d/backward_data/algo.cpp new file mode 100644 index 00000000..9c243c42 --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_data/algo.cpp @@ -0,0 +1,106 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_data/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +Convolution3DBackwardDataImpl::AlgoPack::AlgoPack() { + non_cudnn_algos.push_back(&chanwise); + + all_algos.push_back(&chanwise); // prefer chanwise + + fill_cudnn_algos(); + for (auto &&i: cudnn) { + all_algos.push_back(&i); + } + + all_algos.reserve(all_algos.size() * 2); + + // add gconv algos by AlgoGroupConvGeneral + auto all_algos_data = all_algos.data(); + for (size_t i = 1; i < all_algos.size(); ++ i) { + gconv.push_back({all_algos[i]}); + } + for (size_t i = 1; i < all_algos.size(); ++ i) { + algo2gconv[all_algos[i]] = &gconv[i - 1]; + } + for (auto &&i: gconv) { + all_algos.push_back(&i); + } + megdnn_assert(all_algos_data == all_algos.data()); +} + +Convolution3DBackwardDataImpl::AlgoCUDNN* +Convolution3DBackwardDataImpl::AlgoPack::cudnn_from_enum( + cudnnConvolutionBwdDataAlgo_t algo) { + for (auto &&i: cudnn) { + if (i.cudnn_enum() == algo) + return &i; + } + megdnn_throw(megdnn_mangle(ssprintf( + "can not find cudnn bwd_data algorithm %d", + static_cast(algo)))); +} + +Convolution3DBackwardDataImpl::AlgoPack Convolution3DBackwardDataImpl::sm_algo_pack; + +Convolution3DBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( + Convolution3DBackwardDataImpl *o, + const TensorLayout &filter, const TensorLayout &diff, + const TensorLayout &grad): + SizeArgs(o, o->check_layout_fwd(grad, filter, diff), diff, grad) +{ +} + +Convolution3DBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( + Convolution3DBackwardDataImpl *o, + const CanonizedFilterMeta &filter, const TensorLayout &diff, + const TensorLayout &grad): + handle{concrete_handle(o->handle())}, + filter_meta{filter}, + diff_layout{&diff}, + grad_layout{&grad}, + opr{o} +{ +} + +Convolution3DBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs( + Convolution3DBackwardDataImpl *opr, + _megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace): + SizeArgs(opr, filter.layout, diff.layout, grad.layout), + filter_tensor{&filter}, diff_tensor{&diff}, grad_tensor{&grad}, + workspace{workspace} +{ +} + +std::string Convolution3DBackwardDataImpl::AlgoBase::SizeArgs::to_string() const { + auto &&fm = filter_meta; + MEGDNN_MARK_USED_VAR(fm); + return megdnn_mangle(ssprintf( + "filter=%u{%u,%u,%u,%u,%u}, diff=%s, grad=%s, " + "pad=%ux%ux%u, stride=%ux%ux%u, dilate=%ux%ux%u, xcorr=%d, dtype=%s,%s", + fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], fm.spatial[2], + diff_layout->to_string().c_str(), + grad_layout->to_string().c_str(), + fm.padding[0], fm.padding[1], fm.padding[2], + fm.stride[0], fm.stride[1], fm.stride[2], + fm.dilation[0], fm.dilation[1] ,fm.dilation[2], + !fm.should_flip, + diff_layout->dtype.name(), grad_layout->dtype.name())); +} + +// vim: syntax=cpp.doxygen diff --git 
a/dnn/src/cuda/convolution3d/backward_data/algo.h b/dnn/src/cuda/convolution3d/backward_data/algo.h new file mode 100644 index 00000000..56a495d9 --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_data/algo.h @@ -0,0 +1,191 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_data/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/cuda/convolution3d/helper.h" +#include + +namespace megdnn { +namespace cuda { + +/*! + * \brief base class for convolution3d algos + * + * All the algo impls should try to support non-contiguous batch dim, for group + * conv execution. + */ +class Convolution3DBackwardDataImpl::AlgoBase: public Algorithm { + protected: + ~AlgoBase() = default; + + public: + struct SizeArgs { + HandleImpl *handle; + CanonizedFilterMeta filter_meta; + const TensorLayout *diff_layout, *grad_layout; + Convolution3DBackwardDataImpl *opr; + + std::string to_string() const; + void init_desc(convolution3d::CUDNNBwdDataDescs &desc) const { + desc.set(filter_meta, *diff_layout, *grad_layout, opr->param()); + } + SizeArgs(Convolution3DBackwardDataImpl *opr, + const TensorLayout &filter, const TensorLayout &diff, + const TensorLayout &grad); + SizeArgs(Convolution3DBackwardDataImpl *opr, + const CanonizedFilterMeta &filter, const TensorLayout &diff, + const TensorLayout &grad); + + convolution3d::ForwardSizeArgs as_fwd_args() const { + return {handle, grad_layout, filter_meta, diff_layout, + opr->param().data_type}; + } + }; + struct ExecArgs: public SizeArgs { + const TensorND *filter_tensor, *diff_tensor, *grad_tensor; + Workspace workspace; + + ExecArgs(Convolution3DBackwardDataImpl *opr, + _megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs &args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs &args) const = 0; + virtual void exec(const ExecArgs &args) const = 0; + + bool is_available_wk(const SizeArgs &args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace( + const SizeArgs &args, const Workspace &workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "conv bwd data algo %s: " + "required workspace %zu bytes, got %zu", + name(), req, workspace.size); + return *this; + } + + virtual bool is_cudnn() const { + return false; + } +}; + +class Convolution3DBackwardDataImpl::AlgoCUDNN final : public AlgoBase { + bool m_is_reproducible; + const char *m_name; + cudnnConvolutionBwdDataAlgo_t m_cudnn_enum; + + public: + + AlgoCUDNN(bool is_reproducible, const char *name, + cudnnConvolutionBwdDataAlgo_t cudnn_enum): + m_is_reproducible(is_reproducible), + m_name(name), + m_cudnn_enum(cudnn_enum) + {} + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + bool 
is_reproducible() const override { + return m_is_reproducible; + } + + const char* name() const override { + return m_name; + } + + cudnnConvolutionBwdDataAlgo_t cudnn_enum() const { + return m_cudnn_enum; + } + + bool is_cudnn() const override { + return true; + } +}; + +class Convolution3DBackwardDataImpl::AlgoChanwise final: public AlgoBase { + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "CHANNEL_WISE"; + } + bool is_reproducible() const override { + return true; + } +}; + +//! implement group conv by another algo +class Convolution3DBackwardDataImpl::AlgoGroupConvGeneral final: public AlgoBase { + AlgoBase *m_impl; + std::string m_name; + + public: + AlgoGroupConvGeneral(AlgoBase *impl); + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return m_name.c_str(); + } + + bool is_reproducible() const override { + return m_impl->is_reproducible(); + } + + static void modify_size_args(SizeArgs &args, + TensorLayout &diff_pg, TensorLayout &grad_pg); +}; + +class Convolution3DBackwardDataImpl::AlgoPack { + // defined in cudnn.cpp + void fill_cudnn_algos(); + + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator = (const AlgoPack &) = delete; + + public: + AlgoPack(); + + std::vector cudnn; + AlgoChanwise chanwise; + std::vector gconv; + std::unordered_map algo2gconv; + + std::vector + //! all algorithms + all_algos, + //! non-cudnn algos, used for heuristic if cudnn is not supported + non_cudnn_algos; + + AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdDataAlgo_t algo); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/backward_data/chanwise.cpp b/dnn/src/cuda/convolution3d/backward_data/chanwise.cpp new file mode 100644 index 00000000..dafe1e6c --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_data/chanwise.cpp @@ -0,0 +1,59 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_data/chanwise.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution3d/chanwise/kern.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +bool Convolution3DBackwardDataImpl::AlgoChanwise::is_available( + const SizeArgs &args) const { + auto &&fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCDHW && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + fm.spatial_ndim == 3 && fm.icpg == 1 && + fm.dilation[0] == 1 && fm.dilation[1] == 1 && + fm.dilation[2] == 1 && + !fm.should_flip; +} + +size_t Convolution3DBackwardDataImpl::AlgoChanwise::get_workspace_in_bytes( + const SizeArgs &) const { + return 0; +} + +void Convolution3DBackwardDataImpl::AlgoChanwise::exec( + const ExecArgs &args) const { + auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); + auto stream = cuda_stream(args.handle); + switch (args.diff_layout->dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + { \ + using ctype = DTypeTrait<_dt>::ctype; \ + return chanwise::run_bwd_data( \ + args.grad_tensor->ptr(), \ + args.diff_tensor->ptr(), \ + args.filter_tensor->ptr(), \ + kparam, stream); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + default: + break; + } + megdnn_assert_internal(0); +} +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/backward_data/cudnn.cpp b/dnn/src/cuda/convolution3d/backward_data/cudnn.cpp new file mode 100644 index 00000000..01caa236 --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_data/cudnn.cpp @@ -0,0 +1,106 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_data/cudnn.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" + +#include "src/cuda/utils.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/convolution3d/helper.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +bool Convolution3DBackwardDataImpl::AlgoCUDNN::is_available( + const SizeArgs &args) const { + CUDNNBwdDataDescs D; + + if (!is_cudnn_supported(args.as_fwd_args())) + return false; + + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle->cudnn_handle(), + D.filter_desc.desc, + D.diff_desc.desc, + D.conv_desc.desc, + D.grad_desc.desc, + m_cudnn_enum, + &workspace_size); + return status == CUDNN_STATUS_SUCCESS; +} + +size_t Convolution3DBackwardDataImpl::AlgoCUDNN::get_workspace_in_bytes( + const SizeArgs &args) const { + CUDNNBwdDataDescs D; + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle->cudnn_handle(), + D.filter_desc.desc, + D.diff_desc.desc, + D.conv_desc.desc, + D.grad_desc.desc, + m_cudnn_enum, + &workspace_size); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv bwd_data get workspace failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); + return workspace_size; +} + +void Convolution3DBackwardDataImpl::AlgoCUDNN::exec( + const ExecArgs &args) const { + CUDNNBwdDataDescs D; + args.init_desc(D); + float alpha = 1.0f, beta = 0.0f; + auto status = cudnnConvolutionBackwardData(args.handle->cudnn_handle(), + &alpha, + D.filter_desc.desc, args.filter_tensor->raw_ptr, + D.diff_desc.desc, args.diff_tensor->raw_ptr, + D.conv_desc.desc, + m_cudnn_enum, + args.workspace.raw_ptr, + args.workspace.size, + &beta, + D.grad_desc.desc, + args.grad_tensor->raw_ptr); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv bwd_data failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); +} + +void Convolution3DBackwardDataImpl::AlgoPack::fill_cudnn_algos() { +#define V1(v) #v +#define V(v) V1(v) + +#define DEF_ALGO(NAME, REPROD) \ + cudnn.push_back({ \ + REPROD, #NAME \ + "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) \ + "." V(CUDNN_PATCHLEVEL), \ + NAME}) + +DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false); +DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true); +DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true); +#if !(CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1) +#pragma message "not latest cudnn" +#endif + +#undef DEF_ALGO + +#undef V +#undef V1 +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/backward_data/group_conv.cpp b/dnn/src/cuda/convolution3d/backward_data/group_conv.cpp new file mode 100644 index 00000000..e2e992b6 --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_data/group_conv.cpp @@ -0,0 +1,82 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_data/group_conv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +void Convolution3DBackwardDataImpl::AlgoGroupConvGeneral::modify_size_args( + Convolution3DBackwardDataImpl::AlgoBase::SizeArgs &args, + TensorLayout &diff_pg, TensorLayout &grad_pg) { + diff_pg = *args.diff_layout; + grad_pg = *args.grad_layout; + auto nr_grp = args.filter_meta.group; + args.filter_meta.group = 1; + diff_pg.shape[1] /= nr_grp; + grad_pg.shape[1] /= nr_grp; + args.diff_layout = &diff_pg; + args.grad_layout = &grad_pg; +} + +Convolution3DBackwardDataImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral( + AlgoBase *impl): + m_impl{impl} +{ + m_name = "group_conv3d:"; + m_name += impl->name(); +} + +bool Convolution3DBackwardDataImpl::AlgoGroupConvGeneral::is_available( + const SizeArgs &args) const { + auto sub_args = args; + TensorLayout diff_pg, grad_pg; + modify_size_args(sub_args, diff_pg, grad_pg); + return m_impl->is_available(sub_args); +} + +size_t Convolution3DBackwardDataImpl::AlgoGroupConvGeneral:: +get_workspace_in_bytes(const SizeArgs &args) const { + auto sub_args = args; + TensorLayout diff_pg, grad_pg; + modify_size_args(sub_args, diff_pg, grad_pg); + return m_impl->get_workspace_in_bytes(sub_args); +} + +void Convolution3DBackwardDataImpl::AlgoGroupConvGeneral::exec( + const ExecArgs &args) const { + auto sub_args = args; + TensorND tflt{*args.filter_tensor}, tdiff{*args.diff_tensor}, + tgrad{*args.grad_tensor}; + modify_size_args(sub_args, tdiff.layout, tgrad.layout); + sub_args.filter_tensor = &tflt; + sub_args.diff_tensor = &tdiff; + sub_args.grad_tensor = &tgrad; + auto grp = args.filter_meta.group; + + auto &&fm = args.filter_meta; + auto strd_flt = (fm.icpg * fm.ocpg * + fm.spatial[0] * fm.spatial[1] * fm.spatial[2] * tflt.layout.dtype.size()), + strd_diff = ( + tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), + strd_grad = ( + tgrad.layout.stride[1] * fm.icpg * tgrad.layout.dtype.size()); + for (uint32_t g = 0; g < grp; ++ g) { + m_impl->exec(sub_args); + incr_voidp(tflt.raw_ptr, strd_flt); + incr_voidp(tdiff.raw_ptr, strd_diff); + incr_voidp(tgrad.raw_ptr, strd_grad); + } +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/backward_filter/algo.cpp b/dnn/src/cuda/convolution3d/backward_filter/algo.cpp new file mode 100644 index 00000000..0af54db1 --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_filter/algo.cpp @@ -0,0 +1,111 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_filter/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +Convolution3DBackwardFilterImpl::AlgoPack::AlgoPack() { + non_cudnn_algos.push_back(&chanwise); + non_cudnn_algos.push_back(&inplace_matmul); + all_algos.push_back(&chanwise); // prefer chanwise + + fill_cudnn_algos(); + for (auto &&i: cudnn) { + all_algos.push_back(&i); + } + + all_algos.push_back(&inplace_matmul); + all_algos.reserve(all_algos.size() * 2); + + // add gconv algos by AlgoGroupConvGeneral + auto all_algos_data = all_algos.data(); + for (size_t i = 1; i < all_algos.size(); ++ i) { + gconv.push_back({all_algos[i]}); + } + for (size_t i = 1; i < all_algos.size(); ++ i) { + algo2gconv[all_algos[i]] = &gconv[i - 1]; + } + for (auto &&i: gconv) { + all_algos.push_back(&i); + } + megdnn_assert(all_algos_data == all_algos.data()); + non_cudnn_algos.push_back(all_algos.rbegin()[0]); //group inplace_matmul +} + +Convolution3DBackwardFilterImpl::AlgoCUDNN* +Convolution3DBackwardFilterImpl::AlgoPack::cudnn_from_enum( + cudnnConvolutionBwdFilterAlgo_t algo) { + for (auto &&i: cudnn) { + if (i.cudnn_enum() == algo) + return &i; + } + megdnn_throw(megdnn_mangle(ssprintf( + "can not find cudnn bwd_filter algorithm %d", + static_cast(algo)))); +} + +Convolution3DBackwardFilterImpl::AlgoPack +Convolution3DBackwardFilterImpl::sm_algo_pack; + +Convolution3DBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( + Convolution3DBackwardFilterImpl *o, + const TensorLayout &src, const TensorLayout &diff, + const TensorLayout &grad): + SizeArgs(o, src, diff, o->check_layout_fwd(src, grad, diff)) +{ +} + +Convolution3DBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( + Convolution3DBackwardFilterImpl *o, + const TensorLayout &src, const TensorLayout &diff, + const CanonizedFilterMeta &grad): + handle{concrete_handle(o->handle())}, + src_layout{&src}, + diff_layout{&diff}, + grad_filter_meta{grad}, + opr{o} +{ +} + +Convolution3DBackwardFilterImpl::AlgoBase::ExecArgs::ExecArgs( + Convolution3DBackwardFilterImpl *opr, + _megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace): + SizeArgs(opr, src.layout, diff.layout, grad.layout), + src_tensor{&src}, diff_tensor{&diff}, grad_tensor{&grad}, + workspace{workspace} +{ +} + +std::string +Convolution3DBackwardFilterImpl::AlgoBase::SizeArgs::to_string() const { + auto &&fm = grad_filter_meta; + MEGDNN_MARK_USED_VAR(fm); + return megdnn_mangle(ssprintf( + "src=%s diff=%s grad_filter=%u{%u,%u,%u,%u,%u}, " + "pad=%ux%ux%u, stride=%ux%ux%u, dilate=%ux%ux%u, xcorr=%d, dtype=%s,%s", + src_layout->to_string().c_str(), + diff_layout->to_string().c_str(), + fm.group, fm.ocpg, fm.icpg, + fm.spatial[0], fm.spatial[1], fm.spatial[2], + fm.padding[0], fm.padding[1], fm.padding[2], + fm.stride[0], fm.stride[1], fm.stride[2], + fm.dilation[0], fm.dilation[1], fm.dilation[2], + !fm.should_flip, + src_layout->dtype.name(), diff_layout->dtype.name())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/backward_filter/algo.h b/dnn/src/cuda/convolution3d/backward_filter/algo.h new file mode 100644 index 00000000..3750e25b --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_filter/algo.h @@ -0,0 +1,202 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_filter/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/cuda/convolution3d/helper.h" +#include + +namespace megdnn { +namespace cuda { + +class Convolution3DBackwardFilterImpl::AlgoBase: public Algorithm { + protected: + ~AlgoBase() = default; + + public: + struct SizeArgs { + HandleImpl *handle; + const TensorLayout *src_layout, *diff_layout; + CanonizedFilterMeta grad_filter_meta; + Convolution3DBackwardFilterImpl *opr; + + std::string to_string() const; + void init_desc(convolution3d::CUDNNBwdFilterDescs &desc) const { + desc.set(*src_layout, *diff_layout, grad_filter_meta, + opr->param()); + } + SizeArgs(Convolution3DBackwardFilterImpl *opr, + const TensorLayout &src, const TensorLayout &diff, + const TensorLayout &grad); + SizeArgs(Convolution3DBackwardFilterImpl *opr, + const TensorLayout &src, const TensorLayout &diff, + const CanonizedFilterMeta &grad); + + convolution3d::ForwardSizeArgs as_fwd_args() const { + return {handle, src_layout, grad_filter_meta, diff_layout, + opr->param().data_type}; + } + }; + struct ExecArgs: public SizeArgs { + const TensorND *src_tensor, *diff_tensor, *grad_tensor; + Workspace workspace; + + ExecArgs(Convolution3DBackwardFilterImpl *opr, + _megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs &args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs &args) const = 0; + virtual void exec(const ExecArgs &args) const = 0; + + bool is_available_wk(const SizeArgs &args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "conv bwd filter algo %s: " + "required workspace %zu bytes, got %zu", + name(), req, workspace.size); + return *this; + } + + virtual bool is_cudnn() const { + return false; + } +}; + +class Convolution3DBackwardFilterImpl::AlgoCUDNN final : public AlgoBase { + bool m_is_reproducible; + const char *m_name; + cudnnConvolutionBwdFilterAlgo_t m_cudnn_enum; + + public: + + AlgoCUDNN(bool is_reproducible, const char *name, + cudnnConvolutionBwdFilterAlgo_t cudnn_enum): + m_is_reproducible(is_reproducible), + m_name(name), + m_cudnn_enum(cudnn_enum) + {} + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + bool is_reproducible() const override { + return m_is_reproducible; + } + + const char* name() const override { + return m_name; + } + + cudnnConvolutionBwdFilterAlgo_t cudnn_enum() const { + return m_cudnn_enum; + } + + bool is_cudnn() const override { + return true; + } +}; + + +class Convolution3DBackwardFilterImpl::AlgoInplaceMatmul final: public AlgoBase { + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { 
+ return "INPLACE_MATMUL"; + } + bool is_reproducible() const override { + return false; + } +}; + +class Convolution3DBackwardFilterImpl::AlgoChanwise final: public AlgoBase { + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "CHANNEL_WISE"; + } + bool is_reproducible() const override { + return true; + } +}; + +//! implement group conv by another algo +class Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral final: public AlgoBase { + AlgoBase *m_impl; + std::string m_name; + + public: + AlgoGroupConvGeneral(AlgoBase *impl); + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return m_name.c_str(); + } + + bool is_reproducible() const override { + return m_impl->is_reproducible(); + } + + static void modify_size_args(SizeArgs &args, + TensorLayout &src_pg, TensorLayout &diff_pg); +}; + +class Convolution3DBackwardFilterImpl::AlgoPack { + // defined in cudnn.cpp + void fill_cudnn_algos(); + + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator = (const AlgoPack &) = delete; + + public: + AlgoPack(); + + std::vector cudnn; + AlgoInplaceMatmul inplace_matmul; + AlgoChanwise chanwise; + std::vector gconv; + std::unordered_map algo2gconv; + + std::vector + //! all algorithms + all_algos, + //! non-cudnn algos, used for heuristic if cudnn is not supported + non_cudnn_algos; + + AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdFilterAlgo_t algo); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/backward_filter/chanwise.cpp b/dnn/src/cuda/convolution3d/backward_filter/chanwise.cpp new file mode 100644 index 00000000..55248bff --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_filter/chanwise.cpp @@ -0,0 +1,60 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_filter/chanwise.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution3d/chanwise/kern.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +bool Convolution3DBackwardFilterImpl::AlgoChanwise::is_available( + const SizeArgs &args) const { + auto &&fm = args.grad_filter_meta; + return fm.format == Param::Format::NCDHW && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + fm.spatial_ndim == 3 && fm.icpg == 1 && + fm.dilation[0] == 1 && fm.dilation[1] == 1 && + fm.dilation[2] == 1 && + !fm.should_flip; +} + +size_t Convolution3DBackwardFilterImpl::AlgoChanwise::get_workspace_in_bytes( + const SizeArgs &) const { + return 0; +} + +void Convolution3DBackwardFilterImpl::AlgoChanwise::exec( + const ExecArgs &args) const { + auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); + auto stream = cuda_stream(args.handle); + switch (args.diff_layout->dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + { \ + using ctype = DTypeTrait<_dt>::ctype; \ + return chanwise::run_bwd_filter( \ + args.grad_tensor->ptr(), \ + args.src_tensor->ptr(), \ + args.diff_tensor->ptr(), \ + kparam, stream); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + default: + break; + } + megdnn_assert_internal(0); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp b/dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp new file mode 100644 index 00000000..1ff883db --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp @@ -0,0 +1,94 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_filter/cudnn.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" + +#include "src/cuda/convolution3d/helper.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +bool Convolution3DBackwardFilterImpl::AlgoCUDNN::is_available( + const SizeArgs& args) const { + CUDNNBwdFilterDescs D; + + if (!is_cudnn_supported(args.as_fwd_args())) + return false; + + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle->cudnn_handle(), D.src_desc.desc, D.diff_desc.desc, + D.conv_desc.desc, D.grad_desc.desc, m_cudnn_enum, &workspace_size); + return status == CUDNN_STATUS_SUCCESS; +} + +size_t Convolution3DBackwardFilterImpl::AlgoCUDNN::get_workspace_in_bytes( + const SizeArgs& args) const { + CUDNNBwdFilterDescs D; + + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle->cudnn_handle(), D.src_desc.desc, D.diff_desc.desc, + D.conv_desc.desc, D.grad_desc.desc, m_cudnn_enum, &workspace_size); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv bwd_filter get workspace failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); + return workspace_size; +} + +void Convolution3DBackwardFilterImpl::AlgoCUDNN::exec( + const ExecArgs& args) const { + CUDNNBwdFilterDescs D; + args.init_desc(D); + float alpha = 1.0f, beta = 0.0f; + auto status = cudnnConvolutionBackwardFilter( + args.handle->cudnn_handle(), &alpha, D.src_desc.desc, + args.src_tensor->raw_ptr, D.diff_desc.desc, + args.diff_tensor->raw_ptr, D.conv_desc.desc, m_cudnn_enum, + args.workspace.raw_ptr, args.workspace.size, &beta, + D.grad_desc.desc, args.grad_tensor->raw_ptr); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv bwd_data failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); +} + +void Convolution3DBackwardFilterImpl::AlgoPack::fill_cudnn_algos() { +#define V1(v) #v +#define V(v) V1(v) + +#define DEF_ALGO(NAME, REPROD) \ + cudnn.push_back({REPROD, \ + #NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V( \ + CUDNN_PATCHLEVEL), \ + NAME}) + + DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false); +#pragma message \ + "fp16 dilated conv with odd size filter, only algo_1 works, need focus on doc" + DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false); + +#if !(CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1) +#pragma message "not latest cudnn" +#endif + +#undef DEF_ALGO + +#undef V +#undef V1 +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp b/dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp new file mode 100644 index 00000000..b71e54a2 --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp @@ -0,0 +1,83 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_filter/group_conv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +void Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral::modify_size_args( + Convolution3DBackwardFilterImpl::AlgoBase::SizeArgs &args, + TensorLayout &src_pg, TensorLayout &diff_pg) { + src_pg = *args.src_layout; + diff_pg = *args.diff_layout; + auto nr_grp = args.grad_filter_meta.group; + args.grad_filter_meta.group = 1; + src_pg.shape[1] /= nr_grp; + diff_pg.shape[1] /= nr_grp; + args.src_layout = &src_pg; + args.diff_layout = &diff_pg; +} + +Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral( + AlgoBase *impl): + m_impl{impl} +{ + m_name = "group_conv3d:"; + m_name += impl->name(); +} + +bool Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral::is_available( + const SizeArgs &args) const { + auto sub_args = args; + TensorLayout src_pg, diff_pg; + modify_size_args(sub_args, src_pg, diff_pg); + return m_impl->is_available(sub_args); +} + +size_t Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral:: +get_workspace_in_bytes(const SizeArgs &args) const { + auto sub_args = args; + TensorLayout src_pg, diff_pg; + modify_size_args(sub_args, src_pg, diff_pg); + return m_impl->get_workspace_in_bytes(sub_args); +} + +void Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral::exec( + const ExecArgs &args) const { + auto sub_args = args; + TensorND tsrc{*args.src_tensor}, tdiff{*args.diff_tensor}, + tgrad{*args.grad_tensor}; + modify_size_args(sub_args, tsrc.layout, tdiff.layout); + sub_args.src_tensor = &tsrc; + sub_args.diff_tensor = &tdiff; + sub_args.grad_tensor = &tgrad; + + auto &&fm = args.grad_filter_meta; + auto grp = fm.group; + + auto strd_src = ( + tsrc.layout.stride[1] * fm.icpg * tsrc.layout.dtype.size()), + strd_diff = ( + tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), + strd_grad = (fm.icpg * fm.ocpg * + fm.spatial[0] * fm.spatial[1] * fm.spatial[2] * tgrad.layout.dtype.size()); + for (uint32_t g = 0; g < grp; ++ g) { + m_impl->exec(sub_args); + incr_voidp(tsrc.raw_ptr, strd_src); + incr_voidp(tdiff.raw_ptr, strd_diff); + incr_voidp(tgrad.raw_ptr, strd_grad); + } +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul.cpp b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul.cpp new file mode 100644 index 00000000..132a07b4 --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul.cpp @@ -0,0 +1,68 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_filter/inplace_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" +#include "./inplace_matmul_impl.cuh" + +using namespace megdnn; +using namespace cuda; + +bool Convolution3DBackwardFilterImpl::AlgoInplaceMatmul::is_available( + const SizeArgs &args) const { + auto &&fm = args.grad_filter_meta; + return args.grad_filter_meta.format == Param::Format::NCDHW && + args.src_layout->dtype == dtype::Float32() && + fm.group == 1 && fm.spatial_ndim == 3; +} + +size_t Convolution3DBackwardFilterImpl::AlgoInplaceMatmul::get_workspace_in_bytes( + const SizeArgs &) const { + return 0; +} + +void Convolution3DBackwardFilterImpl::AlgoInplaceMatmul::exec( + const ExecArgs &args) const { + auto &&fm = args.grad_filter_meta; + size_t N = args.src_layout->shape[0], + IC = fm.icpg, + ID = args.src_layout->shape[2], + IH = args.src_layout->shape[3], + IW = args.src_layout->shape[4], + OC = fm.ocpg, + OD = args.diff_layout->shape[2], + OH = args.diff_layout->shape[3], + OW = args.diff_layout->shape[4], + FD = fm.spatial[0], + FH = fm.spatial[1], + FW = fm.spatial[2], + DD = fm.dilation[0], + DH = fm.dilation[1], + DW = fm.dilation[2]; + auto stream = args.handle->stream(); + + convolution3d::exec_inplace_matmul_bwd_filter( + args.diff_tensor->ptr(), + args.src_tensor->ptr(), + args.grad_tensor->ptr(), + N, + args.src_layout->stride[0], + args.diff_layout->stride[0], + IC, ID, IH, IW, + OC, OD, OH, OW, + FD, FH, FW, + fm.padding[0], fm.padding[1], fm.padding[2], + fm.stride[0], fm.stride[1], fm.stride[2], + DD, DH, DW, + !fm.should_flip, stream); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu new file mode 100644 index 00000000..77a5b8f4 --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu @@ -0,0 +1,420 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./inplace_matmul_impl.cuh" +#include "src/cuda/utils.cuh" +#include +#include +using namespace megdnn; +using namespace cuda; + +namespace { + +struct BufferFetcherTexture { + cudaTextureObject_t tex; + + __device__ __forceinline__ float get(uint32_t offset) { + return tex1Dfetch(tex, offset); + } +}; + +struct BufferFetcherRaw { + const float *ptr; + + __device__ __forceinline__ float get(uint32_t offset) { + return ptr[offset]; + } +}; + +struct BufferFetcherTextureHost { + bool init_succ; + BufferFetcherTexture val; + + BufferFetcherTextureHost(float *p, const size_t n); + + ~BufferFetcherTextureHost() { + reset(); + } + + void reset() { + if (init_succ) { + cuda_check(cudaDestroyTextureObject(val.tex)); + init_succ = false; + } + } +}; + +BufferFetcherTextureHost::BufferFetcherTextureHost(float *p, const size_t n) { + init_succ = false; + cudaTextureObject_t tex_obj; + + cudaResourceDesc res_desc; + memset(&res_desc, 0, sizeof(cudaResourceDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = static_cast(p); + res_desc.res.linear.sizeInBytes = n*sizeof(float); + res_desc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaTextureDesc tex_desc; + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + if (cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL) == cudaSuccess) { + val.tex = tex_obj; + init_succ = true; + } else { + cudaGetLastError(); // reset error + } +} + +template +struct KernelPtr { + typedef void(*type)(BufferFetcher, BufferFetcher, float*, + uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); +}; + +//! 1 -> 0xffffffff, 0 -> 0x00000000 +__device__ __forceinline__ uint32_t bool_as_mask(uint32_t cond) { + return (!cond) - 1u; +} + +union FloatAndU32 { + float f; + uint32_t u; +}; + +//! \p mask must be either all 1 or 0 bits +template +__device__ __forceinline__ float visit_with_mask( + BufferFetcher buf, uint32_t offset, uint32_t mask) { + FloatAndU32 f; + f.f = buf.get(offset & mask); + f.u &= mask; + return f.f; +} + +__device__ __forceinline__ uint32_t with_dilation( + const uint32_t origin, const uint32_t D) { + return origin * D; +} + +template +__global__ void conv_kernel(BufferFetcher diff, BufferFetcher src, + float *grad, + const uint32_t N, const uint32_t INP_BS, const uint32_t OUT_BS, + const uint32_t IC, const uint32_t ID, const uint32_t IH, const uint32_t IW, + const uint32_t OC, const uint32_t OD, const uint32_t OH, const uint32_t OW, + const uint32_t FD, const uint32_t FH, const uint32_t FW, + const uint32_t SD, const uint32_t SH, const uint32_t SW, + const uint32_t PD, const uint32_t PH, const uint32_t PW, + const uint32_t DD, const uint32_t DH, const uint32_t DW) +{ + const uint32_t BM = BY < BX ? 
BY : BX; + + uint32_t n = blockIdx.z; + + const uint32_t tidx = threadIdx.x; + const uint32_t tidy = threadIdx.y; + const uint32_t posx = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t posy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t posx2 = posx<<2; + const uint32_t posy2 = posy<<2; + + const uint32_t heightA = OC; + const uint32_t widthA = OD*OH*OW; + const uint32_t heightB = widthA; + const uint32_t widthB = IC*FD*FH*FW; + + uint32_t ic0 = (posx2+0) / FW / FH / FD; + uint32_t fd0 = (posx2+0) / FW / FH % FD; + uint32_t fh0 = (posx2+0) / FW % FH; + uint32_t fw0 = (posx2+0) % FW; + + uint32_t ic1 = (posx2+1) / FW / FH / FD; + uint32_t fd1 = (posx2+1) / FW / FH % FD; + uint32_t fh1 = (posx2+1) / FW % FH; + uint32_t fw1 = (posx2+1) % FW; + + uint32_t ic2 = (posx2+2) / FW / FH / FD; + uint32_t fd2 = (posx2+2) / FW / FH % FD; + uint32_t fh2 = (posx2+2) / FW % FH; + uint32_t fw2 = (posx2+2) % FW; + + uint32_t ic3 = (posx2+3) / FW / FH / FD; + uint32_t fd3 = (posx2+3) / FW / FH % FD; + uint32_t fh3 = (posx2+3) / FW % FH; + uint32_t fw3 = (posx2+3) % FW; + + if (!is_xcorr) { + fd0 = FD - fd0 - 1; + fd1 = FD - fd1 - 1; + fd2 = FD - fd2 - 1; + fd3 = FD - fd3 - 1; + fh0 = FH - fh0 - 1; + fh1 = FH - fh1 - 1; + fh2 = FH - fh2 - 1; + fh3 = FH - fh3 - 1; + fw0 = FW - fw0 - 1; + fw1 = FW - fw1 - 1; + fw2 = FW - fw2 - 1; + fw3 = FW - fw3 - 1; + } + + const uint32_t fd0d = with_dilation(fd0, DD); + const uint32_t fd1d = with_dilation(fd1, DD); + const uint32_t fd2d = with_dilation(fd2, DD); + const uint32_t fd3d = with_dilation(fd3, DD); + + const uint32_t fh0d = with_dilation(fh0, DH); + const uint32_t fh1d = with_dilation(fh1, DH); + const uint32_t fh2d = with_dilation(fh2, DH); + const uint32_t fh3d = with_dilation(fh3, DH); + + const uint32_t fw0d = with_dilation(fw0, DW); + const uint32_t fw1d = with_dilation(fw1, DW); + const uint32_t fw2d = with_dilation(fw2, DW); + const uint32_t fw3d = with_dilation(fw3, DW); + + const uint32_t fp0 = ic0 * ID*IH*IW + fd0d * IH*IW + fh0d * IW + fw0d; + const uint32_t fp1 = ic1 * ID*IH*IW + fd1d * IH*IW + fh1d * IW + fw1d; + const uint32_t fp2 = ic2 * ID*IH*IW + fd2d * IH*IW + fh2d * IW + fw2d; + const uint32_t fp3 = ic3 * ID*IH*IW + fd3d * IH*IW + fh3d * IW + fw3d; + + const uint32_t OP = OH*OW; + + __shared__ float4 localA[BY][BM]; + __shared__ float4 localB[BM][BX]; + uint32_t i = 0u; + + uint32_t offsetA = n * OUT_BS + posy2 * widthA + tidx; + uint32_t offsetB = n * INP_BS - PD*IH*IW - PH*IW - PW; + + float4 sum0 = {0.0f, 0.0f, 0.0f, 0.0f}, + sum1 = {0.0f, 0.0f, 0.0f, 0.0f}, + sum2 = {0.0f, 0.0f, 0.0f, 0.0f}, + sum3 = {0.0f, 0.0f, 0.0f, 0.0f}; + + uint32_t od = tidy / (OW*OH); + uint32_t oh = tidy / (OW) % OH; + uint32_t ow = tidy % OW; + uint32_t odm = tidy % (OW*OH); + + const uint32_t ods = BM / (OW*OH); + const uint32_t ohs = BM / (OW) % OH; + const uint32_t ows = BM % OW; + const uint32_t odms = BM % (OW*OH); + + for (; i < widthA; i += BM, offsetA += BM) { + // load localA + if (tidx < BM) { + localA[tidy][tidx].x = diff.get(offsetA + 0*widthA); + localA[tidy][tidx].y = diff.get(offsetA + 1*widthA); + localA[tidy][tidx].z = diff.get(offsetA + 2*widthA); + localA[tidy][tidx].w = diff.get(offsetA + 3*widthA); + } + if (tidy < BM) { + uint32_t tmp = offsetB + od*SD*IH*IW + oh*SH*IW + ow*SW, + ok = bool_as_mask(tidy+i < heightB), + p0 = bool_as_mask( + fd0d+od*SD >= PD && fd0d+od*SD < ID+PD && + fh0d+oh*SH >= PH && fh0d+oh*SH < IH+PH && + fw0d+ow*SW >= PW && fw0d+ow*SW < IW+PW), + p1 = bool_as_mask( + fd1d+od*SD >= PD && fd1d+od*SD < 
ID+PD && + fh1d+oh*SH >= PH && fh1d+oh*SH < IH+PH && + fw1d+ow*SW >= PW && fw1d+ow*SW < IW+PW), + p2 = bool_as_mask( + fd2d+od*SD >= PD && fd2d+od*SD < ID+PD && + fh2d+oh*SH >= PH && fh2d+oh*SH < IH+PH && + fw2d+ow*SW >= PW && fw2d+ow*SW < IW+PW), + p3 = bool_as_mask( + fd3d+od*SD >= PD && fd3d+od*SD < ID+PD && + fh3d+oh*SH >= PH && fh3d+oh*SH < IH+PH && + fw3d+ow*SW >= PW && fw3d+ow*SW < IW+PW); + + localB[tidy][tidx].x = visit_with_mask(src, tmp+fp0, ok & p0); + localB[tidy][tidx].y = visit_with_mask(src, tmp+fp1, ok & p1); + localB[tidy][tidx].z = visit_with_mask(src, tmp+fp2, ok & p2); + localB[tidy][tidx].w = visit_with_mask(src, tmp+fp3, ok & p3); + } + __syncthreads(); + for (uint32_t j = 0u; j < BM; ++j) { + float4 tmpA = localA[tidy][j]; + float4 tmpB = localB[j][tidx]; + sum0.x += tmpA.x * tmpB.x; + sum0.y += tmpA.x * tmpB.y; + sum0.z += tmpA.x * tmpB.z; + sum0.w += tmpA.x * tmpB.w; + sum1.x += tmpA.y * tmpB.x; + sum1.y += tmpA.y * tmpB.y; + sum1.z += tmpA.y * tmpB.z; + sum1.w += tmpA.y * tmpB.w; + sum2.x += tmpA.z * tmpB.x; + sum2.y += tmpA.z * tmpB.y; + sum2.z += tmpA.z * tmpB.z; + sum2.w += tmpA.z * tmpB.w; + sum3.x += tmpA.w * tmpB.x; + sum3.y += tmpA.w * tmpB.y; + sum3.z += tmpA.w * tmpB.z; + sum3.w += tmpA.w * tmpB.w; + + } + oh += ohs; + ow += ows; + oh += (ow >= OW); + ow -= (ow >= OW) * OW; + oh -= (oh >= OH) * OH; + + od += ods; + odm += odms; + od += (odm >= OP); + odm -= (odm >= OP) * OP; + __syncthreads(); + } + + // widthB == IC*FD*FH*FW, heightA == OC + const uint32_t grad_idx = posy2 * widthB + posx2; + bool y0 = (posy2+0 < heightA); + bool y1 = (posy2+1 < heightA); + bool y2 = (posy2+2 < heightA); + bool y3 = (posy2+3 < heightA); + bool x0 = (posx2+0 < widthB); + bool x1 = (posx2+1 < widthB); + bool x2 = (posx2+2 < widthB); + bool x3 = (posx2+3 < widthB); + if (y0) { + if (x0) atomicAdd(&grad[grad_idx + 0*widthB + 0], sum0.x); + if (x1) atomicAdd(&grad[grad_idx + 0*widthB + 1], sum0.y); + if (x2) atomicAdd(&grad[grad_idx + 0*widthB + 2], sum0.z); + if (x3) atomicAdd(&grad[grad_idx + 0*widthB + 3], sum0.w); + } + if (y1) { + if (x0) atomicAdd(&grad[grad_idx + 1*widthB + 0], sum1.x); + if (x1) atomicAdd(&grad[grad_idx + 1*widthB + 1], sum1.y); + if (x2) atomicAdd(&grad[grad_idx + 1*widthB + 2], sum1.z); + if (x3) atomicAdd(&grad[grad_idx + 1*widthB + 3], sum1.w); + } + if (y2) { + if (x0) atomicAdd(&grad[grad_idx + 2*widthB + 0], sum2.x); + if (x1) atomicAdd(&grad[grad_idx + 2*widthB + 1], sum2.y); + if (x2) atomicAdd(&grad[grad_idx + 2*widthB + 2], sum2.z); + if (x3) atomicAdd(&grad[grad_idx + 2*widthB + 3], sum2.w); + } + if (y3) { + if (x0) atomicAdd(&grad[grad_idx + 3*widthB + 0], sum3.x); + if (x1) atomicAdd(&grad[grad_idx + 3*widthB + 1], sum3.y); + if (x2) atomicAdd(&grad[grad_idx + 3*widthB + 2], sum3.z); + if (x3) atomicAdd(&grad[grad_idx + 3*widthB + 3], sum3.w); + } +} + +} // anonymous namespace + +void convolution3d::exec_inplace_matmul_bwd_filter( + const float *diff, const float *src, float *grad, + size_t N, size_t INP_BS, size_t OUT_BS, + size_t IC, size_t ID, size_t IH, size_t IW, + size_t OC, size_t OD, size_t OH, size_t OW, + size_t FD, size_t FH, size_t FW, + size_t PD, size_t PH, size_t PW, + size_t SD, size_t SH, size_t SW, + size_t DD, size_t DH, size_t DW, + bool is_xcorr, + cudaStream_t stream) { + BufferFetcherTextureHost diff_tex(const_cast(diff), OC*OD*OH*OW*N), + src_tex(const_cast(src), N * INP_BS); + BufferFetcherRaw diff_buf, src_buf; + src_buf.ptr = src; + diff_buf.ptr = diff; + if (!src_tex.init_succ || !diff_tex.init_succ) { + 
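// If either texture object failed to initialize, release both and fall back to the plain global-memory fetchers (diff_buf / src_buf) when dispatching the kernel below. +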
src_tex.reset(); + diff_tex.reset(); + } + int m = OC; + int n = IC*FD*FH*FW; + int BY = 1; + int BX = 1; + if (m <= 64) { + while (BY < 16 && (BY<<2) < m) BY <<= 1; + BX = 256 / BY; + } else if (n <= 64) { + while (BX < 16 && (BX<<2) < n) BX <<= 1; + BY = 256 / BX; + } else { + BX = BY = 16; + } + cudaMemset(grad, 0, OC * IC * FD * FH * FW * sizeof(float)); + dim3 blocks(DIVUP(n, 4*BX), DIVUP(m, 4*BY), N); + dim3 threads(BX, BY); +#define DISPATCH_BX_BY(BX, BY) do { \ + if (diff_tex.init_succ) { \ + KernelPtr::type kptr; \ + if (is_xcorr) { \ + kptr = conv_kernel; \ + } else { \ + kptr = conv_kernel; \ + } \ + kptr<<>>( \ + diff_tex.val, src_tex.val, grad, \ + N, INP_BS, OUT_BS, \ + IC, ID, IH, IW, \ + OC, OD, OH, OW, \ + FD, FH, FW, \ + SD, SH, SW, \ + PD, PH, PW, \ + DD, DH, DW); \ + } else { \ + KernelPtr::type kptr; \ + if (is_xcorr) { \ + kptr = conv_kernel; \ + } else { \ + kptr = conv_kernel; \ + } \ + kptr<<>>( \ + diff_buf, src_buf, grad, \ + N, INP_BS, OUT_BS, \ + IC, ID, IH, IW, \ + OC, OD, OH, OW, \ + FD, FH, FW, \ + SD, SH, SW, \ + PD, PH, PW, \ + DD, DH, DW); \ + } \ +} while (0) +#define DISPATCH_BX(BX) do { \ + DISPATCH_BX_BY(BX, 256/BX); \ +} while (0) +#define DISPATCH() do { \ + switch (BX) { \ + case 1: DISPATCH_BX(1); break; \ + case 2: DISPATCH_BX(2); break; \ + case 4: DISPATCH_BX(4); break; \ + case 8: DISPATCH_BX(8); break; \ + case 16: DISPATCH_BX(16); break; \ + case 32: DISPATCH_BX(32); break; \ + case 64: DISPATCH_BX(64); break; \ + case 128: DISPATCH_BX(128); break; \ + case 256: DISPATCH_BX(256); break; \ + default: \ + report_error("no usable kernel"); \ + } \ +} while (0) + DISPATCH(); +#undef DISPATCH +#undef DISPATCH_BX +#undef DISPATCH_BX_BY + after_kernel_launch(); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cuh b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cuh new file mode 100644 index 00000000..871056dd --- /dev/null +++ b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cuh @@ -0,0 +1,37 @@ +/** + * \file dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include +#include +#include + +namespace megdnn { +namespace cuda { +namespace convolution3d { + +void exec_inplace_matmul_bwd_filter( + const float *diff, const float *src, float *grad, + size_t N, size_t INP_BS, size_t OUT_BS, + size_t IC, size_t ID, size_t IH, size_t IW, + size_t OC, size_t OD, size_t OH, size_t OW, + size_t FD, size_t FH, size_t FW, + size_t PD, size_t PH, size_t PW, + size_t SD, size_t SH, size_t SW, + size_t DD, size_t DH, size_t DW, + bool is_xcorr, + cudaStream_t stream); + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/chanwise/bwd_data.cu b/dnn/src/cuda/convolution3d/chanwise/bwd_data.cu new file mode 100644 index 00000000..598115ed --- /dev/null +++ b/dnn/src/cuda/convolution3d/chanwise/bwd_data.cu @@ -0,0 +1,215 @@ +/** + * \file dnn/src/cuda/convolution3d/chanwise/bwd_data.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" +#include "./kern_helper.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; +using namespace chanwise; + +namespace { + +template +__global__ void kern_bwd_data( + T *src_grad, const T *dst_grad, const T *flt_tot, Param param) { + + extern __shared__ uint8_t flt_storage[]; + + T * const flt = reinterpret_cast(flt_storage); + + const uint32_t + N = param.batch, IC = param.src_chl, ic = blockIdx.x, + ID = param.src_d, IH = param.src_h, IW = param.src_w, + CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, + FD = FD_SET ? FD_SET : param.flt_d, + FH = FH_SET ? FH_SET : param.flt_h, + FW = FW_SET ? FW_SET : param.flt_w, + FSIZE = FD * FH * FW, + PD = param.pad_d, + PH = param.pad_h, + PW = param.pad_w, + SD = SD_SET ? SD_SET : param.stride_d, + SH = SH_SET ? SH_SET : param.stride_h, + SW = SW_SET ? 
SW_SET : param.stride_w, + OD = param.out_d, + OH = param.out_h, + OW = param.out_w, + TOT_OUT = N * ID * IH * IW; + + block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); + dst_grad += ic * CHL_MUL * OD * OH * OW; + src_grad += ic * ID * IH * IW; + + uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x, + nr_out_per_launch = blockDim.x * gridDim.y; + for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { + uint32_t out_idx = out_idx_, n, id, ih, iw; + out_idx = div_mod(out_idx, IW, iw); + out_idx = div_mod(out_idx, IH, ih); + out_idx = div_mod(out_idx, ID, id); + n = out_idx; + const T *dst_grad_base = dst_grad + n * (IC * CHL_MUL * OD * OH * OW); + + T sum(0); + + uint32_t odmin = max(int32_t(id + PD - FD + SD), 0) / SD, + ohmin = max(int32_t(ih + PH - FH + SH), 0) / SH, + owmin = max(int32_t(iw + PW - FW + SW), 0) / SW, + odmax = min((id + PD) / SD, OD - 1), + ohmax = min((ih + PH) / SH, OH - 1), + owmax = min((iw + PW) / SW, OW - 1); + if (SD_SET == 1 && SH_SET == 1 && SW_SET == 1 && + FD_SET && FH_SET && FW_SET) { +#pragma unroll + for (uint32_t dod = 0; dod < FD; ++ dod) { + uint32_t od = odmin + dod; + if (od <= odmax) { + uint32_t fd = id - od * SD + PD; +#pragma unroll + for (uint32_t doh = 0; doh < FH; ++ doh) { + uint32_t oh = ohmin + doh; + if (oh <= ohmax) { + uint32_t fh = ih - oh * SH + PH; +#pragma unroll + for (uint32_t dow = 0; dow < FW; ++ dow) { + uint32_t ow = owmin + dow; + if (ow <= owmax) { + uint32_t fw = iw - ow * SW + PW; + const T *pd = dst_grad_base + + od * OH * OW + oh * OW + ow; + const T *pf = flt + + fd * FH * FW + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++ chl_mul) { + sum += *pd * *pf; + pd += OD * OH * OW; + pf += FSIZE; + } + } + } + } + } + } + } + } else { + for (uint32_t od = odmin; od <= odmax; ++ od) { + uint32_t fd = id - od * SD + PD; + for (uint32_t oh = ohmin; oh <= ohmax; ++ oh) { + uint32_t fh = ih - oh * SH + PH; + for (uint32_t ow = owmin; ow <= owmax; ++ ow) { + uint32_t fw = iw - ow * SW + PW; + const T *pd = dst_grad_base + + od * OH * OW + oh * OW + ow; + const T *pf = flt + + fd * FH * FW + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; ++ chl_mul) { + sum += *pd * *pf; + pd += OD * OH * OW; + pf += FSIZE; + } + } + } + } + } + src_grad[n * IC * ID * IH * IW + + id * IH * IW + ih * IW + iw] = sum; + } +} + +template +class KernDispatch { + public: + typedef void (*kern_ptr_t)(T*, const T*, const T*, Param); + + static kern_ptr_t dispatch( + int chl_mul, + int fd, int fh, int fw, + int sd, int sh, int sw) { + if (chl_mul == 1) { + if (fd == 2 && fh == 2 && fw == 2) + return d1<1, 2, 2, 2>(sd, sh, sw); + if (fd == 3 && fh == 3 && fw == 3) + return d1<1, 3, 3, 3>(sd, sh, sw); + } + return d1<0, 0, 0, 0>(sd, sh, sw); + } + + private: + template + static kern_ptr_t d1(int sd, int sh, int sw) { + if (sd == 1 && sh == 1 && sw == 1) + return kern_bwd_data; + if (sd == 1 && sh == 1 && sw == 2) + return kern_bwd_data; + if (sd == 1 && sh == 2 && sw == 1) + return kern_bwd_data; + if (sd == 1 && sh == 2 && sw == 2) + return kern_bwd_data; + if (sd == 2 && sh == 1 && sw == 1) + return kern_bwd_data; + if (sd == 2 && sh == 1 && sw == 2) + return kern_bwd_data; + if (sd == 2 && sh == 2 && sw == 1) + return kern_bwd_data; + if (sd == 2 && sh == 2 && sw == 2) + return kern_bwd_data; + return kern_bwd_data; + } +}; + +} // anonymous namespace + +template +void chanwise::run_bwd_data(T *src_grad, const T *dst_grad, const T *flt, + const Param ¶m, 
cudaStream_t stream) { + typename KernDispatch::kern_ptr_t kern = KernDispatch::dispatch( + param.chl_mul, + param.flt_d, param.flt_h, param.flt_w, + param.stride_d, param.stride_h, param.stride_w); + int nr_thread = query_blocksize_for_kernel(kern), + nr_out_dimx = param.src_d * param.src_h * param.src_w * param.batch; + dim3 nr_block( + param.src_chl, + std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); + uint32_t shared = param.chl_mul * param.flt_d * + param.flt_h * param.flt_w * sizeof(T); + kern <<< nr_block, nr_thread, shared, stream >>> ( + src_grad, dst_grad, flt, param); + after_kernel_launch(); +} + +namespace megdnn { +namespace cuda { +namespace convolution3d { +namespace chanwise { + +#define DO_INST(_ct) template void run_bwd_data( \ + _ct*, const _ct*, const _ct*, const Param&, cudaStream_t); +#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST) + +#undef INST +#undef DO_INST + +} // namespace chanwise +} // namespace convolution3d +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen + diff --git a/dnn/src/cuda/convolution3d/chanwise/bwd_filter.cu b/dnn/src/cuda/convolution3d/chanwise/bwd_filter.cu new file mode 100644 index 00000000..94338193 --- /dev/null +++ b/dnn/src/cuda/convolution3d/chanwise/bwd_filter.cu @@ -0,0 +1,201 @@ +/** + * \file dnn/src/cuda/convolution3d/chanwise/bwd_filter.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" +#include "./kern_helper.cuh" + +const uint32_t WARP_SIZE = 32, BATCH_UNROLL = 4; + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; +using namespace chanwise; + +namespace { + +template +__global__ void kern_bwd_filter( + T *flt_grad, const T *src, const T *dst_grad, Param param) { + + const uint32_t + N = param.batch, IC = param.src_chl, + ID = param.src_d, IH = param.src_h, IW = param.src_w, + CHL_MUL = param.chl_mul, + FD = param.flt_d, FH = param.flt_h, FW = param.flt_w, + PD = param.pad_d, PH = param.pad_h, PW = param.pad_w, + SD = param.stride_d, SH = param.stride_h, SW = param.stride_w, + OD = param.out_d, OH = param.out_h, OW = param.out_w, + SRC_BATCH_STRIDE = IC * ID * IH * IW, + DST_BATCH_STRIDE = IC * CHL_MUL * OD * OH * OW, + BLKDIM_X = blockDim.x / nr_thpf, + THREADID_X = threadIdx.x / nr_thpf, + OUT_IDX = blockIdx.x * BLKDIM_X + THREADID_X; + + uint32_t ic, chl_mul, fd, fh, fw; + { + uint32_t i = OUT_IDX; + i = div_mod(i, FW, fw); + i = div_mod(i, FH, fh); + i = div_mod(i, FD, fd); + i = div_mod(i, CHL_MUL, chl_mul); + ic = i; + } + if (ic >= IC) { + return; + } + src += ic * ID * IH * IW; + dst_grad += (ic * CHL_MUL + chl_mul) * OD * OH * OW; + + const uint32_t + od_lo = max(int32_t(PD - fd + SD - 1), 0) / SD, + od_hi = min((ID - 1 + PD - fd) / SD + 1, OD), + oh_lo = max(int32_t(PH - fh + SH - 1), 0) / SH, + oh_hi = min((IH - 1 + PH - fh) / SH + 1, OH), + ow_lo = max(int32_t(PW - fw + SW - 1), 0) / SW, + ow_hi = min((IW - 1 + PW - fw) / SW + 1, OW), + oblk_d = od_hi - od_lo, + oblk_h = oh_hi - oh_lo, + oblk_w = ow_hi - ow_lo, + oblk_tot = oblk_d * oblk_h * oblk_w * ((N + BATCH_UNROLL - 1) / BATCH_UNROLL), + tid = threadIdx.x % nr_thpf; + + if (ID + PD < fd + 1 || od_lo >= od_hi || 
+ IH + PH < fh + 1 || oh_lo >= oh_hi || + IW + PW < fw + 1 || ow_lo >= ow_hi) { + if (!tid) + flt_grad[OUT_IDX] = 0; + return; + } + + T sum(0); + for (uint32_t oblk_idx = tid; oblk_idx < oblk_tot; oblk_idx += nr_thpf) { + uint32_t n, oh, ow, od; + n = div_mod(div_mod(div_mod(oblk_idx, oblk_w, ow), oblk_h, oh), oblk_d, od) * BATCH_UNROLL; + od += od_lo; + oh += oh_lo; + ow += ow_lo; + uint32_t id = od * SD - PD + fd, + ih = oh * SH - PH + fh, + iw = ow * SW - PW + fw, + soff = id * IH * IW + ih * IW + iw + n * SRC_BATCH_STRIDE, + doff = od * OH * OW + oh * OW + ow + n * DST_BATCH_STRIDE; +#pragma unroll + for (uint32_t i = 0; i < BATCH_UNROLL; ++ i) { + if (!i || n + i < N) { + sum += src[soff] * dst_grad[doff]; + } + soff += SRC_BATCH_STRIDE; + doff += DST_BATCH_STRIDE; + } + } + + if (nr_thpf == 1) { + flt_grad[OUT_IDX] = sum; + } else { + // reduce all sums in a block + extern __shared__ uint8_t shared_storage[]; + volatile T *thread_sum = reinterpret_cast(shared_storage); + thread_sum += THREADID_X * nr_thpf; + thread_sum[tid] = sum; +#pragma unroll + for (uint32_t i = nr_thpf / 2; i; i >>= 1) { + bool cond = nr_thpf >= i * 2 && tid < i; + if (i >= WARP_SIZE) { + __syncthreads(); + } + T v0 = thread_sum[tid], + v1 = v0 + thread_sum[tid + i]; + thread_sum[tid] = cond ? v1 : v0; + } + + if (!tid) + flt_grad[OUT_IDX] = thread_sum[0]; + } +} + +} // anonymous namespace + +template +void convolution3d::chanwise::run_bwd_filter( + T *filter_grad, const T *src, const T *dst_grad, + const Param ¶m, cudaStream_t stream) { + void (*kern)(T*, const T*, const T*, Param) = NULL; + uint32_t + nr_thread = query_blocksize_for_kernel(kern_bwd_filter), + nr_thpf = std::min(nr_thread, + std::max( + 1, + param.out_d * param.out_h * param.out_w * param.batch / + (BATCH_UNROLL * 16))); + + // find nearest power-of-2 of nr_thpf + do { +#define CK(_n) \ + if(nr_thpf >= _n) { \ + kern = kern_bwd_filter; \ + nr_thpf = _n; \ + break; \ + } + CK(1<<10); + CK(1<<9); + CK(1<<8); + CK(1<<7); + CK(1<<6); + CK(1<<5); + CK(1<<4); + CK(1<<3); + CK(1<<2); + CK(1<<1); + CK(1<<0); +#undef CK + } while(0); + + megdnn_assert(kern); + nr_thread = query_blocksize_for_kernel(kern); + + uint32_t nr_flt_per_blk = nr_thread / nr_thpf; + while (nr_flt_per_blk * nr_thpf % WARP_SIZE) + -- nr_flt_per_blk; + megdnn_assert(nr_flt_per_blk); + + int nr_block = DIVUP( + param.flt_d * param.flt_h * param.flt_w * + param.src_chl * param.chl_mul, + nr_flt_per_blk); + nr_thread = nr_flt_per_blk * nr_thpf; + uint32_t shared = nr_thread * 2 * sizeof(T); + kern <<< nr_block, nr_thread, shared, stream >>> ( + filter_grad, src, dst_grad, param); + after_kernel_launch(); +} + +namespace megdnn { +namespace cuda { +namespace convolution3d { +namespace chanwise { + +#define DO_INST(_ct) template void run_bwd_filter( \ + _ct*, const _ct*, const _ct*, const Param&, cudaStream_t); +#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST) + +#undef INST +#undef DO_INST + +} // namespace chanwise +} // namespace convolution3d +} // namespace cuda +} // namespace megdnn + + +// vim: syntax=cuda.doxygen + diff --git a/dnn/src/cuda/convolution3d/chanwise/fwd.cu b/dnn/src/cuda/convolution3d/chanwise/fwd.cu new file mode 100644 index 00000000..e3c9d236 --- /dev/null +++ b/dnn/src/cuda/convolution3d/chanwise/fwd.cu @@ -0,0 +1,157 @@ +/** + * \file dnn/src/cuda/convolution3d/chanwise/fwd.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" +#include "./kern_helper.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; +using namespace chanwise; + +namespace { + +template +__global__ void kern_fwd( + T *dst, const T *src, const T *flt_tot, Param param) { + + // extern __shared__ of dt_float16 does not work + extern __shared__ uint8_t flt_storage[]; + + T * const flt = reinterpret_cast(flt_storage); + + const uint32_t + N = param.batch, IC = param.src_chl, ic = blockIdx.x, + ID = param.src_d, IH = param.src_h, IW = param.src_w, + CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, + FD = FD_SET ? FD_SET : param.flt_d, + FH = FH_SET ? FH_SET : param.flt_h, + FW = FW_SET ? FW_SET : param.flt_w, + FSIZE = FD * FH * FW, + PD = param.pad_d, PH = param.pad_h, PW = param.pad_w, + SD = param.stride_d, SH = param.stride_h, SW = param.stride_w, + OD = param.out_d, OH = param.out_h, OW = param.out_w, + TOT_OUT = N * CHL_MUL * OD * OH * OW; + + block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); + + uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x, + nr_out_per_launch = blockDim.x * gridDim.y; + + for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { + uint32_t out_idx = out_idx_, n, chl_mul, od, oh, ow; + out_idx = div_mod(out_idx, OW, ow); + out_idx = div_mod(out_idx, OH, oh); + out_idx = div_mod(out_idx, OD, od); + if (CHL_MUL_SET == 1) { + chl_mul = 0; + n = out_idx; + } else { + n = div_mod(out_idx, CHL_MUL, chl_mul); + } + + int id = int(od * SD) - int(PD), + ih = int(oh * SH) - int(PH), + iw = int(ow * SW) - int(PW); + + const T* flt_base = flt + chl_mul * FSIZE; + const T* src_base = src + int((((n * IC + ic) * ID + id) * IH + ih) * IW + iw); + + T sum(0); + + if (FD_SET && FH_SET && FW_SET) { +#pragma unroll + for (uint32_t fd = 0; fd < FD; ++ fd) { + // fh + ih < 0 would overflow, so we do not need to check it + if (static_cast(fd + id) < ID) { +#pragma unroll + for (uint32_t fh = 0; fh < FH; ++ fh) { + if (static_cast(fh + ih) < IH) { +#pragma unroll + for(uint32_t fw = 0; fw < FW; ++ fw) { + if (static_cast(fw + iw) < IW) { + sum += flt_base[fd * FH * FW + fh * FW + fw] * + src_base[fd * IH * IW + fh * IW + fw]; + } + } + } + } + } + } + } else { + int fdmax = min(int(FD), int(ID - id)), + fhmax = min(int(FH), int(IH - ih)), + fwmax = min(int(FW), int(IW - iw)); + for (int fd = max(0, -id); fd < fdmax; ++ fd) { + for (int fh = max(0, -ih); fh < fhmax; ++ fh) { + for (int fw = max(0, -iw); fw < fwmax; ++ fw) { + sum += flt_base[fd * FH * FW + fh * FW + fw] * + src_base[fd * IH * IW + fh * IW + fw]; + } + } + } + } + dst[((((n * IC + ic) * CHL_MUL + chl_mul) * OD + od) * OH + oh) * OW + ow] = + sum; + } +} + +} // anonymous namespace + +template +void chanwise::run_fwd( + T *dst, const T *src, const T *flt, const Param ¶m, + cudaStream_t stream) { + void (*kern)(T*, const T*, const T*, Param); + if (param.chl_mul == 1) { + if (param.flt_d == 2 && param.flt_h == 2 && param.flt_w == 2) { + kern = kern_fwd; + } else if (param.flt_d == 3 && param.flt_h == 3 && param.flt_w == 3) { + kern = kern_fwd; + } else { + kern = kern_fwd; + } + } else { + kern = kern_fwd; + } + + int nr_thread = query_blocksize_for_kernel(kern), + nr_out_dimx = + param.out_d * param.out_h * param.out_w * param.batch * 
param.chl_mul; + dim3 nr_block( + param.src_chl, + std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); + uint32_t shared = param.chl_mul * param.flt_d * param.flt_h * param.flt_w * sizeof(T); + kern <<< nr_block, nr_thread, shared, stream >>> (dst, src, flt, param); + after_kernel_launch(); +} + +namespace megdnn { +namespace cuda { +namespace convolution3d { +namespace chanwise { + +#define DO_INST(_ct) template void run_fwd( \ + _ct*, const _ct*, const _ct*, const Param&, cudaStream_t); +#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST) + +#undef INST +#undef DO_INST + +} // namespace chanwise +} // namespace convolution3d +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen + diff --git a/dnn/src/cuda/convolution3d/chanwise/kern.cuh b/dnn/src/cuda/convolution3d/chanwise/kern.cuh new file mode 100644 index 00000000..af17650e --- /dev/null +++ b/dnn/src/cuda/convolution3d/chanwise/kern.cuh @@ -0,0 +1,83 @@ +/** + * \file dnn/src/cuda/convolution3d/chanwise/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "src/cuda/utils.cuh" + +#include +#include + +#if MEGDNN_CC_HOST +#include "src/cuda/convolution3d/helper.h" +#endif + +namespace megdnn { +namespace cuda { +namespace convolution3d { +namespace chanwise { + + struct Param { + uint32_t batch, src_chl, + src_d, src_h, src_w, + chl_mul, + flt_d, flt_h, flt_w, + out_d, out_h, out_w, + pad_d, pad_h, pad_w, + stride_d, stride_h, stride_w, + dilation_d, dilation_h, dilation_w; +#if MEGDNN_CC_HOST + static Param from_fwd_args(const ForwardSizeArgs &args) { +#define U(v) static_cast(v) + auto &&src = args.src_layout->shape; + auto &&dst = args.dst_layout->shape; + auto &&fm = args.filter_meta; + size_t c_pos, hw_pos; + if (fm.format == param::Convolution3D::Format::NCDHW) { + c_pos = 1; + hw_pos = 2; + } else { //NDHWC + c_pos = 4; + hw_pos = 1; + } + return { + U(src[0]), U(src[c_pos]), + U(src[hw_pos]), U(src[hw_pos+1]), U(src[hw_pos+2]), + U(fm.ocpg), + U(fm.spatial[0]), U(fm.spatial[1]), U(fm.spatial[2]), + U(dst[hw_pos]), U(dst[hw_pos+1]), U(dst[hw_pos+2]), + U(fm.padding[0]), U(fm.padding[1]), U(fm.padding[2]), + U(fm.stride[0]), U(fm.stride[1]), U(fm.stride[2]), + U(fm.dilation[0]), U(fm.dilation[1]), U(fm.dilation[2]), + }; +#undef U + } +#endif + }; + + template + void run_fwd(T *dst, const T *src, const T *flt, const Param ¶m, + cudaStream_t stream); + + template + void run_bwd_data(T *src_grad, const T *dst_grad, const T *flt, + const Param ¶m, cudaStream_t stream); + + template + void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, + const Param ¶m, cudaStream_t stream); + +} // namespace chanwise +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/chanwise/kern_helper.cuh b/dnn/src/cuda/convolution3d/chanwise/kern_helper.cuh new file mode 100644 index 00000000..759d0475 --- /dev/null +++ b/dnn/src/cuda/convolution3d/chanwise/kern_helper.cuh @@ -0,0 +1,55 @@ +/** + * \file dnn/src/cuda/convolution3d/chanwise/kern_helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the 
"License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "src/cuda/query_blocksize.cuh" +#include "src/cuda/utils.cuh" +#include "megdnn/dtype.h" + +#include +#include +#include + +namespace megdnn { +namespace cuda { +namespace convolution3d { +namespace chanwise { + + /*! + * \brief return a / b and set mod to a % b + */ + __device__ __forceinline__ uint32_t div_mod( + uint32_t a, uint32_t b, uint32_t &mod) { + uint32_t ret = a / b; + mod = a - ret * b; + return ret; + } + + /*! + * \brief copy a 2D matrix by all threads in a block + * \param rs row stride + */ + template + __device__ __forceinline__ void block_memcpy( + T *dst, const T *src, uint32_t size) { + for (uint32_t i = threadIdx.x; i < size; i += blockDim.x) { + dst[i] = src[i]; + } + __syncthreads(); + } + +} // namespace chanwise +} // namespace convolution3d +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen + diff --git a/dnn/src/cuda/convolution3d/forward/1x1x1.cpp b/dnn/src/cuda/convolution3d/forward/1x1x1.cpp new file mode 100644 index 00000000..a72e9fcd --- /dev/null +++ b/dnn/src/cuda/convolution3d/forward/1x1x1.cpp @@ -0,0 +1,76 @@ +/** + * \file dnn/src/cuda/convolution3d/forward/1x1x1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" +#include "src/cuda/handle.h" +#include "src/cuda/utils.cuh" +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +bool Convolution3DForwardImpl::Algo1x1x1::is_available( + const SizeArgs &args) const { + auto &&fm = args.filter_meta; + const size_t MAX_WORKSPACE_SIZE = 2147483648; // 2 * 1024^3 + if (get_workspace_in_bytes(args) > MAX_WORKSPACE_SIZE) { + return false; + } + return fm.format == Param::Format::NCDHW && + (fm.dtype_enum == DTypeEnum::Float32 || + fm.dtype_enum == DTypeEnum::Float16) && + fm.spatial_ndim == 3 && fm.group == 1 && + fm.dilation[0] == 1 && fm.dilation[1] == 1 && + fm.dilation[2] == 1 && + fm.spatial[0] == 1 && fm.spatial[1] == 1 && + fm.spatial[2] == 1 && + fm.padding[0] == 0 && fm.padding[1] == 0 && + fm.padding[2] == 0 && + fm.stride[0] == 1 && fm.stride[1] == 1 && + fm.stride[2] == 1; +} + +void Convolution3DForwardImpl::Algo1x1x1::extract_matmul_layouts( + const SizeArgs &args, + TensorLayout &A, TensorLayout &B, TensorLayout &C) { + auto &&fm = args.filter_meta; + A = {{fm.ocpg, fm.icpg}, DType::from_enum(fm.dtype_enum)}; + B.ndim = 2; + B.shape[0] = args.src_layout->shape[1]; + B.shape[1] = args.src_layout->shape[2] * args.src_layout->shape[3] * args.src_layout->shape[4]; + B.stride[0] = args.src_layout->stride[1]; + B.stride[1] = 1; + B.dtype = args.src_layout->dtype; + C = {{args.dst_layout->shape[1], B.shape[1]}, args.dst_layout->dtype}; +} +size_t Convolution3DForwardImpl::Algo1x1x1::get_workspace_in_bytes( + const SizeArgs &args) const { + TensorLayout A, B, C; + extract_matmul_layouts(args, A, B, C); + return args.handle->matmul_opr()->get_workspace_in_bytes(A, B, C); +} +void Convolution3DForwardImpl::Algo1x1x1::exec(const ExecArgs &args) const { + TensorND A, B, C; + extract_matmul_layouts(args, A.layout, B.layout, C.layout); + A.raw_ptr = args.filter_tensor->raw_ptr; + B.raw_ptr = args.src_tensor->raw_ptr; + C.raw_ptr = args.dst_tensor->raw_ptr; + size_t batch = args.src_layout->shape[0]; + auto mm = args.handle->matmul_opr(); + auto strd_B = args.src_layout->stride[0] * args.src_layout->dtype.size(), + strd_C = args.dst_layout->stride[0] * args.dst_layout->dtype.size(); + for (size_t i = 0; i < batch; ++ i) { + mm->exec(A, B, C, args.workspace); + incr_voidp(B.raw_ptr, strd_B); + incr_voidp(C.raw_ptr, strd_C); + } +} +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/forward/algo.cpp b/dnn/src/cuda/convolution3d/forward/algo.cpp new file mode 100644 index 00000000..231c0f33 --- /dev/null +++ b/dnn/src/cuda/convolution3d/forward/algo.cpp @@ -0,0 +1,112 @@ +/** + * \file dnn/src/cuda/convolution3d/forward/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
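Algo1x1x1 above rewrites a 1x1x1 convolution as one matrix multiply per batch: A is the OC x IC filter, B is the IC x (D*H*W) view of the input, and C is the OC x (D*H*W) output. A naive single-batch reference of that product, assuming densely packed row-major buffers (a sketch, not the matmul operator the algorithm actually dispatches to):

#include <cstddef>

// C[oc][p] = sum_ic A[oc][ic] * B[ic][p], the product Algo1x1x1 hands to the matmul opr.
void matmul_1x1x1_reference(const float* A, const float* B, float* C,
                            size_t OC, size_t IC, size_t P /* = D*H*W */) {
    for (size_t oc = 0; oc < OC; ++oc) {
        for (size_t p = 0; p < P; ++p) {
            float acc = 0.f;
            for (size_t ic = 0; ic < IC; ++ic) {
                acc += A[oc * IC + ic] * B[ic * P + p];
            }
            C[oc * P + p] = acc;
        }
    }
}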
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +Convolution3DForwardImpl::AlgoPack::AlgoPack() { + non_cudnn_algos.push_back(&chanwise); + non_cudnn_algos.push_back(&inplace_matmul); + non_cudnn_algos.push_back(&a1x1x1); + + all_algos.push_back(&chanwise); + + fill_cudnn_algos(); + for (auto &&i: cudnn) { + all_algos.push_back(&i); + } + all_algos.push_back(&inplace_matmul); + all_algos.push_back(&a1x1x1); + all_algos.reserve(all_algos.size() * 2); + + // add gconv algos by AlgoGroupConvGeneral + auto all_algos_data = all_algos.data(); + for (size_t i = 1; i < all_algos.size(); ++ i) { + gconv.push_back({all_algos[i]}); + } + for (size_t i = 1; i < all_algos.size(); ++ i) { + algo2gconv[all_algos[i]] = &gconv[i - 1]; + } + for (auto &&i: gconv) { + all_algos.push_back(&i); + } + megdnn_assert(all_algos_data == all_algos.data()); + non_cudnn_algos.push_back(all_algos.rbegin()[1]); // group inplace_matmul + non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group 1x1x1 +} + +Convolution3DForwardImpl::AlgoCUDNN* +Convolution3DForwardImpl::AlgoPack::cudnn_from_enum( + cudnnConvolutionFwdAlgo_t algo) { + for (auto &&i: cudnn) { + if (i.cudnn_enum() == algo) + return &i; + } + megdnn_throw(megdnn_mangle(ssprintf("can not find cudnn fwd algorithm %d", + static_cast(algo)))); +} + +Convolution3DForwardImpl::AlgoPack Convolution3DForwardImpl::sm_algo_pack; + +Convolution3DForwardImpl::AlgoBase::SizeArgs::SizeArgs( + Convolution3DForwardImpl *o, + const TensorLayout &src, const TensorLayout &filter, + const TensorLayout &dst): + SizeArgs(o, src, o->check_layout_fwd(src, filter, dst), dst) +{ +} + +Convolution3DForwardImpl::AlgoBase::SizeArgs::SizeArgs( + Convolution3DForwardImpl *o, + const TensorLayout &src, const CanonizedFilterMeta &filter, + const TensorLayout &dst): + ForwardSizeArgs{ + concrete_handle(o->handle()), + &src, filter, &dst, + o->param().data_type + }, + opr{o} +{ +} + +Convolution3DForwardImpl::AlgoBase::ExecArgs::ExecArgs( + Convolution3DForwardImpl *opr, + _megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace): + SizeArgs(opr, src.layout, filter.layout, dst.layout), + src_tensor{&src}, filter_tensor{&filter}, dst_tensor{&dst}, + workspace{workspace} +{ +} + +std::string Convolution3DForwardImpl::AlgoBase::SizeArgs::to_string() const { + auto &&fm = filter_meta; + MEGDNN_MARK_USED_VAR(fm); + return megdnn_mangle(ssprintf( + "src=%s, filter=%u{%u,%u,%u,%u,%u}, dst=%s, " + "pad=%ux%ux%u, stride=%ux%ux%u, dilate=%ux%ux%u, xcorr=%d, dtype=%s,%s", + src_layout->to_string().c_str(), + fm.group, fm.ocpg, fm.icpg, + fm.spatial[0], fm.spatial[1], fm.spatial[2], + dst_layout->to_string().c_str(), + fm.padding[0], fm.padding[1], fm.padding[2], + fm.stride[0], fm.stride[1], fm.stride[2], + fm.dilation[0], fm.dilation[1], fm.dilation[2], + !fm.should_flip, + src_layout->dtype.name(), dst_layout->dtype.name())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/forward/algo.h b/dnn/src/cuda/convolution3d/forward/algo.h new file mode 100644 index 00000000..46974d68 --- /dev/null +++ b/dnn/src/cuda/convolution3d/forward/algo.h @@ -0,0 +1,222 @@ +/** + * \file dnn/src/cuda/convolution3d/forward/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/oprs.h" + +#include "src/cuda/convolution3d/helper.h" +#include "src/cuda/handle.h" +#include "src/cuda/convolution3d/opr_impl.h" +#include "src/common/utils.h" + +#include + +namespace megdnn { +namespace cuda { + +/*! + * \brief base class for convolution3d algos + * + * All the algo impls should try to support non-contiguous batch dim, for group + * conv execution. + */ +class Convolution3DForwardImpl::AlgoBase: public Algorithm { + protected: + ~AlgoBase() = default; + + public: + struct SizeArgs: public convolution3d::ForwardSizeArgs { + Convolution3DForwardImpl *opr; + + std::string to_string() const; + void init_desc(convolution3d::CUDNNForwardDescs &desc) const { + desc.set(*src_layout, filter_meta, *dst_layout, opr->param()); + } + SizeArgs(Convolution3DForwardImpl *opr, + const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst); + SizeArgs(Convolution3DForwardImpl *opr, + const TensorLayout &src, + const CanonizedFilterMeta &filter, + const TensorLayout &dst); + }; + struct ExecArgs: public SizeArgs { + const TensorND *src_tensor, *filter_tensor, *dst_tensor; + Workspace workspace; + + ExecArgs(Convolution3DForwardImpl *opr, + _megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs &args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs &args) const = 0; + virtual void exec(const ExecArgs &args) const = 0; + + bool is_available_wk(const SizeArgs &args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "conv3d fwd algo %s: required workspace %zu bytes, got %zu", + name(), req, workspace.size); + return *this; + } + + virtual bool is_cudnn() const { + return false; + } +}; +class Convolution3DForwardImpl::Algo1x1x1 final: public AlgoBase { + static void extract_matmul_layouts(const SizeArgs &args, + TensorLayout &A, TensorLayout &B, TensorLayout &C); + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "1x1x1"; + } + bool is_reproducible() const override { + return true; + } +}; + +//! 
implement group conv by another algo +class Convolution3DForwardImpl::AlgoGroupConvGeneral final: public AlgoBase { + AlgoBase *m_impl; + std::string m_name; + + public: + AlgoGroupConvGeneral(AlgoBase *impl); + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return m_name.c_str(); + } + + bool is_reproducible() const override { + return m_impl->is_reproducible(); + } + + static void modify_size_args(SizeArgs &args, + TensorLayout &src_pg, TensorLayout &dst_pg); +}; + +class Convolution3DForwardImpl::AlgoCUDNN final : public AlgoBase { + bool m_is_reproducible; + const char *m_name; + cudnnConvolutionFwdAlgo_t m_cudnn_enum; + + public: + + AlgoCUDNN(bool is_reproducible, const char *name, + cudnnConvolutionFwdAlgo_t cudnn_enum): + m_is_reproducible(is_reproducible), + m_name(name), + m_cudnn_enum(cudnn_enum) + {} + + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + bool is_reproducible() const override { + return m_is_reproducible; + } + + const char* name() const override { + return m_name; + } + + cudnnConvolutionFwdAlgo_t cudnn_enum() const { + return m_cudnn_enum; + } + + bool is_cudnn() const override { + return true; + } +}; + +class Convolution3DForwardImpl::AlgoInplaceMatmul final: public AlgoBase { + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "INPLACE_MATMUL"; + } + bool is_reproducible() const override { + return true; + } +}; + + +class Convolution3DForwardImpl::AlgoChanwise final: public AlgoBase { + public: + bool is_available(const SizeArgs &args) const override; + size_t get_workspace_in_bytes(const SizeArgs &args) const override; + void exec(const ExecArgs &args) const override; + + const char* name() const override { + return "CHANNEL_WISE"; + } + bool is_reproducible() const override { + return true; + } +}; + +class Convolution3DForwardImpl::AlgoPack { + // defined in cudnn.cpp + void fill_cudnn_algos(); + + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator = (const AlgoPack &) = delete; + + public: + AlgoPack(); + + std::vector cudnn; + Algo1x1x1 a1x1x1; + AlgoInplaceMatmul inplace_matmul; + AlgoChanwise chanwise; + std::vector gconv; + std::unordered_map algo2gconv; + + std::vector + //! all algorithms + all_algos, + //! non-cudnn algos, used for heuristic if cudnn is not supported + non_cudnn_algos; + + AlgoCUDNN* cudnn_from_enum(cudnnConvolutionFwdAlgo_t algo); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/forward/chanwise.cpp b/dnn/src/cuda/convolution3d/forward/chanwise.cpp new file mode 100644 index 00000000..88a4a70c --- /dev/null +++ b/dnn/src/cuda/convolution3d/forward/chanwise.cpp @@ -0,0 +1,57 @@ +/** + * \file dnn/src/cuda/convolution3d/forward/chanwise.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
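The AlgoBase interface above is consumed by a simple selection rule: an algorithm is usable when it reports availability and its workspace fits the caller's limit, and reproducible selection additionally requires is_reproducible(). A stripped-down sketch of that policy; the Algo struct here is a stand-in, not the megdnn class:

#include <cstddef>
#include <vector>

struct Algo {                       // stand-in for Convolution3DForwardImpl::AlgoBase
    bool available;                 // result of is_available(args)
    size_t workspace;               // result of get_workspace_in_bytes(args)
    bool reproducible;              // result of is_reproducible()
};

// Mirrors is_available_reproducible(): first candidate that is available,
// fits the workspace limit, and (if required) is reproducible.
const Algo* pick_algo(const std::vector<Algo>& candidates, size_t workspace_limit,
                      bool require_reproducible) {
    for (const Algo& a : candidates) {
        if (!a.available || a.workspace > workspace_limit)
            continue;
        if (require_reproducible && !a.reproducible)
            continue;
        return &a;
    }
    return nullptr;  // caller falls back or reports that no algorithm is usable
}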
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution3d/chanwise/kern.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +bool Convolution3DForwardImpl::AlgoChanwise::is_available( + const SizeArgs &args) const { + auto &&fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCDHW && + args.src_layout->dtype.category() == DTypeCategory::FLOAT && + fm.spatial_ndim == 3 && fm.icpg == 1 && + fm.dilation[0] == 1 && fm.dilation[1] == 1 && + fm.dilation[2] == 1 && !fm.should_flip; +} + +size_t Convolution3DForwardImpl::AlgoChanwise::get_workspace_in_bytes( + const SizeArgs &) const { + return 0; +} + +void Convolution3DForwardImpl::AlgoChanwise::exec(const ExecArgs &args) const { + auto kparam = chanwise::Param::from_fwd_args(args); + auto stream = cuda_stream(args.handle); + switch (args.src_layout->dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + { \ + using ctype = DTypeTrait<_dt>::ctype; \ + return chanwise::run_fwd( \ + args.dst_tensor->ptr(), \ + args.src_tensor->ptr(), \ + args.filter_tensor->ptr(), \ + kparam, stream); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + default: + break; + } + megdnn_assert_internal(0); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/forward/cudnn.cpp b/dnn/src/cuda/convolution3d/forward/cudnn.cpp new file mode 100644 index 00000000..178d373a --- /dev/null +++ b/dnn/src/cuda/convolution3d/forward/cudnn.cpp @@ -0,0 +1,107 @@ +/** + * \file dnn/src/cuda/convolution3d/forward/cudnn.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
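AlgoChanwise above only accepts filters with icpg == 1, so each input channel is convolved with its own chl_mul filters. The output spatial extent follows the usual convolution rule; a small standalone helper stating it explicitly (not part of the diff):

#include <cstddef>

// Standard convolution output size along one axis:
// effective filter = (F - 1) * dilation + 1, then slide with the given stride.
inline size_t conv_out_size(size_t in, size_t filter, size_t pad,
                            size_t stride, size_t dilation) {
    size_t eff = (filter - 1) * dilation + 1;
    return (in + 2 * pad - eff) / stride + 1;
}

// For the channel-wise path above (dilation fixed to 1), e.g.:
//   OD = conv_out_size(ID, FD, PD, SD, 1);
//   OH = conv_out_size(IH, FH, PH, SH, 1);
//   OW = conv_out_size(IW, FW, PW, SW, 1);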
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/convolution3d/helper.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +bool Convolution3DForwardImpl::AlgoCUDNN::is_available( + const SizeArgs &args) const { + CUDNNForwardDescs D; + + if (!is_cudnn_supported(args)) + return false; + + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionForwardWorkspaceSize( + args.handle->cudnn_handle(), + D.src_desc.desc, + D.filter_desc.desc, + D.conv_desc.desc, + D.dst_desc.desc, + m_cudnn_enum, + &workspace_size); + return status == CUDNN_STATUS_SUCCESS; +} + +size_t Convolution3DForwardImpl::AlgoCUDNN::get_workspace_in_bytes( + const SizeArgs &args) const { + CUDNNForwardDescs D; + args.init_desc(D); + size_t workspace_size; + auto status = cudnnGetConvolutionForwardWorkspaceSize( + args.handle->cudnn_handle(), + D.src_desc.desc, + D.filter_desc.desc, + D.conv_desc.desc, + D.dst_desc.desc, + m_cudnn_enum, + &workspace_size); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv fwd get workspace failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); + return workspace_size; +} + +void Convolution3DForwardImpl::AlgoCUDNN::exec( + const ExecArgs &args) const { + CUDNNForwardDescs D; + args.init_desc(D); + float alpha = 1.0f, beta = 0.0f; + auto status = cudnnConvolutionForward(args.handle->cudnn_handle(), + &alpha, + D.src_desc.desc, args.src_tensor->raw_ptr, + D.filter_desc.desc, args.filter_tensor->raw_ptr, + D.conv_desc.desc, + m_cudnn_enum, + args.workspace.raw_ptr, + args.workspace.size, + &beta, + D.dst_desc.desc, + args.dst_tensor->raw_ptr); + megdnn_assert(status == CUDNN_STATUS_SUCCESS, + "conv fwd failed: %s; info: %s", + cudnnGetErrorString(status), args.to_string().c_str()); +} + + +void Convolution3DForwardImpl::AlgoPack::fill_cudnn_algos() { +#define V1(v) #v +#define V(v) V1(v) + +#define DEF_ALGO(NAME, REPROD) \ + cudnn.push_back({ \ + REPROD, #NAME \ + "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) \ + "." V(CUDNN_PATCHLEVEL), \ + NAME}) + +DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true); +DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true); +DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true); + +#if !(CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1) +#pragma message "not latest cudnn" +#endif + +#undef DEF_ALGO + +#undef V +#undef V1 +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/forward/group_conv.cpp b/dnn/src/cuda/convolution3d/forward/group_conv.cpp new file mode 100644 index 00000000..3eb2bf16 --- /dev/null +++ b/dnn/src/cuda/convolution3d/forward/group_conv.cpp @@ -0,0 +1,98 @@ +/** + * \file dnn/src/cuda/convolution3d/forward/group_conv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
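fill_cudnn_algos() above builds algorithm names such as "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMMv7.6.5" (assuming a cuDNN 7.6.5 build) by stringizing CUDNN_MAJOR/MINOR/PATCHLEVEL. The two-level macro is what forces the arguments to expand before '#' stringizes them; a tiny standalone illustration, with made-up VERSION_* placeholders instead of the real cuDNN macros:

#include <cstdio>

#define VERSION_MAJOR 7
#define VERSION_MINOR 6
#define VERSION_PATCH 5

#define V1(v) #v        // stringize the already-expanded token
#define V(v) V1(v)      // extra level so VERSION_MAJOR expands before stringizing

int main() {
    // Adjacent string literals concatenate, as in the DEF_ALGO macro above.
    const char* name =
            "MY_ALGO" "v" V(VERSION_MAJOR) "." V(VERSION_MINOR) "." V(VERSION_PATCH);
    std::puts(name);    // prints: MY_ALGOv7.6.5
    return 0;
}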
+ */ + +#include "./algo.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +void Convolution3DForwardImpl::AlgoGroupConvGeneral::modify_size_args( + Convolution3DForwardImpl::AlgoBase::SizeArgs &args, + TensorLayout &src_pg, TensorLayout &dst_pg) { + src_pg = *args.src_layout; + dst_pg = *args.dst_layout; + auto nr_grp = args.filter_meta.group; + args.filter_meta.group = 1; + size_t c_pos; + if (args.filter_meta.format == Param::Format::NCDHW) { + c_pos = 1; + } else { + megdnn_assert(args.filter_meta.format == Param::Format::NDHWC, + "invalid conv format"); + c_pos = 4; + } + src_pg.shape[c_pos] /= nr_grp; + dst_pg.shape[c_pos] /= nr_grp; + args.src_layout = &src_pg; + args.dst_layout = &dst_pg; +} + +Convolution3DForwardImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral( + AlgoBase *impl): + m_impl{impl} { + m_name = "group_conv3d:"; + m_name += impl->name(); +} + +bool Convolution3DForwardImpl::AlgoGroupConvGeneral::is_available( + const SizeArgs &args) const { + auto sub_args = args; + TensorLayout src_pg, dst_pg; + modify_size_args(sub_args, src_pg, dst_pg); + return m_impl->is_available(sub_args); +} + +size_t Convolution3DForwardImpl::AlgoGroupConvGeneral::get_workspace_in_bytes( + const SizeArgs &args) const { + auto sub_args = args; + TensorLayout src_pg, dst_pg; + modify_size_args(sub_args, src_pg, dst_pg); + return m_impl->get_workspace_in_bytes(sub_args); +} + +void Convolution3DForwardImpl::AlgoGroupConvGeneral::exec( + const ExecArgs &args) const { + auto sub_args = args; + TensorND tsrc{*args.src_tensor}, tdst{*args.dst_tensor}, + tflt{*args.filter_tensor}; + modify_size_args(sub_args, tsrc.layout, tdst.layout); + sub_args.src_tensor = &tsrc; + sub_args.dst_tensor = &tdst; + sub_args.filter_tensor = &tflt; + + size_t c_pos; + if (args.filter_meta.format == Param::Format::NCDHW) { + c_pos = 1; + } else { + megdnn_assert(args.filter_meta.format == Param::Format::NDHWC, + "invalid conv format"); + c_pos = 4; + } + + auto grp = args.filter_meta.group; + + auto &&fm = args.filter_meta; + auto strd_src = tsrc.layout.stride[c_pos] * fm.icpg * tsrc.layout.dtype.size(), + strd_dst = tdst.layout.stride[c_pos] * fm.ocpg * tdst.layout.dtype.size(), + strd_flt = fm.icpg * fm.ocpg * + fm.spatial[0] * fm.spatial[1] * fm.spatial[2] * + tflt.layout.dtype.size(); + for (uint32_t g = 0; g < grp; ++ g) { + m_impl->exec(sub_args); + incr_voidp(tsrc.raw_ptr, strd_src); + incr_voidp(tdst.raw_ptr, strd_dst); + incr_voidp(tflt.raw_ptr, strd_flt); + } +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/forward/inplace_matmul.cpp b/dnn/src/cuda/convolution3d/forward/inplace_matmul.cpp new file mode 100644 index 00000000..b19afbf6 --- /dev/null +++ b/dnn/src/cuda/convolution3d/forward/inplace_matmul.cpp @@ -0,0 +1,65 @@ +/** + * \file dnn/src/cuda/convolution3d/forward/inplace_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
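AlgoGroupConvGeneral::exec above runs the wrapped single-group algorithm once per group, bumping the src/dst/filter raw pointers by a per-group byte stride on every iteration. The pointer arithmetic it relies on, shown in isolation; incr_bytes is a local stand-in for the incr_voidp helper used in the diff:

#include <cstddef>
#include <cstdint>

// Advance a type-erased tensor pointer by a byte offset (stand-in for incr_voidp).
inline void incr_bytes(void*& ptr, ptrdiff_t bytes) {
    ptr = static_cast<uint8_t*>(ptr) + bytes;
}

// Per-group execution skeleton: each group sees a shifted view of the same buffers.
void run_grouped(void* src, void* flt, void* dst, size_t groups,
                 ptrdiff_t src_stride_bytes, ptrdiff_t flt_stride_bytes,
                 ptrdiff_t dst_stride_bytes) {
    for (size_t g = 0; g < groups; ++g) {
        // exec_single_group(src, flt, dst);   // whatever the wrapped algorithm does
        incr_bytes(src, src_stride_bytes);
        incr_bytes(flt, flt_stride_bytes);
        incr_bytes(dst, dst_stride_bytes);
    }
}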
+ */ + +#include "./algo.h" +#include "./inplace_matmul_impl.cuh" + +using namespace megdnn; +using namespace cuda; + +bool Convolution3DForwardImpl::AlgoInplaceMatmul::is_available( + const SizeArgs &args) const { + auto &&fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCDHW && + args.src_layout->dtype == dtype::Float32() && + fm.group == 1 && fm.spatial_ndim == 3; +} + +size_t Convolution3DForwardImpl::AlgoInplaceMatmul::get_workspace_in_bytes( + const SizeArgs &) const { + return 0; +} + +void Convolution3DForwardImpl::AlgoInplaceMatmul::exec( + const ExecArgs &args) const { + auto &&fm = args.filter_meta; + size_t N = args.src_layout->shape[0], + IC = fm.icpg, + ID = args.src_layout->shape[2], + IH = args.src_layout->shape[3], + IW = args.src_layout->shape[4], + OC = fm.ocpg, + OD = args.dst_layout->shape[2], + OH = args.dst_layout->shape[3], + OW = args.dst_layout->shape[4], + FD = fm.spatial[0], + FH = fm.spatial[1], + FW = fm.spatial[2], + DD = fm.dilation[0], + DH = fm.dilation[1], + DW = fm.dilation[2]; + auto stream = args.handle->stream(); + convolution3d::exec_inplace_matmul_fwd( + args.src_tensor->ptr(), + args.filter_tensor->ptr(), + args.dst_tensor->ptr(), + N, args.src_layout->stride[0], args.dst_layout->stride[0], + IC, ID, IH, IW, + OC, OD, OH, OW, + FD, FH, FW, + fm.padding[0], fm.padding[1], fm.padding[2], + fm.stride[0], fm.stride[1], fm.stride[2], + DD, DH, DW, + !fm.should_flip, stream); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cu b/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cu new file mode 100644 index 00000000..37a1b51d --- /dev/null +++ b/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cu @@ -0,0 +1,395 @@ +/** + * \file dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
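The inplace-matmul forward treats the convolution as an implicit GEMM: the filter is viewed as an OC x (IC*FD*FH*FW) matrix and each batch's output as an OC x (OD*OH*OW) matrix, which is how heightA, widthA and widthB are derived in the kernel below. Stated as a small helper for reference (illustrative only):

#include <cstddef>

struct ImplicitGemmShape {
    size_t m;  // rows of A and C:       OC
    size_t k;  // cols of A / rows of B: IC * FD * FH * FW
    size_t n;  // cols of B and C:       OD * OH * OW
};

inline ImplicitGemmShape implicit_gemm_shape(size_t OC, size_t IC,
                                             size_t FD, size_t FH, size_t FW,
                                             size_t OD, size_t OH, size_t OW) {
    return {OC, IC * FD * FH * FW, OD * OH * OW};
}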
+ */ +#include "./inplace_matmul_impl.cuh" +#include "src/cuda/utils.cuh" +#include +#include +using namespace megdnn; +using namespace cuda; + +namespace { + +struct BufferFetcherTexture { + cudaTextureObject_t tex; + + __device__ __forceinline__ float get(uint32_t offset) { + return tex1Dfetch(tex, offset); + } +}; + +struct BufferFetcherRaw { + const float *ptr; + + __device__ __forceinline__ float get(uint32_t offset) { + return ptr[offset]; + } +}; + +struct BufferFetcherTextureHost { + bool init_succ; + BufferFetcherTexture val; + + BufferFetcherTextureHost(float *p, const size_t n); + + ~BufferFetcherTextureHost() { + reset(); + } + + void reset() { + if (init_succ) { + cuda_check(cudaDestroyTextureObject(val.tex)); + init_succ = false; + } + } +}; + +BufferFetcherTextureHost::BufferFetcherTextureHost(float *p, const size_t n) { + init_succ = false; + cudaTextureObject_t tex_obj; + + cudaResourceDesc res_desc; + memset(&res_desc, 0, sizeof(cudaResourceDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = static_cast(p); + res_desc.res.linear.sizeInBytes = n*sizeof(float); + res_desc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaTextureDesc tex_desc; + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + if (cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL) == cudaSuccess) { + val.tex = tex_obj; + init_succ = true; + } else { + cudaGetLastError(); // reset error + } +} + +template +struct KernelPtr { + typedef void(*type)(BufferFetcher, BufferFetcher, float*, + uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); +}; + +//! 1 -> 0xffffffff, 0 -> 0x00000000 +__device__ __forceinline__ uint32_t bool_as_mask(uint32_t cond) { + return (!cond) - 1u; +} + +union FloatAndU32 { + float f; + uint32_t u; +}; + +//! \p mask must be either all 1 or 0 bits +template +__device__ __forceinline__ float visit_with_mask( + BufferFetcher buf, uint32_t offset, uint32_t mask) { + FloatAndU32 f; + f.f = buf.get(offset & mask); + f.u &= mask; + return f.f; +} + +template +__global__ void conv_kernel(BufferFetcher src, BufferFetcher filter, + float *dst, + const uint32_t INP_BS, const uint32_t OUT_BS, + const uint32_t IC, const uint32_t ID, const uint32_t IH, const uint32_t IW, + const uint32_t OC, const uint32_t OD, const uint32_t OH, const uint32_t OW, + const uint32_t FD, const uint32_t FH, const uint32_t FW, + const uint32_t SD, const uint32_t SH, const uint32_t SW, + const uint32_t PD, const uint32_t PH, const uint32_t PW, + const uint32_t DD, const uint32_t DH, const uint32_t DW) +{ + const uint32_t BM = BY < BX ? 
BY : BX; + // BY*BX == 256 + // (OC) * (IC*FD*FH*FW) * (OD*OH*OW) + const uint32_t n = blockIdx.z; + const uint32_t tidx = threadIdx.x; + const uint32_t tidy = threadIdx.y; + const uint32_t posx = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t posy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t posx2 = posx<<2; + const uint32_t posy2 = posy<<2; + const uint32_t heightA = OC; + const uint32_t widthA = IC*FD*FH*FW; + const uint32_t heightB = widthA; + const uint32_t widthB = OD*OH*OW; + const uint32_t od0 = (posx2+0) / OW / OH * SD; + const uint32_t oh0 = (posx2+0) / OW % OH * SH; + const uint32_t ow0 = (posx2+0) % OW * SW; + const uint32_t op0 = od0 * IH * IW + oh0 * IW + ow0; + + const uint32_t od1 = (posx2+1) / OW / OH * SD; + const uint32_t oh1 = (posx2+1) / OW % OH * SH; + const uint32_t ow1 = (posx2+1) % OW * SW; + const uint32_t op1 = od1 * IH * IW + oh1 * IW + ow1; + + const uint32_t od2 = (posx2+2) / OW / OH * SD; + const uint32_t oh2 = (posx2+2) / OW % OH * SH; + const uint32_t ow2 = (posx2+2) % OW * SW; + const uint32_t op2 = od2 * IH * IW + oh2 * IW + ow2; + + const uint32_t od3 = (posx2+3) / OW / OH * SD; + const uint32_t oh3 = (posx2+3) / OW % OH * SH; + const uint32_t ow3 = (posx2+3) % OW * SW; + const uint32_t op3 = od3 * IH * IW + oh3 * IW + ow3; + const uint32_t FP = FD*FH*FW; + // OC % (BLOCK*4) == 0 + // IC*FD*FH*FW % BLOCK == 0 + // OD*OH*OW % (BLOCK*4) == 0 + __shared__ float4 localA[BY][BM]; + __shared__ float4 localB[BM][BX]; + uint32_t i = 0u; + uint32_t offsetA = posy2 * widthA + tidx; + uint32_t offsetB = n*INP_BS - PD*IH*IW - PH*IW - PW; + float4 sum0 = {0.0f, 0.0f, 0.0f, 0.0f}, + sum1 = {0.0f, 0.0f, 0.0f, 0.0f}, + sum2 = {0.0f, 0.0f, 0.0f, 0.0f}, + sum3 = {0.0f, 0.0f, 0.0f, 0.0f}; + uint32_t fd = tidy / FW / FH % FD; + uint32_t fh = tidy / FW % FH; + uint32_t fw = tidy % FW; + uint32_t ic = tidy / (FD*FH*FW); + uint32_t icm = tidy % (FD*FH*FW); + + const uint32_t fds = BM / FW / FH % FD; + const uint32_t fhs = BM / FW % FH; + const uint32_t fws = BM % FW; + const uint32_t ics = BM / (FD*FH*FW); + const uint32_t icms = BM % (FD*FH*FW); + + for (; i < widthA; i += BM, offsetA += BM) { + // load localA + if (tidx < BM) { + localA[tidy][tidx].x = filter.get(offsetA + 0*widthA); + localA[tidy][tidx].y = filter.get(offsetA + 1*widthA); + localA[tidy][tidx].z = filter.get(offsetA + 2*widthA); + localA[tidy][tidx].w = filter.get(offsetA + 3*widthA); + } + + // load localB + uint32_t fd2, fh2, fw2; + if (is_xcorr) { + fd2 = fd; + fh2 = fh; + fw2 = fw; + } else { + fd2 = FD-fd-1; + fh2 = FH-fh-1; + fw2 = FW-fw-1; + } + + if (tidy < BM) { + uint32_t fd2d = fd2 * DD, + fh2d = fh2 * DH, + fw2d = fw2 * DW; + uint32_t tmp = offsetB+ic*ID*IH*IW+fd2d*IH*IW+fh2d*IW+fw2d, + ok = bool_as_mask(tidy+i < heightB), + p0 = bool_as_mask( + fd2d+od0 >= PD && fd2d+od0 < ID+PD && + fh2d+oh0 >= PH && fh2d+oh0 < IH+PH && + fw2d+ow0 >= PW && fw2d+ow0 < IW+PW), + p1 = bool_as_mask( + fd2d+od1 >= PD && fd2d+od1 < ID+PD && + fh2d+oh1 >= PH && fh2d+oh1 < IH+PH && + fw2d+ow1 >= PW && fw2d+ow1 < IW+PW), + p2 = bool_as_mask( + fd2d+od2 >= PD && fd2d+od2 < ID+PD && + fh2d+oh2 >= PH && fh2d+oh2 < IH+PH && + fw2d+ow2 >= PW && fw2d+ow2 < IW+PW), + p3 = bool_as_mask( + fd2d+od3 >= PD && fd2d+od3 < ID+PD && + fh2d+oh3 >= PH && fh2d+oh3 < IH+PH && + fw2d+ow3 >= PW && fw2d+ow3 < IW+PW); + localB[tidy][tidx].x = visit_with_mask(src, tmp+op0, ok & p0); + localB[tidy][tidx].y = visit_with_mask(src, tmp+op1, ok & p1); + localB[tidy][tidx].z = visit_with_mask(src, tmp+op2, ok & p2); + 
localB[tidy][tidx].w = visit_with_mask(src, tmp+op3, ok & p3); + } + __syncthreads(); // die without this sync().. + for (uint32_t j = 0u; j < BM; ++j) { + float4 tmpA = localA[tidy][j]; + float4 tmpB = localB[j][tidx]; + sum0.x += tmpA.x * tmpB.x; + sum0.y += tmpA.x * tmpB.y; + sum0.z += tmpA.x * tmpB.z; + sum0.w += tmpA.x * tmpB.w; + sum1.x += tmpA.y * tmpB.x; + sum1.y += tmpA.y * tmpB.y; + sum1.z += tmpA.y * tmpB.z; + sum1.w += tmpA.y * tmpB.w; + sum2.x += tmpA.z * tmpB.x; + sum2.y += tmpA.z * tmpB.y; + sum2.z += tmpA.z * tmpB.z; + sum2.w += tmpA.z * tmpB.w; + sum3.x += tmpA.w * tmpB.x; + sum3.y += tmpA.w * tmpB.y; + sum3.z += tmpA.w * tmpB.z; + sum3.w += tmpA.w * tmpB.w; + } + fd += fds; + fw += fws; + fh += fhs; + + fh += (fw >= FW); + fw -= (fw >= FW) * FW; + fd += (fh >= FH); + fh -= (fh >= FH) * FH; + fd -= (fd >= FD) * FD; + + ic += ics; + icm += icms; + ic += (icm >= FP); + icm -= (icm >= FP) * FP; + + __syncthreads(); + } + const uint32_t dst_idx = n*OUT_BS + posy2*widthB + posx2; + bool y0 = (posy2+0 < heightA); + bool y1 = (posy2+1 < heightA); + bool y2 = (posy2+2 < heightA); + bool y3 = (posy2+3 < heightA); + bool x0 = (posx2+0 < widthB); + bool x1 = (posx2+1 < widthB); + bool x2 = (posx2+2 < widthB); + bool x3 = (posx2+3 < widthB); + if (y0) { + if (x0) dst[dst_idx + 0*widthB + 0] = sum0.x; + if (x1) dst[dst_idx + 0*widthB + 1] = sum0.y; + if (x2) dst[dst_idx + 0*widthB + 2] = sum0.z; + if (x3) dst[dst_idx + 0*widthB + 3] = sum0.w; + } + if (y1) { + if (x0) dst[dst_idx + 1*widthB + 0] = sum1.x; + if (x1) dst[dst_idx + 1*widthB + 1] = sum1.y; + if (x2) dst[dst_idx + 1*widthB + 2] = sum1.z; + if (x3) dst[dst_idx + 1*widthB + 3] = sum1.w; + } + if (y2) { + if (x0) dst[dst_idx + 2*widthB + 0] = sum2.x; + if (x1) dst[dst_idx + 2*widthB + 1] = sum2.y; + if (x2) dst[dst_idx + 2*widthB + 2] = sum2.z; + if (x3) dst[dst_idx + 2*widthB + 3] = sum2.w; + } + if (y3) { + if (x0) dst[dst_idx + 3*widthB + 0] = sum3.x; + if (x1) dst[dst_idx + 3*widthB + 1] = sum3.y; + if (x2) dst[dst_idx + 3*widthB + 2] = sum3.z; + if (x3) dst[dst_idx + 3*widthB + 3] = sum3.w; + } +} + +} // anonymous namespace + +void convolution3d::exec_inplace_matmul_fwd( + const float *src, const float *filter, float *dst, + size_t N, size_t INP_BS, size_t OUT_BS, + size_t IC, size_t ID, size_t IH, size_t IW, + size_t OC, size_t OD, size_t OH, size_t OW, + size_t FD, size_t FH, size_t FW, + size_t PD, size_t PH, size_t PW, + size_t SD, size_t SH, size_t SW, + size_t DD, size_t DH, size_t DW, + bool is_xcorr, + cudaStream_t stream) +{ + BufferFetcherTextureHost src_tex(const_cast(src), N * INP_BS), + filter_tex(const_cast(filter), OC*IC*FD*FH*FW); + BufferFetcherRaw src_buf, filter_buf; + src_buf.ptr = src; + filter_buf.ptr = filter; + if (!src_tex.init_succ || !filter_tex.init_succ) { + src_tex.reset(); + filter_tex.reset(); + } + int m = OC; + int n = OD*OH*OW; + int BY = 1; + int BX = 1; + if (m <= 64) { + while (BY < 16 && (BY<<2) < m) BY <<= 1; + BX = 256 / BY; + } else if (n <= 64) { + while (BX < 16 && (BX<<2) < n) BX <<= 1; + BY = 256 / BX; + } else { + BX = BY = 16; + } + dim3 blocks(DIVUP(OD*OH*OW, 4*BX), DIVUP(OC, 4*BY), N); + dim3 threads(BX, BY); +#define DISPATCH_BX_BY(BX, BY) do { \ + if (src_tex.init_succ) { \ + KernelPtr::type kptr; \ + if (is_xcorr) { \ + kptr = conv_kernel; \ + } else { \ + kptr = conv_kernel; \ + } \ + kptr<<>>( \ + src_tex.val, filter_tex.val, dst, \ + INP_BS, OUT_BS, \ + IC, ID, IH, IW, \ + OC, OD, OH, OW, \ + FD, FH, FW, \ + SD, SH, SW, \ + PD, PH, PW, \ + DD, DH, DW); \ + } else { 
\ + KernelPtr::type kptr; \ + if (is_xcorr) { \ + kptr = conv_kernel; \ + } else { \ + kptr = conv_kernel; \ + } \ + kptr<<>>( \ + src_buf, filter_buf, dst, \ + INP_BS, OUT_BS, \ + IC, ID, IH, IW, \ + OC, OD, OH, OW, \ + FD, FH, FW, \ + SD, SH, SW, \ + PD, PH, PW, \ + DD, DH, DW); \ + } \ +} while (0) +#define DISPATCH_BX(BX) do { \ + DISPATCH_BX_BY(BX, 256/BX); \ +} while (0) +#define DISPATCH() do { \ + switch (BX) { \ + case 1: DISPATCH_BX(1); break; \ + case 2: DISPATCH_BX(2); break; \ + case 4: DISPATCH_BX(4); break; \ + case 8: DISPATCH_BX(8); break; \ + case 16: DISPATCH_BX(16); break; \ + case 32: DISPATCH_BX(32); break; \ + case 64: DISPATCH_BX(64); break; \ + case 128: DISPATCH_BX(128); break; \ + case 256: DISPATCH_BX(256); break; \ + default: \ + report_error("no usable kernel"); \ + } \ +} while (0) + DISPATCH(); +#undef DISPATCH +#undef DISPATCH_BX +#undef DISPATCH_BX_BY + after_kernel_launch(); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cuh b/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cuh new file mode 100644 index 00000000..98f80060 --- /dev/null +++ b/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cuh @@ -0,0 +1,36 @@ +/** + * \file dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include +#include +#include + +namespace megdnn { +namespace cuda { +namespace convolution3d { + +void exec_inplace_matmul_fwd(const float *src, const float *filter, float *dst, + size_t N, size_t INP_BS, size_t OUT_BS, + size_t IC, size_t ID, size_t IH, size_t IW, + size_t OC, size_t OD, size_t OH, size_t OW, + size_t FD, size_t FH, size_t FW, + size_t PD, size_t PH, size_t PW, + size_t SD, size_t SH, size_t SW, + size_t DD, size_t DH, size_t DW, + bool is_xcorr, + cudaStream_t stream); + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/helper.cpp b/dnn/src/cuda/convolution3d/helper.cpp new file mode 100644 index 00000000..478f71b1 --- /dev/null +++ b/dnn/src/cuda/convolution3d/helper.cpp @@ -0,0 +1,53 @@ +/** + * \file dnn/src/cuda/convolution3d/helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
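exec_inplace_matmul_fwd above sizes its 256-thread blocks adaptively: when the GEMM is tall and narrow (or short and wide) it skews BX x BY toward the long dimension, otherwise it uses 16 x 16. The same selection, pulled out into a standalone helper for clarity (a sketch of the logic above, not the function in the diff):

// Pick a BX x BY block shape with BX * BY == 256, skewed toward the larger of
// m (= OC) and n (= OD*OH*OW), mirroring the logic in exec_inplace_matmul_fwd.
inline void pick_block_shape(int m, int n, int& BX, int& BY) {
    BX = 1;
    BY = 1;
    if (m <= 64) {
        while (BY < 16 && (BY << 2) < m)
            BY <<= 1;
        BX = 256 / BY;
    } else if (n <= 64) {
        while (BX < 16 && (BX << 2) < n)
            BX <<= 1;
        BY = 256 / BX;
    } else {
        BX = BY = 16;
    }
}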
+ */ + +#include "./helper.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +bool convolution3d::is_cudnn_supported(const ForwardSizeArgs &args) { + if (args.handle->is_tegra_k1()) + return false; + + if (args.src_layout->dtype.category() != DTypeCategory::FLOAT) + return false; + + if (args.filter_meta.format != param::Convolution3D::Format::NCDHW) + return false; + auto& fm = args.filter_meta; + return +#if CUDNN_MAJOR >= 7 + true +#else + fm.group == 1 +#endif + && fm.spatial_ndim == 3; +} + +void convolution3d::flip_filter(const ForwardSizeArgs &args, + const Workspace &workspace, void *&raw_ptr) { + auto &&fm = args.filter_meta; + megdnn_assert(fm.group == 1 && fm.spatial_ndim == 3); + auto OC = fm.ocpg, IC = fm.icpg, FD = fm.spatial[0], FH = fm.spatial[1], FW = fm.spatial[2]; + auto dtype = DType::from_enum(fm.dtype_enum); + megdnn_assert(workspace.size >= dtype.size() * OC * IC * FD * FH * FW); + TensorND src{raw_ptr, {{OC, IC, FD, FH, FW}, dtype}}, + dst{workspace.raw_ptr + (FD * FH * FW - 1) * dtype.size(), src.layout}; + dst.layout.stride[2] = -dst.layout.stride[2]; + dst.layout.stride[3] = -dst.layout.stride[3]; + dst.layout.stride[4] = -dst.layout.stride[4]; + args.handle->relayout_opr()->exec(src, dst); + raw_ptr = workspace.raw_ptr; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/helper.h b/dnn/src/cuda/convolution3d/helper.h new file mode 100644 index 00000000..8fe73658 --- /dev/null +++ b/dnn/src/cuda/convolution3d/helper.h @@ -0,0 +1,242 @@ +/** + * \file dnn/src/cuda/convolution3d/helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "./opr_impl.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/handle.h" +#include "src/common/utils.h" +#include "src/common/algo_chooser.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { +namespace convolution3d { + using CanonizedFilterMeta = Convolution3DForward::CanonizedFilterMeta; + + //! conv size descriptor in the forward view + struct ForwardSizeArgs { + HandleImpl *handle; + const TensorLayout *src_layout; + CanonizedFilterMeta filter_meta; + const TensorLayout *dst_layout; + param::Convolution3D::DataType data_type; + }; + + //! 
whether cudnn is supported for a filter meta + bool is_cudnn_supported(const ForwardSizeArgs &args); + + struct CUDNNForwardDescs { + Tensor3DDesc src_desc, dst_desc; + Filter3DDesc filter_desc; + Conv3DDesc conv_desc; + void set(const TensorLayout &src, + const CanonizedFilterMeta &filter, + const TensorLayout &dst, + const param::Convolution3D ¶m) + { + src_desc.set(src); + filter_desc.set(filter); + dst_desc.set(dst); + conv_desc.set(param, filter.group); + } + }; + + struct CUDNNBwdDataDescs { + Tensor3DDesc diff_desc, grad_desc; + Filter3DDesc filter_desc; + Conv3DDesc conv_desc; + void set(const CanonizedFilterMeta &filter, + const TensorLayout &diff, + const TensorLayout &grad, + const param::Convolution3D ¶m) + { + filter_desc.set(filter); + diff_desc.set(diff); + grad_desc.set(grad); + conv_desc.set(param, filter.group); + } + }; + + struct CUDNNBwdFilterDescs { + Tensor3DDesc diff_desc, src_desc; + Filter3DDesc grad_desc; + Conv3DDesc conv_desc; + void set(const TensorLayout &src, + const TensorLayout &diff, + const CanonizedFilterMeta &grad, + const param::Convolution3D ¶m) + { + src_desc.set(src); + diff_desc.set(diff); + grad_desc.set(grad); + conv_desc.set(param, grad.group); + } + }; + + /*! + * \brief flip conv filter + * + * Flip conv filter pointed by \p raw_ptr, store result in workspace, and + * change \p raw_ptr to workspace. + */ + void flip_filter(const ForwardSizeArgs &args, + const Workspace &workspace, void *&raw_ptr); + + inline bool cudnn_get_convolution_fwd_algo_helper( + cudnnHandle_t cudnn_handle, const cudnnTensorDescriptor_t x_desc, + const cudnnFilterDescriptor_t w_desc, + const cudnnConvolutionDescriptor_t conv_desc, + const cudnnTensorDescriptor_t y_desc, + size_t workspace_limit_in_bytes, cudnnConvolutionFwdAlgo_t* algo, + bool reproducible) { + MEGDNN_MARK_USED_VAR(reproducible); +#if CUDNN_MAJOR >= 7 + int algo_max_count = 0; + cudnn_check(cudnnGetConvolutionForwardAlgorithmMaxCount( + cudnn_handle, &algo_max_count)); + SmallVector algo_perf(algo_max_count); + int algo_count = 0; + cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7( + cudnn_handle, x_desc, w_desc, conv_desc, y_desc, algo_max_count, + &algo_count, algo_perf.data())); + for (int i = 0; i < algo_count; ++i) { + if (algo_perf[i].algo == + cudnnConvolutionFwdAlgo_t:: + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) + continue; + size_t workspace_size = 0; + cudnn_check(cudnnGetConvolutionForwardWorkspaceSize( + cudnn_handle, x_desc, w_desc, conv_desc, y_desc, + algo_perf[i].algo, &workspace_size)); + if (workspace_size > workspace_limit_in_bytes) continue; + if (!reproducible) { + *algo = algo_perf[i].algo; + return true; + } else { + if (algo_perf[i].determinism == CUDNN_DETERMINISTIC) { + *algo = algo_perf[i].algo; + return true; + } + } + } + return false; +#else + cudnn_check(cudnnGetConvolutionForwardAlgorithm( + cudnn_handle, x_desc, w_desc, conv_desc, y_desc, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_in_bytes, algo)); + return true; +#endif + } + + inline bool cudnn_get_convolution_bwd_data_algo_helper( + cudnnHandle_t cudnn_handle, const cudnnFilterDescriptor_t w_desc, + const cudnnTensorDescriptor_t dy_desc, + const cudnnConvolutionDescriptor_t conv_desc, + const cudnnTensorDescriptor_t dx_desc, + size_t workspace_limit_in_bytes, + cudnnConvolutionBwdDataAlgo_t* algo, bool reproducible) { + MEGDNN_MARK_USED_VAR(reproducible); +#if CUDNN_MAJOR >= 7 + int algo_max_count = 0; + cudnn_check(cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + cudnn_handle, 
&algo_max_count)); + SmallVector algo_perf( + algo_max_count); + int algo_count = 0; + cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7( + cudnn_handle, w_desc, dy_desc, conv_desc, dx_desc, + algo_max_count, &algo_count, algo_perf.data())); + for (int i = 0; i < algo_count; ++i) { + if (algo_perf[i].algo == + cudnnConvolutionBwdDataAlgo_t:: + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING) + continue; + size_t workspace_size = 0; + cudnn_check(cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnn_handle, w_desc, dy_desc, conv_desc, dx_desc, + algo_perf[i].algo, &workspace_size)); + if (workspace_size > workspace_limit_in_bytes) continue; + if (!reproducible) { + *algo = algo_perf[i].algo; + return true; + } else { + if (algo_perf[i].determinism == CUDNN_DETERMINISTIC) { + *algo = algo_perf[i].algo; + return true; + } + } + } + return false; +#else + cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle, + w_desc, dy_desc, conv_desc, dx_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_in_bytes, + algo)); + return true; +#endif + } + + inline bool cudnn_get_convolution_bwd_filter_algo_helper( + cudnnHandle_t cudnn_handle, const cudnnTensorDescriptor_t x_desc, + const cudnnTensorDescriptor_t dy_desc, + const cudnnConvolutionDescriptor_t conv_desc, + const cudnnFilterDescriptor_t dw_desc, + size_t workspace_limit_in_bytes, + cudnnConvolutionBwdFilterAlgo_t* algo, bool reproducible) { + MEGDNN_MARK_USED_VAR(reproducible); +#if CUDNN_MAJOR >= 7 + int algo_max_count = 0; + cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + cudnn_handle, &algo_max_count)); + SmallVector algo_perf( + algo_max_count); + int algo_count = 0; + cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm_v7( + cudnn_handle, x_desc, dy_desc, conv_desc, dw_desc, + algo_max_count, &algo_count, algo_perf.data())); + for (int i = 0; i < algo_count; ++i) { + if (algo_perf[i].algo == + cudnnConvolutionBwdFilterAlgo_t::CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) + continue; + size_t workspace_size = 0; + cudnn_check(cudnnGetConvolutionBackwardFilterWorkspaceSize( + cudnn_handle, x_desc, dy_desc, conv_desc, dw_desc, + algo_perf[i].algo, &workspace_size)); + if (workspace_size > workspace_limit_in_bytes) continue; + if (!reproducible) { + *algo = algo_perf[i].algo; + return true; + } else { + if (algo_perf[i].determinism == CUDNN_DETERMINISTIC) { + *algo = algo_perf[i].algo; + return true; + } + } + } + return false; +#else + cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm( + cudnn_handle, x_desc, dy_desc, conv_desc, dw_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_in_bytes, algo)); + return true; +#endif + } + + +} // namespace convolution3d +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/opr_impl.cpp b/dnn/src/cuda/convolution3d/opr_impl.cpp new file mode 100644 index 00000000..7c2715ed --- /dev/null +++ b/dnn/src/cuda/convolution3d/opr_impl.cpp @@ -0,0 +1,348 @@ +/** + * \file dnn/src/cuda/convolution3d/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./opr_impl.h" +#include "./backward_data/algo.h" +#include "./backward_filter/algo.h" +#include "./forward/algo.h" +#include "./helper.h" + +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace convolution3d; + +#define TO_STRING2(v) #v +#define TO_STRING(v) TO_STRING2(v) +#define CUDNN_VERSION_STR \ + TO_STRING(CUDNN_MAJOR) \ + "." TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL) + +/* ============== Convolution3DForwardImpl ============== */ +Convolution3DForwardImpl::Algorithm* +Convolution3DForwardImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, size_t workspace_limit_in_bytes, + bool reproducible) { + auto fm = check_layout_fwd(src, filter, dst); + return get_algorithm_heuristic(src, fm, dst, workspace_limit_in_bytes, + reproducible); +} +Convolution3DForwardImpl::Algorithm* +Convolution3DForwardImpl::get_algorithm_heuristic( + const TensorLayout& src, const CanonizedFilterMeta& filter, + const TensorLayout& dst, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, src, filter, dst); + +#if CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5) + if (args.filter_meta.group > 1) { + // prefer special chanwise impl since as the group conv of cudnn whose + // version is lower than v7.5.0 is still slower than our implementation + // in many channel-wise cases + if (sm_algo_pack.chanwise.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.chanwise; + } + } +#endif + + auto prefer_1x1x1 = [&args, reproducible, workspace_limit_in_bytes]() { + const size_t MAX_BATCH_SIZE_FOR_1x1x1_MAT_ALGO = 4; + size_t batch_size = args.src_layout->shape[0]; + if (batch_size > MAX_BATCH_SIZE_FOR_1x1x1_MAT_ALGO) { + return false; + } + return sm_algo_pack.a1x1x1.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes); + }; + + auto get_cudnn_algo = + [this, &args, workspace_limit_in_bytes, + reproducible]() -> Convolution3DForwardImpl::AlgoBase* { + auto cudnn_handle = cuda::cudnn_handle(this->handle()); + cudnnConvolutionFwdAlgo_t algo; + CUDNNForwardDescs desc; + args.init_desc(desc); + + bool got = cudnn_get_convolution_fwd_algo_helper( + cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc, + desc.conv_desc.desc, desc.dst_desc.desc, + workspace_limit_in_bytes, &algo, reproducible); + if (got) { + return static_cast( + megdnn::get_reproducible_algo( + sm_algo_pack.cudnn_from_enum(algo), reproducible)); + } else { + return nullptr; + } + }; + if (prefer_1x1x1()) { + return &sm_algo_pack.a1x1x1; + } + if (is_cudnn_supported(args)) { + if (auto algo = get_cudnn_algo()) + return algo; + } + if (args.filter_meta.group > 1) { + auto orig_args = args; + TensorLayout a, b; + AlgoGroupConvGeneral::modify_size_args(args, a, b); + if (prefer_1x1x1()) { + return sm_algo_pack.algo2gconv.at(&sm_algo_pack.a1x1x1); + } + if (is_cudnn_supported(args)) { + if (auto algo = get_cudnn_algo()) + return sm_algo_pack.algo2gconv.at(algo); + } + args = orig_args; + } + + if (reproducible) { + return megdnn::get_reproducible_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv3d fwd"); + } else { + return megdnn::get_usable_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv3d fwd"); + } +} + +std::vector +Convolution3DForwardImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) { + 
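    // Editorial note (not part of the original commit): get_all_algorithms()
    // simply enumerates every registered algorithm for the given layouts,
    // whereas get_algorithm_heuristic() above selects a single one in
    // priority order: the channel-wise kernel for grouped convolutions on
    // cuDNN older than 7.5, then the dedicated 1x1x1 algorithm for small
    // batches (batch size <= 4), then a cuDNN algorithm that fits the
    // workspace limit (and, if requested, is deterministic), then the
    // group-conv wrapper around either of those, and finally the first
    // usable (or reproducible) non-cuDNN algorithm.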
return megdnn::get_all_algorithms( + {this, src, filter, dst}); +} + +size_t Convolution3DForwardImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) { + AlgoBase::SizeArgs args(this, src, filter, dst); + return get_algorithm(this, src, args.filter_meta, dst) + ->get_workspace_in_bytes(args); +} + +void Convolution3DForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, src, filter, dst, workspace); + auto algo = get_algorithm(this, src.layout, args.filter_meta, dst.layout); + algo->check_workspace(args, workspace).exec(args); +} + +const char* Convolution3DForwardImpl::get_algorithm_set_name() const { + return "CUDACONV0+CUDNN" CUDNN_VERSION_STR; +} + +void Convolution3DBackwardDataImpl::exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); + auto algo = get_algorithm(this, args.filter_meta, diff.layout, grad.layout); + algo->check_workspace(args, workspace).exec(args); +} + +std::vector +Convolution3DBackwardDataImpl::get_all_algorithms(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) { + return megdnn::get_all_algorithms( + {this, filter, diff, grad}); +} + +Convolution3DBackwardDataImpl::Algorithm* +Convolution3DBackwardDataImpl::get_algorithm_heuristic( + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + auto fm = check_layout_fwd(grad, filter, diff); + return get_algorithm_heuristic(fm, diff, grad, workspace_limit_in_bytes, + reproducible); +} + +Convolution3DBackwardDataImpl::Algorithm* +Convolution3DBackwardDataImpl::get_algorithm_heuristic( + const CanonizedFilterMeta& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, filter, diff, grad); + + if (args.filter_meta.group > 1 && + sm_algo_pack.chanwise.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.chanwise; + } + + auto get_cudnn_algo = + [this, &args, workspace_limit_in_bytes, + reproducible]() -> Convolution3DBackwardDataImpl::AlgoBase* { + auto cudnn_handle = cuda::cudnn_handle(this->handle()); + cudnnConvolutionBwdDataAlgo_t algo; + CUDNNBwdDataDescs desc; + args.init_desc(desc); + bool got = cudnn_get_convolution_bwd_data_algo_helper( + cudnn_handle, desc.filter_desc.desc, desc.diff_desc.desc, + desc.conv_desc.desc, desc.grad_desc.desc, + workspace_limit_in_bytes, &algo, reproducible); + if (got) { + return static_cast(megdnn::get_reproducible_algo< + Convolution3DBackwardDataImpl>( + sm_algo_pack.cudnn_from_enum(algo), reproducible)); + } else { + return nullptr; + } + }; + + if (is_cudnn_supported(args.as_fwd_args())) { + if (auto algo = get_cudnn_algo()) + return algo; + } + + if (args.filter_meta.group > 1) { + auto orig_args = args; + TensorLayout a, b; + AlgoGroupConvGeneral::modify_size_args(args, a, b); + if (is_cudnn_supported(args.as_fwd_args())) { + if (auto algo = get_cudnn_algo()) + return sm_algo_pack.algo2gconv.at(algo); + } + args = orig_args; + } + + if (reproducible) { + return megdnn::get_reproducible_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv3d bwd data"); + } else { + return megdnn::get_usable_algo( + 
sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv3d bwd data"); + } +} + +size_t Convolution3DBackwardDataImpl::get_workspace_in_bytes( + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad) { + AlgoBase::SizeArgs args(this, filter, diff, grad); + return get_algorithm(this, args.filter_meta, diff, grad) + ->get_workspace_in_bytes(args); +} + +const char* Convolution3DBackwardDataImpl::get_algorithm_set_name() const { + return "CUDACONV0+CUDNN" CUDNN_VERSION_STR; +} + +void Convolution3DBackwardFilterImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, src, diff, grad, workspace); + auto algo = + get_algorithm(this, src.layout, diff.layout, args.grad_filter_meta); + algo->check_workspace(args, workspace).exec(args); +} + +std::vector +Convolution3DBackwardFilterImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) { + return megdnn::get_all_algorithms( + {this, src, diff, grad}); +} + +Convolution3DBackwardFilterImpl::Algorithm* +Convolution3DBackwardFilterImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + auto fm = check_layout_fwd(src, grad, diff); + return get_algorithm_heuristic(src, diff, fm, workspace_limit_in_bytes, + reproducible); +} + +Convolution3DBackwardFilterImpl::Algorithm* +Convolution3DBackwardFilterImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& diff, + const CanonizedFilterMeta& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, src, diff, grad); + + if (args.grad_filter_meta.group > 1 && + sm_algo_pack.chanwise.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.chanwise; + } + + auto get_cudnn_algo = + [this, &args, workspace_limit_in_bytes, + reproducible]() -> Convolution3DBackwardFilterImpl::AlgoBase* { + auto cudnn_handle = cuda::cudnn_handle(this->handle()); + cudnnConvolutionBwdFilterAlgo_t algo; + CUDNNBwdFilterDescs desc; + args.init_desc(desc); + bool got = cudnn_get_convolution_bwd_filter_algo_helper( + cudnn_handle, desc.src_desc.desc, desc.diff_desc.desc, + desc.conv_desc.desc, desc.grad_desc.desc, + workspace_limit_in_bytes, &algo, reproducible); + if (got) { + return static_cast(megdnn::get_reproducible_algo< + Convolution3DBackwardFilterImpl>( + sm_algo_pack.cudnn_from_enum(algo), reproducible)); + } else { + return nullptr; + } + }; + + if (is_cudnn_supported(args.as_fwd_args())) { + if (auto algo = get_cudnn_algo()) + return algo; + } + if (args.grad_filter_meta.group > 1) { + auto orig_args = args; + TensorLayout a, b; + AlgoGroupConvGeneral::modify_size_args(args, a, b); + if (is_cudnn_supported(args.as_fwd_args())) { + if (auto algo = get_cudnn_algo()) + return sm_algo_pack.algo2gconv.at(algo); + } + args = orig_args; + } + + if (reproducible) { + return megdnn::get_reproducible_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv3d bwd filter"); + } else { + return megdnn::get_usable_algo( + sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, + "cuda conv3d bwd filter"); + } +} + +size_t Convolution3DBackwardFilterImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad) { + AlgoBase::SizeArgs args(this, src, diff, 
grad); + return get_algorithm(this, src, diff, args.grad_filter_meta) + ->get_workspace_in_bytes(args); +} + +const char* Convolution3DBackwardFilterImpl::get_algorithm_set_name() const { + return "CUDACONV0+CUDNN" CUDNN_VERSION_STR; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/opr_impl.h b/dnn/src/cuda/convolution3d/opr_impl.h new file mode 100644 index 00000000..120b1fa2 --- /dev/null +++ b/dnn/src/cuda/convolution3d/opr_impl.h @@ -0,0 +1,140 @@ +/** + * \file dnn/src/cuda/convolution3d/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/oprs/nn.h" + +namespace megdnn { +namespace cuda { + +class Convolution3DForwardImpl: public Convolution3DForward { + public: + using Convolution3DForward::Convolution3DForward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + std::vector get_all_algorithms(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const CanonizedFilterMeta& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible); + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) override; + const char* get_algorithm_set_name() const override; + class AlgoBase; + class AlgoCUDNN; + class Algo1x1x1; + class AlgoInplaceMatmul; + class AlgoChanwise; + class AlgoGroupConvGeneral; + class AlgoPack; + static const AlgoPack& algo_pack() { + return sm_algo_pack; + } + private: + static AlgoPack sm_algo_pack; +}; + +class Convolution3DBackwardDataImpl: public Convolution3DBackwardData { + public: + using Convolution3DBackwardData::Convolution3DBackwardData; + void exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + std::vector get_all_algorithms(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible) override; + Algorithm* get_algorithm_heuristic(const CanonizedFilterMeta& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible); + size_t get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoCUDNN; + class AlgoInplaceMatmul; + class AlgoChanwise; + class AlgoGroupConvGeneral; + + class AlgoPack; + + static const AlgoPack& algo_pack() { + return sm_algo_pack; + } + + private: + static AlgoPack sm_algo_pack; +}; + +class Convolution3DBackwardFilterImpl: public Convolution3DBackwardFilter { + public: + using Convolution3DBackwardFilter::Convolution3DBackwardFilter; 
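        // Editorial note (not part of the original commit): this class follows
        // the same pattern as Convolution3DForwardImpl and
        // Convolution3DBackwardDataImpl above -- exec() dispatches to an
        // algorithm chosen by get_algorithm_heuristic(), which has a second
        // overload taking an already-canonized CanonizedFilterMeta for the
        // filter gradient, and the concrete algorithms (cuDNN, inplace
        // matmul, channel-wise, group-conv wrapper) are collected in the
        // static AlgoPack sm_algo_pack.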
+ void exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + std::vector get_all_algorithms(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& diff, + const CanonizedFilterMeta& grad, + size_t workspace_limit_in_bytes, + bool reproducible); + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoCUDNN; + class AlgoInplaceMatmul; + class AlgoChanwise; + class AlgoGroupConvGeneral; + + class AlgoPack; + + static const AlgoPack& algo_pack() { + return sm_algo_pack; + } + + private: + static AlgoPack sm_algo_pack; +}; +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution_helper/activation.cuh b/dnn/src/cuda/convolution_helper/activation.cuh new file mode 100644 index 00000000..4ac4397c --- /dev/null +++ b/dnn/src/cuda/convolution_helper/activation.cuh @@ -0,0 +1,115 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/activation.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/common/opr_param_defs_enumv.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct Activation; + +#define DEF_APPLY_AND_TRANSFORM(_act) \ + __device__ __forceinline__ int apply_and_transform(float4 in) { \ + return transform_float4_to_int8x4( \ + quantize(_act::apply(dequantize(in)))); \ + } + +template <> +struct Activation { + float scale; + float inv_scale; + MEGDNN_HOST MEGDNN_DEVICE Activation(float scale, float inv_scale) + : scale{scale}, inv_scale{inv_scale} {} +#if MEGDNN_CC_CUDA + DEF_APPLY_AND_TRANSFORM( + Activation); + __device__ __forceinline__ float4 dequantize(float4 in) { + return scale * in; + } + __device__ __forceinline__ float4 quantize(float4 in) { + return inv_scale * in; + } + __device__ __forceinline__ static float4 apply(float4 in) { + float x = in.x * fminf(fmaxf(in.x + 3.f, 0.f), 6.f) * (1.f / 6.f); + float y = in.y * fminf(fmaxf(in.y + 3.f, 0.f), 6.f) * (1.f / 6.f); + float z = in.z * fminf(fmaxf(in.z + 3.f, 0.f), 6.f) * (1.f / 6.f); + float w = in.w * fminf(fmaxf(in.w + 3.f, 0.f), 6.f) * (1.f / 6.f); + return make_float4(x, y, z, w); + } +#endif +}; + +template <> +struct Activation { + MEGDNN_HOST MEGDNN_DEVICE Activation(float /* scale */, + float /* inv_scale */) {} +#if MEGDNN_CC_CUDA + DEF_APPLY_AND_TRANSFORM( + Activation); + __device__ __forceinline__ float4 dequantize(float4 in) { return in; } + __device__ __forceinline__ float4 quantize(float4 in) { return in; } + __device__ __forceinline__ static float4 apply(float4 in) { + float x = in.x <= 0 ? 0 : in.x; + float y = in.y <= 0 ? 0 : in.y; + float z = in.z <= 0 ? 0 : in.z; + float w = in.w <= 0 ? 0 : in.w; + return make_float4(x, y, z, w); + } +#endif +}; + +template <> +struct Activation { + MEGDNN_HOST MEGDNN_DEVICE Activation(float /* scale */, + float /* inv_scale */) {} +#if MEGDNN_CC_CUDA + DEF_APPLY_AND_TRANSFORM( + Activation); + __device__ __forceinline__ float4 dequantize(float4 in) { return in; } + __device__ __forceinline__ float4 quantize(float4 in) { return in; } + __device__ __forceinline__ static float4 apply(float4 in) { return in; } +#endif +}; +#undef DEF_APPLY_AND_TRANSFORM + +#define MEGDNN_FOREACH_NONLINE_MODE(cb) cb(H_SWISH) cb(RELU) cb(IDENTITY) + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/bias_visitor.cuh b/dnn/src/cuda/convolution_helper/bias_visitor.cuh new file mode 100644 index 00000000..06f68139 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/bias_visitor.cuh @@ -0,0 +1,71 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/bias_visitor.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +struct PerChannelBiasVisitor { + const int32_t* __restrict__ bias; +#if MEGDNN_CC_CUDA + __host__ __device__ __forceinline__ void move(int, int ch, int, int) { + bias += ch; + } + __host__ __device__ __forceinline__ float4 at(int, int ch, int, int) { + int ix = *(bias + ch); + int iy = *(bias + ch + 1); + int iz = *(bias + ch + 2); + int iw = *(bias + ch + 3); + return ::make_float4(static_cast(ix), static_cast(iy), + static_cast(iz), static_cast(iw)); + } + __host__ __device__ __forceinline__ float4 at(int, int ch, int) { + int ix = *(bias + ch); + int iy = *(bias + ch + 1); + int iz = *(bias + ch + 2); + int iw = *(bias + ch + 3); + return ::make_float4(static_cast(ix), static_cast(iy), + static_cast(iz), static_cast(iw)); + } +#endif +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_consumer/block_consumer.cuh b/dnn/src/cuda/convolution_helper/block_tile_consumer/block_consumer.cuh new file mode 100644 index 00000000..8150cf31 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_consumer/block_consumer.cuh @@ -0,0 +1,41 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_consumer/block_consumer.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer.cuh" +#include "src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_unroll_width.cuh" +#include "src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer.cuh" +#include "src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer_unroll_width.cuh" + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer.cuh b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer.cuh new file mode 100644 index 00000000..c84b3938 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer.cuh @@ -0,0 +1,245 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct IConvBlockConsumer; + +template +struct IConvBlockConsumer { + using ThreadConfig = ThreadConfig_; + using RegBlockConfig = RegBlockConfig_; + + int32_t reg_src[RegBlockConfig::reg_n][2]; + int32_t reg_filter[RegBlockConfig::reg_m][2]; + int32_t reg_acc[RegBlockConfig::reg_n][RegBlockConfig::reg_m]; + + __device__ __forceinline__ void init_accumulator() { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m; ++j) { + reg_acc[i][j] = 0; + } + } + } + + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + using smem_storage_dtype = + typename DataGlobal2ShareMemVisitor::smem_storage_dtype; + static bool const use_wide_store = !(RegBlockConfig::reg_n & 0x1); + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (RegBlockConfig::reg_n >> 1); ++i) { + int i2 = (i << 1); + int tidx2 = (tidx << 1); + reg_src[i2][0] = *(data_gl2sh_visitor.sh_ptr( + 0, tidx2 + i2 * ThreadConfig::nr_thread_x)); + reg_src[i2 + 1][0] = *(data_gl2sh_visitor.sh_ptr( + 0, tidx2 + i2 * ThreadConfig::nr_thread_x + 1)); + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { + reg_src[i][0] = *(data_gl2sh_visitor.sh_ptr( + 0, tidx + i * ThreadConfig::nr_thread_x)); + } + } + +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m_packed; ++j) { + smem_storage_dtype* ker_sh_ptr = filter_gl2sh_visitor.sh_ptr( + 0, tidy * RegBlockConfig::pack_size + + j * ThreadConfig::nr_thread_y * + RegBlockConfig::pack_size); +#pragma unroll + for (int pack = 0; pack < RegBlockConfig::pack_size; ++pack) { + reg_filter[j * RegBlockConfig::pack_size + pack][0] = + *(ker_sh_ptr++); + } + } + +#pragma unroll + for (int ci_inner = 0; ci_inner < RegBlockConfig::reg_k_packed; + ++ci_inner) { + const int comp_idx = (ci_inner & 0x1); + const int load_idx = 1 - comp_idx; +#pragma unroll + for (int i = 0; i < 
RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m; ++j) { + dot_prod(reg_src[i][comp_idx], reg_filter[j][comp_idx], + reg_acc[i][j], reg_acc[i][j]); + } + } + + if (ci_inner < RegBlockConfig::reg_k_packed - 1) { + int32_t* data_sh_ptr = + data_gl2sh_visitor.sh_ptr(ci_inner + 1, 0); + int32_t* ker_sh_ptr = + filter_gl2sh_visitor.sh_ptr(ci_inner + 1, 0); + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (RegBlockConfig::reg_n >> 1); ++i) { + int i2 = (i << 1); + int tidx2 = (tidx << 1); + reg_src[i2][load_idx] = + data_sh_ptr[tidx2 + + i2 * ThreadConfig::nr_thread_x]; + reg_src[i2 + 1][load_idx] = + data_sh_ptr[tidx2 + + i2 * ThreadConfig::nr_thread_x + 1]; + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { + reg_src[i][load_idx] = + data_sh_ptr[tidx + + i * ThreadConfig::nr_thread_x]; + } + } + +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m_packed; ++j) { + smem_storage_dtype* ker_sh_ptr_packed = + &ker_sh_ptr[(tidy + j * ThreadConfig::nr_thread_y) * + RegBlockConfig::pack_size]; +#pragma unroll + for (int pack = 0; pack < RegBlockConfig::pack_size; + ++pack) { + reg_filter[j * RegBlockConfig::pack_size + pack] + [load_idx] = *(ker_sh_ptr_packed++); + } + } + } + } + } +}; + +template +struct IConvBlockConsumer { + using ThreadConfig = ThreadConfig_; + using RegBlockConfig = RegBlockConfig_; + + int32_t reg_src[RegBlockConfig::reg_n]; + int32_t reg_filter[RegBlockConfig::reg_m]; + int32_t reg_acc[RegBlockConfig::reg_n][RegBlockConfig::reg_m]; + + __device__ __forceinline__ void init_accumulator() { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m; ++j) { + reg_acc[i][j] = 0; + } + } + } + + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + using smem_storage_dtype = + typename DataGlobal2ShareMemVisitor::smem_storage_dtype; + static bool constexpr use_wide_store = !(RegBlockConfig::reg_n & 0x1); + +#pragma unroll + for (int ci_inner = 0; ci_inner < RegBlockConfig::reg_k_packed; + ++ci_inner) { + smem_storage_dtype* data_sh_ptr = + data_gl2sh_visitor.sh_ptr(ci_inner, 0); + smem_storage_dtype* ker_sh_ptr = + filter_gl2sh_visitor.sh_ptr(ci_inner, 0); + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (RegBlockConfig::reg_n >> 1); ++i) { + int i2 = (i << 1); + int tidx2 = (tidx << 1); + reg_src[i2] = + data_sh_ptr[tidx2 + i2 * ThreadConfig::nr_thread_x]; + reg_src[i2 + 1] = + data_sh_ptr[tidx2 + i2 * ThreadConfig::nr_thread_x + + 1]; + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { + reg_src[i] = + data_sh_ptr[tidx + i * ThreadConfig::nr_thread_x]; + } + } +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m_packed; ++j) { + smem_storage_dtype* ker_sh_ptr_packed = + &ker_sh_ptr[(tidy + j * ThreadConfig::nr_thread_y) * + RegBlockConfig::pack_size]; +#pragma unroll + for (int pack = 0; pack < RegBlockConfig::pack_size; ++pack) { + reg_filter[j * RegBlockConfig::pack_size + pack] = + *(ker_sh_ptr_packed++); + } + } +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m; ++j) { + dot_prod(reg_src[i], reg_filter[j], reg_acc[i][j], + reg_acc[i][j]); + } + } + } + } +}; + +} // namespace convolution +} // namespace cuda +} // 
namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_coxhw.cuh b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_coxhw.cuh new file mode 100644 index 00000000..22430275 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_coxhw.cuh @@ -0,0 +1,263 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_coxhw.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct IConvBlockConsumer_COxHW; + +template +struct IConvBlockConsumer_COxHW { + using ThreadConfig = ThreadConfig_; + using RegBlockConfig = RegBlockConfig_; + + int32_t reg_src[RegBlockConfig::reg_width][2]; + int32_t reg_filter[RegBlockConfig::reg_m][2]; + int32_t reg_acc[RegBlockConfig::reg_width][RegBlockConfig::reg_m]; + + __device__ __forceinline__ void init_accumulator() { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_width; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m; ++j) { + reg_acc[i][j] = 0; + } + } + } + + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + using smem_storage_dtype = + typename DataGlobal2ShareMemVisitor::smem_storage_dtype; + static bool const use_wide_store = !(RegBlockConfig::reg_width & 0x1); + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (RegBlockConfig::reg_width >> 1); ++i) { + int i2 = (i << 1); + int tidx2 = (tidx << 1); + reg_src[i2][0] = *(data_gl2sh_visitor.sh_ptr( + 0, tidx2 + i2 * ThreadConfig::nr_thread_x)); + reg_src[i2 + 1][0] = *(data_gl2sh_visitor.sh_ptr( + 0, tidx2 + i2 * ThreadConfig::nr_thread_x + 1)); + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_width; ++i) { + reg_src[i][0] = *(data_gl2sh_visitor.sh_ptr( + 0, tidx + i * ThreadConfig::nr_thread_x)); + } + } +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m_packed; ++j) { + int out_channel = ((tidy + j * ThreadConfig::nr_thread_y) + << RegBlockConfig::pack_size_bit); +#pragma unroll + for (int packed = 0; packed < RegBlockConfig::pack_size; ++packed) { + reg_filter[j * RegBlockConfig::pack_size + packed][0] = + *(filter_gl2sh_visitor.sh_ptr(out_channel + packed, 0)); + } + } + +#pragma unroll + for (int ci_inner = 0; ci_inner < RegBlockConfig::reg_k_packed; + ++ci_inner) { + const int comp_idx = (ci_inner & 0x1); + const int load_idx = 1 - comp_idx; + if (ci_inner < RegBlockConfig::reg_k_packed - 1) { + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (RegBlockConfig::reg_width >> 1); ++i) { + int i2 = (i << 1); + int tidx2 = (tidx << 1); + reg_src[i2][load_idx] = *(data_gl2sh_visitor.sh_ptr( + ci_inner + 1, + tidx2 + i2 * ThreadConfig::nr_thread_x)); + reg_src[i2 + 1][load_idx] = *(data_gl2sh_visitor.sh_ptr( + ci_inner + 1, + tidx2 + i2 * ThreadConfig::nr_thread_x + 1)); + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_width; ++i) { + reg_src[i][load_idx] = *(data_gl2sh_visitor.sh_ptr( + ci_inner + 1, + tidx + i * ThreadConfig::nr_thread_x)); + } + } +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m_packed; ++j) { + int out_channel = ((tidy + j * ThreadConfig::nr_thread_y) + << RegBlockConfig::pack_size_bit); +#pragma unroll + for (int packed = 0; packed < RegBlockConfig::pack_size; + ++packed) { + reg_filter[j * RegBlockConfig::pack_size + packed] + [load_idx] = *(filter_gl2sh_visitor.sh_ptr( + out_channel + packed, ci_inner + 1)); + } + } + } +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_width; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m; ++j) { + // if (threadIdx.x == 0 && threadIdx.y == + // 0 && blockIdx.x == 0 && blockIdx.y == + // 0 && blockIdx.z == 0 && i == 0 && j == + // 1) { + // { + // int val = + // 
reg_src[i][comp_idx]; int8_t x + // = (val & 0xff), y = ((val >> + // 8) & 0xff), + // z = ((val >> 16) & + // 0xff), w = ((val >> 24) + // & 0xff); + // printf("src val = %d, %d, %d, + // %d\n", x, y, z, w); int cur = + // x + y + z + w; printf("partial + // sum = %d\n", cur); + // } + // { + // int val = + // reg_filter[j][comp_idx]; + // int8_t x = (val & 0xff), y = + // ((val >> 8) & 0xff), + // z = ((val >> 16) & + // 0xff), w = ((val >> 24) + // & 0xff); + // printf("filter val = %d, %d, + // %d, %d\n", x, y, z, w); + // } + // } + dot_prod(reg_src[i][comp_idx], reg_filter[j][comp_idx], + reg_acc[i][j], reg_acc[i][j]); + } + } + } + } +}; + +template +struct IConvBlockConsumer_COxHW { + using ThreadConfig = ThreadConfig_; + using RegBlockConfig = RegBlockConfig_; + + int32_t reg_src[RegBlockConfig::reg_width]; + int32_t reg_filter[RegBlockConfig::reg_m]; + int32_t reg_acc[RegBlockConfig::reg_width][RegBlockConfig::reg_m]; + + __device__ __forceinline__ void init_accumulator() { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_width; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m; ++j) { + reg_acc[i][j] = 0; + } + } + } + + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + using smem_storage_dtype = + typename DataGlobal2ShareMemVisitor::smem_storage_dtype; + static bool const use_wide_store = !(RegBlockConfig::reg_width & 0x1); + +#pragma unroll + for (int ci_inner = 0; ci_inner < RegBlockConfig::reg_k_packed; + ++ci_inner) { + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (RegBlockConfig::reg_width >> 1); ++i) { + int i2 = (i << 1); + int tidx2 = (tidx << 1); + reg_src[i2] = *(data_gl2sh_visitor.sh_ptr( + ci_inner, tidx2 + i2 * ThreadConfig::nr_thread_x)); + reg_src[i2 + 1] = *(data_gl2sh_visitor.sh_ptr( + ci_inner, + tidx2 + i2 * ThreadConfig::nr_thread_x + 1)); + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_width; ++i) { + reg_src[i] = *(data_gl2sh_visitor.sh_ptr( + ci_inner, tidx + i * ThreadConfig::nr_thread_x)); + } + } +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m_packed; ++j) { + int out_channel = ((tidy + j * ThreadConfig::nr_thread_y) + << RegBlockConfig::pack_size_bit); +#pragma unroll + for (int packed = 0; packed < RegBlockConfig::pack_size; + ++packed) { + reg_filter[j * RegBlockConfig::pack_size + packed] = + *(filter_gl2sh_visitor.sh_ptr(out_channel + + packed, + ci_inner)); + } + } +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_width; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m; ++j) { + dot_prod(reg_src[i], reg_filter[j], reg_acc[i][j], + reg_acc[i][j]); + } + } + } + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_unroll_width.cuh b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_unroll_width.cuh new file mode 100644 index 00000000..5b5d85e7 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_unroll_width.cuh @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_unroll_width.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct IConvBlockConsumerUnrollWidth; + +template +struct IConvBlockConsumerUnrollWidth { + using ThreadConfig = ThreadConfig_; + using RegBlockConfig = RegBlockConfig_; + + int32_t reg_src[RegBlockConfig::reg_n][RegBlockConfig::reg_width][2]; + int32_t reg_filter[RegBlockConfig::reg_m][2]; + int32_t reg_acc[RegBlockConfig::reg_n][RegBlockConfig::reg_width] + [RegBlockConfig::reg_m]; + + __device__ __forceinline__ void init_accumulator() { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { +#pragma unroll + for (int k = 0; k < RegBlockConfig::reg_m; ++k) { + reg_acc[i][j][k] = 0; + } + } + } + } + + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + using smem_storage_dtype = + typename DataGlobal2ShareMemVisitor::smem_storage_dtype; + static bool const use_wide_store = !(RegBlockConfig::reg_n & 0x1); + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (RegBlockConfig::reg_n >> 1); ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { + int i2 = (i << 1); + int tidx2 = (tidx << 1); + reg_src[i2][j][0] = *(data_gl2sh_visitor.sh_ptr( + 0, j, tidx2 + i2 * ThreadConfig::nr_thread_x)); + reg_src[i2 + 1][j][0] = *(data_gl2sh_visitor.sh_ptr( + 0, j, tidx2 + i2 * ThreadConfig::nr_thread_x + 1)); + } + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { + reg_src[i][j][0] = *(data_gl2sh_visitor.sh_ptr( + 0, j, tidx + i * ThreadConfig::nr_thread_x)); + } + } + } +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m_packed; ++j) { + int32_t* ker_sh_ptr = filter_gl2sh_visitor.sh_ptr( + 0, tidy * RegBlockConfig::pack_size + + j * ThreadConfig::nr_thread_y * + RegBlockConfig::pack_size); +#pragma unroll + for (int packed = 0; packed < RegBlockConfig::pack_size; ++packed) { + reg_filter[j * RegBlockConfig::pack_size + packed][0] = + *(ker_sh_ptr++); + } + } + +#pragma unroll + for (int ci_inner = 0; ci_inner < RegBlockConfig::reg_k_packed; + ++ci_inner) { + const int comp_idx = (ci_inner & 0x1); + const int load_idx = 1 - comp_idx; + if (ci_inner < RegBlockConfig::reg_k_packed - 1) { + int32_t* ker_sh_ptr = + filter_gl2sh_visitor.sh_ptr(ci_inner + 1, 0); + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (RegBlockConfig::reg_n >> 1); ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { + int i2 = (i << 1); + int tidx2 = (tidx << 1); + reg_src[i2][j] + [load_idx] = *(data_gl2sh_visitor.sh_ptr( + ci_inner + 1, j, + tidx2 + i2 * ThreadConfig:: + nr_thread_x)); + reg_src[i2 + 1][j] + [load_idx] = *(data_gl2sh_visitor.sh_ptr( + ci_inner + 1, j, + tidx2 + + i2 * ThreadConfig:: + nr_thread_x + + 1)); + } + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { + reg_src[i][j] + [load_idx] = *(data_gl2sh_visitor.sh_ptr( + ci_inner + 1, j, + tidx + i * ThreadConfig:: + nr_thread_x)); + } + } + } +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m_packed; ++j) { + int32_t* ker_sh_ptr_packed = + &ker_sh_ptr[(tidy + j * ThreadConfig::nr_thread_y) * 
+ RegBlockConfig::pack_size]; +#pragma unroll + for (int packed = 0; packed < RegBlockConfig::pack_size; + ++packed) { + reg_filter[j * RegBlockConfig::pack_size + packed] + [load_idx] = *(ker_sh_ptr_packed++); + } + } + } +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { +#pragma unroll + for (int k = 0; k < RegBlockConfig::reg_m; ++k) { + dot_prod(reg_src[i][j][comp_idx], + reg_filter[k][comp_idx], reg_acc[i][j][k], + reg_acc[i][j][k]); + } + } + } + } + } +}; + +template +struct IConvBlockConsumerUnrollWidth { + using ThreadConfig = ThreadConfig_; + using RegBlockConfig = RegBlockConfig_; + + int32_t reg_src[RegBlockConfig::reg_n][RegBlockConfig::reg_width]; + int32_t reg_filter[RegBlockConfig::reg_m]; + int32_t reg_acc[RegBlockConfig::reg_n][RegBlockConfig::reg_width] + [RegBlockConfig::reg_m]; + + __device__ __forceinline__ void init_accumulator() { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { +#pragma unroll + for (int k = 0; k < RegBlockConfig::reg_m; ++k) { + reg_acc[i][j][k] = 0; + } + } + } + } + + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + using smem_storage_dtype = + typename DataGlobal2ShareMemVisitor::smem_storage_dtype; + static bool const use_wide_store = !(RegBlockConfig::reg_n & 0x1); + +#pragma unroll + for (int ci_inner = 0; ci_inner < RegBlockConfig::reg_k_packed; + ++ci_inner) { + int32_t* ker_sh_ptr = filter_gl2sh_visitor.sh_ptr(ci_inner, 0); + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (RegBlockConfig::reg_n >> 1); ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { + int i2 = (i << 1); + int tidx2 = (tidx << 1); + reg_src[i2][j] = *(data_gl2sh_visitor.sh_ptr( + ci_inner, j, + tidx2 + i2 * ThreadConfig::nr_thread_x)); + reg_src[i2 + 1][j] = *(data_gl2sh_visitor.sh_ptr( + ci_inner, j, + tidx2 + i2 * ThreadConfig::nr_thread_x + 1)); + } + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { + reg_src[i][j] = *(data_gl2sh_visitor.sh_ptr( + ci_inner, j, + tidx + i * ThreadConfig::nr_thread_x)); + } + } + } +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_m_packed; ++j) { + int32_t* ker_sh_ptr_packed = + &ker_sh_ptr[(tidy + j * ThreadConfig::nr_thread_y) * + RegBlockConfig::pack_size]; +#pragma unroll + for (int packed = 0; packed < RegBlockConfig::pack_size; + ++packed) { + reg_filter[j * RegBlockConfig::pack_size + packed] = + *(ker_sh_ptr_packed++); + } + } +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_n; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { +#pragma unroll + for (int k = 0; k < RegBlockConfig::reg_m; ++k) { + dot_prod(reg_src[i][j], reg_filter[k], reg_acc[i][j][k], + reg_acc[i][j][k]); + } + } + } + } + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer.cuh b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer.cuh new file mode 100644 index 00000000..bb2fb89e --- /dev/null +++ 
b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer.cuh @@ -0,0 +1,284 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
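The IConvBlockConsumerUnrollWidth consumers above accumulate with dot_prod(), called on two registers that each hold a pack_size = 4 group of int8 values plus an int32 accumulator. Its real definition lives elsewhere in these helpers; assuming it wraps the sm_61 __dp4a intrinsic, a minimal stand-in could look like the sketch below (the name dot_prod_sketch and the pre-sm_61 fallback are illustrative only, not the library's actual code):

#include <cstdint>

__device__ __forceinline__ void dot_prod_sketch(int32_t src, int32_t filter,
                                                int32_t acc_in, int32_t& acc_out) {
#if __CUDA_ARCH__ >= 610
    // Four int8 x int8 products summed into the int32 accumulator in one instruction.
    acc_out = __dp4a(src, filter, acc_in);
#else
    // Scalar fallback: unpack the four signed bytes and accumulate by hand.
    int32_t sum = acc_in;
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        int8_t a = static_cast<int8_t>((src >> (8 * i)) & 0xff);
        int8_t b = static_cast<int8_t>((filter >> (8 * i)) & 0xff);
        sum += static_cast<int32_t>(a) * static_cast<int32_t>(b);
    }
    acc_out = sum;
#endif
}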
+ */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct IConvIMMABlockConsumer; + +template +struct IConvIMMABlockConsumer { + using IMMAConfig = IMMAConfig_; + using WarpTileConfig = WarpTileConfig_; + using ThreadConfig = ThreadConfig_; + +#if __CUDA_ARCH__ >= 730 + typename IMMAConfig::fragment_b frag_src[WarpTileConfig::warp_tile_n][2]; + typename IMMAConfig::fragment_a frag_filter[WarpTileConfig::warp_tile_m][2]; + typename IMMAConfig::fragment_c frag_acc[WarpTileConfig::warp_tile_m] + [WarpTileConfig::warp_tile_n]; +#endif + + __device__ __forceinline__ void init_accumulator() { +#if __CUDA_ARCH__ >= 730 +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_n; ++j) { + wmma::fill_fragment(frag_acc[i][j], 0.f); + } + } +#endif + } + + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor) { +#if __CUDA_ARCH__ >= 730 + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int warpx = tidx / ThreadConfig::warp_size; + const int warpy = tidy; + + static bool const use_wide_store = !(WarpTileConfig::warp_tile_n & 0x1); + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (WarpTileConfig::warp_tile_n >> 1); ++i) { + int i2 = (i << 1); + int warpx2 = (warpx << 1); + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + 0, (warpx2 + i2 * ThreadConfig::nr_warp_x) * + IMMAConfig::tile_b_sizes_int); + wmma::load_matrix_sync(frag_src[i2][0], + reinterpret_cast(data_sh_ptr), + IMMAConfig::wmma_k); + wmma::load_matrix_sync( + frag_src[i2 + 1][0], + reinterpret_cast(data_sh_ptr + + IMMAConfig::tile_b_sizes_int), + IMMAConfig::wmma_k); + } + } else { +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_n; ++i) { + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + 0, (warpx + i * ThreadConfig::nr_warp_x) * + IMMAConfig::tile_b_sizes_int); + wmma::load_matrix_sync(frag_src[i][0], + reinterpret_cast(data_sh_ptr), + IMMAConfig::wmma_k); + } + } +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_m; ++j) { + int32_t* ker_sh_ptr = filter_gl2sh_visitor.sh_ptr( + 0, (warpy + j * ThreadConfig::nr_warp_y) * + IMMAConfig::tile_a_sizes_int); + wmma::load_matrix_sync(frag_filter[j][0], + reinterpret_cast(ker_sh_ptr), + IMMAConfig::wmma_k); + } + +#pragma unroll + for (int ci_inner = 0; ci_inner < WarpTileConfig::warp_tile_k; + ++ci_inner) { + const int comp_idx = (ci_inner & 0x1); + const int load_idx = 1 - comp_idx; + if (ci_inner < WarpTileConfig::warp_tile_k - 1) { + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (WarpTileConfig::warp_tile_n >> 1); + ++i) { + int i2 = (i << 1); + int warpx2 = (warpx << 1); + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + ci_inner + 1, + (warpx2 + i2 * ThreadConfig::nr_warp_x) * + IMMAConfig::tile_b_sizes_int); + wmma::load_matrix_sync( + frag_src[i2][load_idx], + reinterpret_cast(data_sh_ptr), + IMMAConfig::wmma_k); + wmma::load_matrix_sync( + frag_src[i2 + 1][load_idx], + reinterpret_cast( + data_sh_ptr + + IMMAConfig::tile_b_sizes_int), + IMMAConfig::wmma_k); + } + } else { +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_n; ++i) { + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + ci_inner + 1, + (warpx + i * ThreadConfig::nr_warp_x) * + IMMAConfig::tile_b_sizes_int); + wmma::load_matrix_sync( + frag_src[i][load_idx], + 
reinterpret_cast(data_sh_ptr), + IMMAConfig::wmma_k); + } + } +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_m; ++j) { + int32_t* ker_sh_ptr = filter_gl2sh_visitor.sh_ptr( + ci_inner + 1, + (warpy + j * ThreadConfig::nr_warp_y) * + IMMAConfig::tile_a_sizes_int); + wmma::load_matrix_sync( + frag_filter[j][load_idx], + reinterpret_cast(ker_sh_ptr), + IMMAConfig::wmma_k); + } + } // end if use_wide_store +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_n; ++j) { + wmma::mma_sync(frag_acc[i][j], frag_filter[i][comp_idx], + frag_src[j][comp_idx], frag_acc[i][j]); + } + } + } // end ci_inner +#endif + } +}; + +template +struct IConvIMMABlockConsumer { + using IMMAConfig = IMMAConfig_; + using WarpTileConfig = WarpTileConfig_; + using ThreadConfig = ThreadConfig_; + +#if __CUDA_ARCH__ >= 730 + typename IMMAConfig::fragment_b frag_src[WarpTileConfig::warp_tile_n]; + typename IMMAConfig::fragment_a frag_filter[WarpTileConfig::warp_tile_m]; + typename IMMAConfig::fragment_c frag_acc[WarpTileConfig::warp_tile_m] + [WarpTileConfig::warp_tile_n]; +#endif + + __device__ __forceinline__ void init_accumulator() { +#if __CUDA_ARCH__ >= 730 +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_n; ++j) { + wmma::fill_fragment(frag_acc[i][j], 0.f); + } + } +#endif + } + + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor) { +#if __CUDA_ARCH__ >= 730 + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int warpx = tidx / ThreadConfig::warp_size; + const int warpy = tidy; + + static bool const use_wide_store = !(WarpTileConfig::warp_tile_n & 0x1); +#pragma unroll + for (int ci_inner = 0; ci_inner < WarpTileConfig::warp_tile_k; + ++ci_inner) { + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < (WarpTileConfig::warp_tile_n >> 1); ++i) { + int i2 = (i << 1); + int warpx2 = (warpx << 1); + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + ci_inner, (warpx2 + i2 * ThreadConfig::nr_warp_x) * + IMMAConfig::tile_b_sizes_int); + wmma::load_matrix_sync( + frag_src[i2], + reinterpret_cast(data_sh_ptr), + IMMAConfig::wmma_k); + wmma::load_matrix_sync( + frag_src[i2 + 1], + reinterpret_cast( + data_sh_ptr + IMMAConfig::tile_b_sizes_int), + IMMAConfig::wmma_k); + } + } else { +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_n; ++i) { + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + ci_inner, (warpx + i * ThreadConfig::nr_warp_x) * + IMMAConfig::tile_b_sizes_int); + wmma::load_matrix_sync( + frag_src[i], reinterpret_cast(data_sh_ptr), + IMMAConfig::wmma_k); + } + } // end if use_wide_store +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_m; ++j) { + int32_t* ker_sh_ptr = filter_gl2sh_visitor.sh_ptr( + ci_inner, (warpy + j * ThreadConfig::nr_warp_y) * + IMMAConfig::tile_a_sizes_int); + wmma::load_matrix_sync(frag_filter[j], + reinterpret_cast(ker_sh_ptr), + IMMAConfig::wmma_k); + } +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_n; ++j) { + wmma::mma_sync(frag_acc[i][j], frag_filter[i], frag_src[j], + frag_acc[i][j]); + } + } + } // end for ci_inner +#endif + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen 
foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer_unroll_width.cuh b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer_unroll_width.cuh new file mode 100644 index 00000000..be101be3 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer_unroll_width.cuh @@ -0,0 +1,199 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_consumer/iconv_imma_block_consumer_unroll_width.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
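The IConvIMMABlockConsumer above tiles each warp over wmma fragments, and one warp-level step reduces to a load/load/mma sequence on 16x16x16 s8 operands staged in shared memory. A self-contained sketch of that single step follows; the fragment shapes and row/column layouts are assumptions for illustration (in the real code they come from IMMAConfig), and only the wmma calls themselves mirror the consumer:

#include <mma.h>
using namespace nvcuda;

// One warp computes a 16x16 int32 tile from 16x16 int8 operands in shared memory;
// this is the primitive that consume_block() repeats for every k-slice.
__device__ __forceinline__ void imma_tile_step_sketch(const signed char* filter_smem,
                                                      const signed char* data_smem,
                                                      int* out, int ldo) {
#if __CUDA_ARCH__ >= 720  // int8 wmma needs sm_72; the consumers above guard at 730
    wmma::fragment<wmma::matrix_a, 16, 16, 16, signed char, wmma::row_major> frag_a;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, signed char, wmma::col_major> frag_b;
    wmma::fragment<wmma::accumulator, 16, 16, 16, int> frag_c;
    wmma::fill_fragment(frag_c, 0);
    wmma::load_matrix_sync(frag_a, filter_smem, 16);  // leading dimension = wmma_k
    wmma::load_matrix_sync(frag_b, data_smem, 16);
    wmma::mma_sync(frag_c, frag_a, frag_b, frag_c);
    wmma::store_matrix_sync(out, frag_c, ldo, wmma::mem_row_major);
#endif
}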
+ */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct IConvIMMABlockConsumerUnrollWidth { + using Conv1dConfig = Conv1dConfig_; + using IMMAConfig = IMMAConfig_; + using WarpTileConfig = WarpTileConfig_; + using ThreadConfig = ThreadConfig_; + +#if __CUDA_ARCH__ >= 730 + typename IMMAConfig::fragment_b frag_src[WarpTileConfig::warp_tile_n][2]; + typename IMMAConfig::fragment_a frag_filter[WarpTileConfig::warp_tile_m][2]; + typename IMMAConfig::fragment_c frag_acc[WarpTileConfig::warp_tile_m] + [WarpTileConfig::warp_tile_n]; +#endif + + __device__ __forceinline__ void init_accumulator() { +#if __CUDA_ARCH__ >= 730 +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_n; ++j) { + wmma::fill_fragment(frag_acc[i][j], 0.f); + } + } +#endif + } + +#if __CUDA_ARCH__ >= 730 + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int warpx = tidx / ThreadConfig::warp_size; + const int warpy = tidy; + + static bool const consecutive_width_tile = + !(WarpTileConfig::warp_tile_n & 0x1); + if (consecutive_width_tile) { +#pragma unroll + for (int i = 0; i < (WarpTileConfig::warp_tile_n >> 1); ++i) { + int i2 = (i << 1); + int warpx2 = (warpx << 1); + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + (warpx2 + i2 * ThreadConfig::nr_warp_x) * + Conv1dConfig::sw, + 0); + wmma::load_matrix_sync(frag_src[i2][0], + reinterpret_cast(data_sh_ptr), + IMMAConfig::wmma_k); + wmma::load_matrix_sync( + frag_src[i2 + 1][0], + reinterpret_cast( + data_sh_ptr + + Conv1dConfig::sw * + IMMAConfig::tile_b_sizes_int), + IMMAConfig::wmma_k); + } + } else { +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_n; ++i) { + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + (warpx + i * ThreadConfig::nr_warp_x) * + Conv1dConfig::sw, + 0); + wmma::load_matrix_sync(frag_src[i][0], + reinterpret_cast(data_sh_ptr), + IMMAConfig::wmma_k); + } + } +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_m; ++j) { + int32_t* ker_sh_ptr = filter_gl2sh_visitor.sh_ptr( + 0, (warpy + j * ThreadConfig::nr_warp_y) * + IMMAConfig::tile_a_sizes_int); + wmma::load_matrix_sync(frag_filter[j][0], + reinterpret_cast(ker_sh_ptr), + IMMAConfig::wmma_k); + } + +#pragma unroll + for (int kw = 0; kw < Conv1dConfig::fw; ++kw) { + const int comp_idx = (kw & 0x1); + const int load_idx = 1 - comp_idx; + if (kw != Conv1dConfig::fw - 1) { + if (consecutive_width_tile) { +#pragma unroll + for (int i = 0; i < (WarpTileConfig::warp_tile_n >> 1); + ++i) { + int i2 = (i << 1); + int warpx2 = (warpx << 1); + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + (warpx2 + i2 * ThreadConfig::nr_warp_x) * + Conv1dConfig::sw + + kw + 1, + 0); + wmma::load_matrix_sync( + frag_src[i2][load_idx], + reinterpret_cast(data_sh_ptr), + IMMAConfig::wmma_k); + wmma::load_matrix_sync( + frag_src[i2 + 1][load_idx], + reinterpret_cast( + data_sh_ptr + + Conv1dConfig::sw * + IMMAConfig::tile_b_sizes_int), + IMMAConfig::wmma_k); + } + } else { +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_n; ++i) { + int32_t* data_sh_ptr = data_gl2sh_visitor.sh_ptr( + (warpx + i * ThreadConfig::nr_warp_x) * + Conv1dConfig::sw + + kw + 1, + 0); + wmma::load_matrix_sync( + frag_src[i][load_idx], + reinterpret_cast(data_sh_ptr), 
+ IMMAConfig::wmma_k); + } + } +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_m; ++j) { + int32_t* ker_sh_ptr = filter_gl2sh_visitor.sh_ptr( + kw + 1, (warpy + j * ThreadConfig::nr_warp_y) * + IMMAConfig::tile_a_sizes_int); + wmma::load_matrix_sync( + frag_filter[j][load_idx], + reinterpret_cast(ker_sh_ptr), + IMMAConfig::wmma_k); + } + } // end if ci_inner +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma unroll + for (int j = 0; j < WarpTileConfig::warp_tile_n; ++j) { + wmma::mma_sync(frag_acc[i][j], frag_filter[i][comp_idx], + frag_src[j][comp_idx], frag_acc[i][j]); + } + } + } // end for kw + } +#else + template + __device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor /* data_gl2sh_visitor */, + FilterGlobal2ShareMemVisitor /* filter_gl2sh_visitor */) {} +#endif +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator.cuh new file mode 100644 index 00000000..46cf16ad --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator.cuh @@ -0,0 +1,40 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
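All of the pipelined consumers above (the dp4a and the IMMA variants alike) rely on the same comp_idx / load_idx ping-pong: the register slice for step k is consumed while the slice for step k+1 is read from shared memory, so shared-memory latency overlaps with the math. Stripped of the convolution indexing, the pattern is just the following; the flat int32 reduction is a stand-in for the real per-slice loads and dot products:

#include <cstdint>

template <int K>
__device__ __forceinline__ int32_t pipelined_reduce_sketch(const int32_t* smem) {
    int32_t buf[2];        // ping-pong register buffer
    int32_t acc = 0;
    buf[0] = smem[0];      // prologue: stage slice 0 before the loop
#pragma unroll
    for (int k = 0; k < K; ++k) {
        const int comp_idx = k & 0x1;
        const int load_idx = 1 - comp_idx;
        if (k < K - 1) {
            buf[load_idx] = smem[k + 1];  // prefetch slice k+1 ...
        }
        acc += buf[comp_idx];             // ... while consuming slice k
    }
    return acc;
}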
+ */ +#pragma once +#include "src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh" +#include "src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh" +#include "src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh" + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh new file mode 100644 index 00000000..2b3764d7 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh @@ -0,0 +1,177 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct BlockTileIteratorBasic { + using DataTileCount = DataTileCount_; + using FilterTileCount = FilterTileCount_; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + int block_batch; + int block_out_channel; + int block_out_height; + int block_out_width; + int block_batch_remain; + int block_out_channel_remain; + + template + __device__ __forceinline__ void init_with_param(const Param& param) { + block_out_height = bidx / param.wo; + block_out_width = bidx - param.wo * block_out_height; + block_out_channel = bidz * FilterTileCount::block_tile_out_channel; + block_batch = bidy * DataTileCount::block_tile_batch; + block_batch_remain = param.n - block_batch; + block_out_channel_remain = param.co - block_out_channel; + } + + template + __device__ __forceinline__ void set_remain( + DataGlobal2ShareMemVisitor& src_gl2sh_visitor, + FilterGlobal2ShareMemVisitor& filter_gl2sh_visitor) { + src_gl2sh_visitor.remain = block_batch_remain; + filter_gl2sh_visitor.remain = block_out_channel_remain; + } + + template + __device__ __forceinline__ void set_remain( + GlobalMemoryWriter& global_memory_writer) { + global_memory_writer.block_batch_remain = block_batch_remain; + global_memory_writer.block_out_channel_remain = + block_out_channel_remain; + } + + template + __device__ __forceinline__ void iterate_with_param( + const src_dtype* __restrict__ src, + const filter_dtype* __restrict__ filter, const Param& param, + DataGlobal2ShareMemVisitor src_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor, + BlockConsumer& consumer) { + InputLayout src_layout; + KernLayout filter_layout; + src_layout.init(param.n, param.ci, param.hi, param.wi); + filter_layout.init(param.co, param.ci, param.fh, param.fw); + const src_dtype* __restrict__ g_src_ptr = + src + src_layout.offset(block_batch, 0, 0, 0); + const filter_dtype* __restrict__ g_filter_ptr = + filter + filter_layout.offset(block_out_channel, 0, 0, 0); + src_gl2sh_visitor.init_stride(src_layout); + filter_gl2sh_visitor.init_stride(filter_layout); + + int h_base = block_out_height * param.sh - param.ph; + int w_base = block_out_width * param.sw - param.pw; + int h_start = h_base >= 0 ? h_base : 0; + int w_start = w_base >= 0 ? w_base : 0; + int h_end = h_base + param.fh - 1; + int w_end = w_base + param.fw - 1; + h_end = h_end < param.hi ? h_end : param.hi - 1; + w_end = w_end < param.wi ? w_end : param.wi - 1; + const int ci_blks = + (param.ci + DataTileCount::block_tile_in_channel - 1) / + DataTileCount::block_tile_in_channel; + int kh = h_start - h_base; + int kw = w_start - w_base; + + src_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename DataGlobal2ShareMemVisitor::copy_t*>( + g_src_ptr + src_layout.offset(0, 0, h_start, w_start)); + filter_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename FilterGlobal2ShareMemVisitor::copy_t*>( + g_filter_ptr + filter_layout.offset(0, 0, kh, kw)); + src_gl2sh_visitor.first_copy(); + filter_gl2sh_visitor.first_copy(); + + __syncthreads(); + + for (int h = h_start; h <= h_end; ++h) { + for (int w = w_start; w <= w_end; ++w) { + for (int ci_outer = 0; ci_outer < ci_blks; ci_outer++) { + if (ci_outer == ci_blks - 1) { + if (!(h == h_end && w == w_end)) { + int w_next = w == w_end ? w_start : w + 1; + int h_next = w == w_end ? 
h + 1 : h; + int kh = h_next - h_base; + int kw = w_next - w_base; + src_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename DataGlobal2ShareMemVisitor:: + copy_t*>( + g_src_ptr + + src_layout.offset(0, 0, h_next, w_next)); + filter_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename FilterGlobal2ShareMemVisitor:: + copy_t*>( + g_filter_ptr + + filter_layout.offset(0, 0, kh, kw)); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + } else { + src_gl2sh_visitor.move_forward(); + filter_gl2sh_visitor.move_forward(); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + + consumer.template consume_block(src_gl2sh_visitor, + filter_gl2sh_visitor); + + if (!(ci_outer == ci_blks - 1 && h == h_end && + w == w_end)) { + __syncthreads(); + src_gl2sh_visitor.commit(); + filter_gl2sh_visitor.commit(); + __syncthreads(); + } + } + } + } + } +}; +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh new file mode 100644 index 00000000..5b7d5bfc --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh @@ -0,0 +1,192 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
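BlockTileIteratorBasic decodes the block indices as: blockIdx.x selects one output pixel (ho * wo of them), blockIdx.y a batch tile, and blockIdx.z an output-channel tile. The launch geometry that decoding implies can be written down directly; this is a host-side sketch inferred from init_with_param, not the repository's actual launch code:

#include <cuda_runtime.h>

namespace {
inline int div_up(int a, int b) { return (a + b - 1) / b; }
}  // namespace

// Grid implied by BlockTileIteratorBasic::init_with_param.
inline dim3 grid_for_basic_iterator_sketch(int n, int co, int ho, int wo,
                                           int block_tile_batch,
                                           int block_tile_out_channel) {
    dim3 grid;
    grid.x = ho * wo;                                  // one output pixel per block
    grid.y = div_up(n, block_tile_batch);              // batch tiles
    grid.z = div_up(co, block_tile_out_channel);       // output-channel tiles
    return grid;
}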
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/convolution_helper/prologue.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct BlockTileIterator_COxHW { + using DataTileCount = DataTileCount_; + using FilterTileCount = FilterTileCount_; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + int block_batch; + int block_out_channel; + int block_out_height_width; + int block_out_height; + int block_out_width; + int block_out_channel_remain; + int block_out_height_width_remain; + + template + __device__ __forceinline__ void init_with_param(const Param& param) { + block_batch = bidz; + block_out_height_width = + bidx * DataTileCount::block_tile_out_height_width; + block_out_channel = bidy * FilterTileCount::block_tile_out_channel; + block_out_height = block_out_height_width / param.wo; + block_out_width = block_out_height_width - block_out_height * param.wo; + block_out_channel_remain = param.co - block_out_channel; + block_out_height_width_remain = + param.ho * param.wo - block_out_height_width; + } + + template + __device__ __forceinline__ void set_remain( + DataGlobal2ShareMemVisitor& src_gl2sh_visitor, + FilterGlobal2ShareMemVisitor& filter_gl2sh_visitor) { + if (!DataGlobal2ShareMemVisitor::precomp_offset) { + src_gl2sh_visitor.remain = block_out_height_width_remain; + } + filter_gl2sh_visitor.remain = block_out_channel_remain; + } + + template + __device__ __forceinline__ void set_remain( + GlobalMemoryWriter& global_memory_writer) { + global_memory_writer.block_out_channel_remain = + block_out_channel_remain; + global_memory_writer.block_out_height_width_remain = + block_out_height_width_remain; + } + + template + __device__ __forceinline__ void iterate_with_param( + const src_dtype* __restrict__ src, + const filter_dtype* __restrict__ filter, const Param& param, + DataGlobal2ShareMemVisitor src_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor, + BlockConsumer& consumer) { + Prologue::template prologue(src, filter, param, block_batch, + block_out_channel, block_out_height, + block_out_width); + static constexpr bool precomp_offset = + DataGlobal2ShareMemVisitor::precomp_offset; + InputLayout src_layout; + KernLayout filter_layout; + src_layout.init(param.n, param.ci, param.hi, param.wi); + filter_layout.init(param.co, param.ci, param.fh, param.fw); + const src_dtype* __restrict__ g_src_ptr; + if (precomp_offset) { + g_src_ptr = src + src_layout.offset(block_batch, 0, 0, 0); + } else { + g_src_ptr = + src + src_layout.offset(block_batch, 0, block_out_height, + block_out_width); + } + const filter_dtype* __restrict__ g_filter_ptr = + filter + filter_layout.offset(block_out_channel, 0, 0, 0); + + src_gl2sh_visitor.init_stride(src_layout); + filter_gl2sh_visitor.init_stride(filter_layout); + + const int ci_blks = + (param.ci + DataTileCount::block_tile_in_channel - 1) / + DataTileCount::block_tile_in_channel; + + if (precomp_offset) { + src_gl2sh_visitor.offset += block_out_height_width; + } + src_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename DataGlobal2ShareMemVisitor::copy_t*>(g_src_ptr); + filter_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename FilterGlobal2ShareMemVisitor::copy_t*>( + g_filter_ptr); + src_gl2sh_visitor.first_copy(); + 
filter_gl2sh_visitor.first_copy(); + + __syncthreads(); + + const int filter_pixels = param.fh * param.fw; + const int img_pixels = param.ho * param.wo; + + for (int f = 0; f < filter_pixels; f++) { + for (int ci_outer = 0; ci_outer < ci_blks; ci_outer++) { + if (ci_outer == ci_blks - 1) { + if (f < filter_pixels - 1) { + int f_next = f + 1; + int kh = f_next / param.fw; + int kw = f_next - kh * param.fw; + // rewind + if (precomp_offset) { + src_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename DataGlobal2ShareMemVisitor:: + copy_t*>(g_src_ptr); + src_gl2sh_visitor.offset += img_pixels; + } + filter_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename FilterGlobal2ShareMemVisitor:: + copy_t*>( + g_filter_ptr + + filter_layout.offset(0, 0, kh, kw)); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + } else { + src_gl2sh_visitor.move_forward(); + filter_gl2sh_visitor.move_forward(); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + + consumer.template consume_block(src_gl2sh_visitor, + filter_gl2sh_visitor); + + if (!(ci_outer == ci_blks - 1 && f == filter_pixels - 1)) { + __syncthreads(); + src_gl2sh_visitor.commit(); + filter_gl2sh_visitor.commit(); + __syncthreads(); + } + } + } + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh new file mode 100644 index 00000000..d93ad24c --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh @@ -0,0 +1,184 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
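BlockTileIterator_COxHW above keeps a block pinned to one (batch, output-channel tile, output-pixel tile) triple and instead walks every filter tap, revisiting the full input-channel range per tap; the flat tap index f is split into (kh, kw) exactly as in the iterator. A tiny host-side model of the traversal order, with fh, fw and ci_blks chosen only as example sizes:

#include <cstdio>

int main() {
    const int fh = 3, fw = 3, ci_blks = 2;     // example sizes only
    for (int f = 0; f < fh * fw; ++f) {
        const int kh = f / fw;                 // same decomposition as the iterator
        const int kw = f - kh * fw;
        for (int ci_outer = 0; ci_outer < ci_blks; ++ci_outer) {
            std::printf("filter tap (kh=%d, kw=%d), input-channel block %d\n",
                        kh, kw, ci_outer);
        }
    }
    return 0;
}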
+ * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct BlockTileIteratorUnrollWidth { + using DataTileCount = DataTileCount_; + using FilterTileCount = FilterTileCount_; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + int block_batch; + int block_out_channel; + int block_out_height; + int block_out_width; + int block_batch_remain; + int block_out_channel_remain; + + template + __device__ __forceinline__ void init_with_param(const Param& param) { + const int blocks_per_image_row = + (param.wo + DataTileCount::block_tile_out_width - 1) / + DataTileCount::block_tile_out_width; + block_out_height = bidx / blocks_per_image_row; + block_out_width = bidx - blocks_per_image_row * block_out_height; + block_out_width = block_out_width * DataTileCount::block_tile_out_width; + block_out_channel = bidz * FilterTileCount::block_tile_out_channel; + block_batch = bidy * DataTileCount::block_tile_batch; + block_batch_remain = param.n - block_batch; + block_out_channel_remain = param.co - block_out_channel; + } + + template + __device__ __forceinline__ void set_remain( + DataGlobal2ShareMemVisitor& src_gl2sh_visitor, + FilterGlobal2ShareMemVisitor& filter_gl2sh_visitor) { + src_gl2sh_visitor.remain = block_batch_remain; + filter_gl2sh_visitor.remain = block_out_channel_remain; + } + + template + __device__ __forceinline__ void set_remain( + GlobalMemoryWriter& global_memory_writer) { + global_memory_writer.block_batch_remain = block_batch_remain; + global_memory_writer.block_out_channel_remain = + block_out_channel_remain; + } + + template + __device__ __forceinline__ void iterate_with_param( + const src_dtype* __restrict__ src, + const filter_dtype* __restrict__ filter, const Param& param, + DataGlobal2ShareMemVisitor src_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor, + BlockConsumer& consumer) { + InputLayout src_layout; + KernLayout filter_layout; + src_layout.init(param.n, param.ci, param.hi, param.wi); + filter_layout.init(param.co, param.ci, param.fh, param.fw); + const src_dtype* __restrict__ g_src_ptr = + src + src_layout.offset(block_batch, 0, 0, 0); + const filter_dtype* __restrict__ g_filter_ptr = + filter + filter_layout.offset(block_out_channel, 0, 0, 0); + src_gl2sh_visitor.init_stride(src_layout); + filter_gl2sh_visitor.init_stride(filter_layout); + + int h_base = block_out_height * param.sh - param.ph; + int w_base = block_out_width * param.sw - param.pw; + int h_start = h_base >= 0 ? h_base : 0; + int h_end = h_base + param.fh - 1; + h_end = h_end < param.hi ? 
h_end : param.hi - 1; + int w_start = w_base; + int w_end = w_start + param.fw - 1; + const int ci_blks = + (param.ci + DataTileCount::block_tile_in_channel - 1) / + DataTileCount::block_tile_in_channel; + int kh = h_start - h_base; + + src_gl2sh_visitor.sw = param.sw; + src_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename DataGlobal2ShareMemVisitor::copy_t*>( + g_src_ptr + src_layout.offset(0, 0, h_start, w_start)); + filter_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename FilterGlobal2ShareMemVisitor::copy_t*>( + g_filter_ptr + filter_layout.offset(0, 0, kh, 0)); + src_gl2sh_visitor.set_range(-w_start, param.wi - w_start); + src_gl2sh_visitor.first_copy(); + filter_gl2sh_visitor.first_copy(); + + __syncthreads(); + + for (int h = h_start; h <= h_end; ++h) { + for (int w = w_start; w <= w_end; ++w) { + for (int ci_outer = 0; ci_outer < ci_blks; ci_outer++) { + if (ci_outer == ci_blks - 1) { + if (!(h == h_end && w == w_end)) { + int w_next = w == w_end ? w_start : w + 1; + int h_next = w == w_end ? h + 1 : h; + int kh = h_next - h_base; + int kw = w_next - w_base; + src_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename DataGlobal2ShareMemVisitor:: + copy_t*>( + g_src_ptr + + src_layout.offset(0, 0, h_next, w_next)); + filter_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename FilterGlobal2ShareMemVisitor:: + copy_t*>( + g_filter_ptr + + filter_layout.offset(0, 0, kh, kw)); + src_gl2sh_visitor.set_range(-w_next, + param.wi - w_next); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + } else { + src_gl2sh_visitor.move_forward(); + filter_gl2sh_visitor.move_forward(); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + + consumer.template consume_block(src_gl2sh_visitor, + filter_gl2sh_visitor); + + if (!(ci_outer == ci_blks - 1 && h == h_end && + w == w_end)) { + __syncthreads(); + src_gl2sh_visitor.commit(); + filter_gl2sh_visitor.commit(); + __syncthreads(); + } + } + } + } + } +}; +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh new file mode 100644 index 00000000..3dffebbd --- /dev/null +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh @@ -0,0 +1,175 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
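The unroll-width iterators differ from the basic one mainly in how blockIdx.x is decoded: each block now owns a tile of block_tile_out_width consecutive output columns rather than a single pixel. With made-up sizes, the arithmetic in init_with_param works out as follows:

constexpr int wo = 28, block_tile_out_width = 8;   // example sizes only
constexpr int blocks_per_image_row =
        (wo + block_tile_out_width - 1) / block_tile_out_width;            // 4 tiles per output row
constexpr int bidx = 9;                                                    // example blockIdx.x
constexpr int block_out_height = bidx / blocks_per_image_row;              // output row 2
constexpr int block_out_width =
        (bidx - blocks_per_image_row * block_out_height) * block_tile_out_width;  // columns 8..15
static_assert(block_out_height == 2 && block_out_width == 8,
              "blockIdx.x == 9 maps to output row 2, columns 8..15");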
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct BlockTileIteratorUnrollWidthV2 { + using DataTileCount = DataTileCount_; + using FilterTileCount = FilterTileCount_; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + int block_batch; + int block_out_channel; + int block_out_height; + int block_out_width; + int block_in_width; + int block_batch_remain; + int block_out_channel_remain; + + template + __device__ __forceinline__ void init_with_param(const Param& param) { + const int blocks_per_image_row = + (param.wo + DataTileCount::block_tile_out_width - 1) / + DataTileCount::block_tile_out_width; + block_out_height = bidx / blocks_per_image_row; + block_out_width = bidx - blocks_per_image_row * block_out_height; + block_out_width = block_out_width * DataTileCount::block_tile_out_width; + block_out_channel = bidz * FilterTileCount::block_tile_out_channel; + block_batch = bidy * DataTileCount::block_tile_batch; + block_in_width = block_out_width * param.sw - param.pw; + block_batch_remain = param.n - block_batch; + block_out_channel_remain = param.co - block_out_channel; + } + + template + __device__ __forceinline__ void set_remain( + DataGlobal2ShareMemVisitor& src_gl2sh_visitor, + FilterGlobal2ShareMemVisitor& filter_gl2sh_visitor) { + src_gl2sh_visitor.remain = block_batch_remain; + filter_gl2sh_visitor.remain = block_out_channel_remain; + } + + template + __device__ __forceinline__ void set_remain( + GlobalMemoryWriter& global_memory_writer) { + global_memory_writer.block_batch_remain = block_batch_remain; + global_memory_writer.block_out_channel_remain = + block_out_channel_remain; + } + + template + __device__ __forceinline__ void iterate_with_param( + const src_dtype* __restrict__ src, + const filter_dtype* __restrict__ filter, const Param& param, + DataGlobal2ShareMemVisitor src_gl2sh_visitor, + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor, + BlockConsumer& consumer) { + InputLayout src_layout; + KernLayout filter_layout; + src_layout.init(param.n, param.ci, param.hi, param.wi); + filter_layout.init(param.co, param.ci, param.fh, param.fw); + const 
src_dtype* __restrict__ g_src_ptr = + src + src_layout.offset(block_batch, 0, 0, block_in_width); + const filter_dtype* __restrict__ g_filter_ptr = + filter + filter_layout.offset(block_out_channel, 0, 0, 0); + src_gl2sh_visitor.init_stride(src_layout); + filter_gl2sh_visitor.init_stride(filter_layout); + + int h_base = block_out_height * param.sh - param.ph; + int h_start = h_base >= 0 ? h_base : 0; + int h_end = h_base + param.fh - 1; + h_end = h_end < param.hi ? h_end : param.hi - 1; + + const int ci_blks = + (param.ci + DataTileCount::block_tile_in_channel - 1) / + DataTileCount::block_tile_in_channel; + int kh = h_start - h_base; + + src_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename DataGlobal2ShareMemVisitor::copy_t*>( + g_src_ptr + src_layout.offset(0, 0, h_start, 0)); + filter_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename FilterGlobal2ShareMemVisitor::copy_t*>( + g_filter_ptr + filter_layout.offset(0, 0, kh, 0)); + src_gl2sh_visitor.set_range(-block_in_width, param.wi - block_in_width); + src_gl2sh_visitor.first_copy(); + filter_gl2sh_visitor.first_copy(); + + __syncthreads(); + + for (int h = h_start; h <= h_end; ++h) { + for (int ci_outer = 0; ci_outer < ci_blks; ci_outer++) { + if (ci_outer == ci_blks - 1) { + if (h != h_end) { + int h_next = h + 1; + int kh = h_next - h_base; + src_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename DataGlobal2ShareMemVisitor:: + copy_t*>( + g_src_ptr + src_layout.offset(0, 0, h_next, 0)); + filter_gl2sh_visitor.g_ptr = reinterpret_cast< + const typename FilterGlobal2ShareMemVisitor:: + copy_t*>( + g_filter_ptr + + filter_layout.offset(0, 0, kh, 0)); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + } else { + src_gl2sh_visitor.move_forward(); + filter_gl2sh_visitor.move_forward(); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + + consumer.template consume_block(src_gl2sh_visitor, + filter_gl2sh_visitor); + + if (!(ci_outer == ci_blks - 1 && h == h_end)) { + __syncthreads(); + src_gl2sh_visitor.commit(); + filter_gl2sh_visitor.commit(); + __syncthreads(); + } + } + } + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/config.cuh b/dnn/src/cuda/convolution_helper/config.cuh new file mode 100644 index 00000000..37f6f964 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/config.cuh @@ -0,0 +1,117 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/config.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include +#if CUDA_VERSION >= 10000 +#include +#endif + +namespace megdnn { +namespace cuda { +namespace convolution { +#if __CUDA_ARCH__ >= 730 +using namespace nvcuda; +#endif + +template +struct RegBlockConfig { + static int constexpr pack_size = 4; + static int constexpr pack_size_bit = 2; + static int constexpr reg_m = reg_m_; + static int constexpr reg_n = reg_n_; + static int constexpr reg_k = reg_k_; + MEGDNN_STATIC_ASSERT(reg_m % pack_size == 0, + "reg_m must be a multiple of pack_size"); + MEGDNN_STATIC_ASSERT(reg_k % pack_size == 0, + "reg_k must be a multiple of pack_size"); + static int constexpr reg_k_packed = reg_k / pack_size; + static int constexpr reg_m_packed = reg_m / pack_size; + static int constexpr reg_width = reg_width_; +}; + +template +struct ThreadConfig { + static int constexpr warp_size = 32; + static int constexpr nr_thread_x = thread_x; + static int constexpr nr_thread_y = thread_y; + static int constexpr nr_threads = nr_thread_x * nr_thread_y; + static int constexpr nr_warp_x = + !(nr_thread_x & 0x1f) ? (nr_thread_x >> 5) : 0; + static int constexpr nr_warp_y = !(nr_thread_x & 0x1f) ? nr_thread_y : 0; +}; +static int constexpr WARP_SIZE = ThreadConfig<1, 1>::warp_size; + +template +struct Conv1dConfig { + static int constexpr fw = fw_; + static int constexpr sw = sw_; +}; + +template +struct IMMAConfig { + static int constexpr wmma_m = m_; + static int constexpr wmma_n = n_; + static int constexpr wmma_k = k_; + static int constexpr tile_a_sizes_bytes = wmma_m * wmma_k; + static int constexpr tile_b_sizes_bytes = wmma_n * wmma_k; + static int constexpr tile_a_sizes_int = tile_a_sizes_bytes / 4; + static int constexpr tile_b_sizes_int = tile_b_sizes_bytes / 4; + static int constexpr tile_c_sizes_int = wmma_m * wmma_n; + static int constexpr wmma_n_bit = wmma_n == 8 ? 3 : (wmma_n == 16 ? 4 : 5); + static int constexpr wmma_m_bit = wmma_m == 8 ? 3 : (wmma_m == 16 ? 
4 : 5); +#if __CUDA_ARCH__ >= 730 + using fragment_a = wmma::fragment; + using fragment_b = wmma::fragment; + using fragment_c = + wmma::fragment; +#endif +}; + +template +struct WarpTileConfig { + static int constexpr warp_tile_m = warp_tile_m_; + static int constexpr warp_tile_n = warp_tile_n_; + static int constexpr warp_tile_k = warp_tile_k_; + static int constexpr pack_size = sizeof(int32_t) / sizeof(int8_t); + static int constexpr pack_size_bit = 2; +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/conv_trait/conv_trait.cuh b/dnn/src/cuda/convolution_helper/conv_trait/conv_trait.cuh new file mode 100644 index 00000000..80c58452 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/conv_trait/conv_trait.cuh @@ -0,0 +1,39 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/conv_trait/conv_trait.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
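The constants derived in IMMAConfig and WarpTileConfig above are easy to check by hand. Restating them for a 16x16x16 shape (the shape is one plausible instantiation, not a claim about which configuration the kernels actually use):

#include <cstdint>

constexpr int wmma_m = 16, wmma_n = 16, wmma_k = 16;           // one plausible IMMA shape
constexpr int tile_a_sizes_bytes = wmma_m * wmma_k;            // 256 int8 elements per A tile
constexpr int tile_a_sizes_int   = tile_a_sizes_bytes / 4;     // 64 int32 words in shared memory
constexpr int tile_c_sizes_int   = wmma_m * wmma_n;            // 256 int32 accumulators per tile
constexpr int pack_size = sizeof(int32_t) / sizeof(int8_t);    // 4 int8 values per 32-bit word
static_assert(tile_a_sizes_int == 64 && tile_c_sizes_int == 256 && pack_size == 4,
              "one warp-level IMMA operand tile is 64 smem words; its output is 256 accumulators");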
+ */ +#pragma once +#include "src/cuda/convolution_helper/conv_trait/iconv_imma_trait.cuh" +#include "src/cuda/convolution_helper/conv_trait/iconv_trait.cuh" + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh b/dnn/src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh new file mode 100644 index 00000000..cc86f96c --- /dev/null +++ b/dnn/src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh @@ -0,0 +1,231 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
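The DataTileCount structs in the batch-conv traits below pad each shared-memory row by skew words whenever the natural row width is even, a common trick to reduce shared-memory bank conflicts between successive rows. Restating that arithmetic with example numbers (the load width and tile sizes here are made up, not values from a tuned configuration):

constexpr int load_width  = 4;                                  // e.g. int32 words per 128-bit copy
constexpr int skew        = load_width;
constexpr int smem_w      = 32;                                 // example row width of the data tile
constexpr int smem_stride = (smem_w % 2 == 0) ? smem_w + skew : smem_w;
constexpr int smem_h      = 8;                                  // example number of rows (reg_k_packed)
constexpr int smem_tot    = smem_h * smem_stride;               // int32 words allocated for the tile
static_assert(smem_stride == 36 && smem_tot == 288,
              "an even 32-word row is padded to 36 words; 8 rows need 288 words");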
+ */ +#pragma once +#include "src/cuda/convolution_helper/block_tile_consumer/iconv_block_consumer_coxhw.cuh" +#include "src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh" +#include "src/cuda/convolution_helper/config.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixhw.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixn.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_coxci.cuh" +#include "src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_coxhw.cuh" +#include "src/cuda/convolution_helper/layout.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +#define COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM( \ + _src_dtype, _filter_dtype, _smem_storage_dtype, _input_layout, \ + _kern_layout, _output_layout, _conv_param) \ + using src_dtype = _src_dtype; \ + using filter_dtype = _filter_dtype; \ + using smem_storage_dtype = _smem_storage_dtype; \ + using InputLayout = _input_layout; \ + using KernLayout = _kern_layout; \ + using OutputLayout = _output_layout; \ + using Param = _conv_param; \ + static constexpr bool check_bounds = check_bounds_; +#define MEGDNN_COMMA , + +template +struct IBatchConvTrait_f1x1s1x1 { + COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, + Layout, + Layout, + Layout, + ConvParam); + using RegBlockConfig = RegBlockConfig_; + using ThreadConfig = ThreadConfig_; + struct DataTileCount { + using RegBlockConfig = RegBlockConfig; + using ThreadConfig = ThreadConfig; + using copy_t = src_ldg_dtype; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(src_dtype); + static int constexpr skew = load_width; + static int constexpr block_tile_batch = RegBlockConfig::reg_n; + MEGDNN_STATIC_ASSERT( + block_tile_batch == 1, + "this algorithm does not unroll on batch dimension"); + static int constexpr block_tile_out_height_width = + RegBlockConfig::reg_width * ThreadConfig::nr_thread_x; + static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; + + static int constexpr smem_load_x = + block_tile_out_height_width / load_width; + static int constexpr load_x = + smem_load_x > WARP_SIZE ? WARP_SIZE : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + static int constexpr smem_h = RegBlockConfig::reg_k_packed; + static int constexpr smem_w = block_tile_out_height_width; + static int constexpr smem_stride = + smem_w % 2 == 0 ? 
smem_w + skew : smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + struct FilterTileCount { + using RegBlockConfig = RegBlockConfig; + using ThreadConfig = ThreadConfig; + using copy_t = filter_ldg_dtype; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(filter_dtype); + static int constexpr skew = load_width; + static int constexpr block_tile_out_channel = + RegBlockConfig::reg_m * ThreadConfig::nr_thread_y; + static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; + + static int constexpr smem_load_x = + RegBlockConfig::reg_k_packed / load_width; + static int constexpr load_x = + smem_load_x > WARP_SIZE ? WARP_SIZE : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + static int constexpr smem_h = block_tile_out_channel; + static int constexpr smem_w = RegBlockConfig::reg_k_packed; + static int constexpr smem_stride = smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + using BlockTileIterator = + BlockTileIterator_COxHW; + using DataGlobal2ShareMemVisitor = + Global2ShareMemVisitor_CIxHW; + using FilterGlobal2ShareMemVisitor = + Global2ShareMemVisitor_COxCI; + static bool constexpr pipelined = RegBlockConfig::reg_k_packed > 1; + using BlockConsumer = + IConvBlockConsumer_COxHW; + using GlobalMemoryWriter = + IConvGlobalMemoryWriter_COxHW; +}; + +template +struct IBatchConvTrait { + COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, + Layout, + Layout, + Layout, + ConvParam); + using RegBlockConfig = RegBlockConfig_; + using ThreadConfig = ThreadConfig_; + struct DataTileCount { + using RegBlockConfig = RegBlockConfig; + using ThreadConfig = ThreadConfig; + using copy_t = int32_t; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = 4; + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(src_dtype); + static int constexpr skew = load_width; + static int constexpr block_tile_batch = RegBlockConfig::reg_n; + MEGDNN_STATIC_ASSERT( + block_tile_batch == 1, + "this algorithm does not unroll on batch dimension"); + static int constexpr block_tile_out_height_width = + RegBlockConfig::reg_width * ThreadConfig::nr_thread_x; + static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; + + static int constexpr smem_load_x = + DIVUP(block_tile_out_height_width, load_width); + static int constexpr load_x = + smem_load_x > WARP_SIZE ? WARP_SIZE : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + static int constexpr smem_h = RegBlockConfig::reg_k_packed; + static int constexpr smem_w = smem_load_x * load_width; + static int constexpr smem_stride = + smem_w % 2 == 0 ? 
smem_w + skew : smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + static int constexpr reg_d = load_width; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + using FilterTileCount = + typename IBatchConvTrait_f1x1s1x1::FilterTileCount; + + using BlockTileIterator = + BlockTileIterator_COxHW; + using DataGlobal2ShareMemVisitor = + Global2ShareMemVisitor_CIxHW; + using FilterGlobal2ShareMemVisitor = + Global2ShareMemVisitor_COxCI; + static bool constexpr pipelined = RegBlockConfig::reg_k_packed > 1; + using BlockConsumer = + IConvBlockConsumer_COxHW; + using GlobalMemoryWriter = + IConvGlobalMemoryWriter_COxHW; +}; + +#undef COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM +#undef MEGDNN_COMMA + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/conv_trait/iconv_imma_trait.cuh b/dnn/src/cuda/convolution_helper/conv_trait/iconv_imma_trait.cuh new file mode 100644 index 00000000..bc86c123 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/conv_trait/iconv_imma_trait.cuh @@ -0,0 +1,480 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/conv_trait/iconv_imma_trait.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
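// Minimal sketch (hypothetical numbers, standalone) of how the *TileCount structs above
// distribute one shared-memory tile over the thread block: DIVUP-style rounding derives
// the per-thread load counts reg_h x reg_w from the 2D thread arrangement load_y x load_x.
constexpr int divup_example(int a, int b) { return (a + b - 1) / b; }    // same formula as DIVUP
constexpr int warp_size_example = 32;                                     // same value as WARP_SIZE
constexpr int nr_threads_example = 128;                                   // assumed block size
constexpr int smem_h_example = 8;                                         // packed-channel rows of the tile
constexpr int smem_load_x_example = divup_example(64, 4);                 // 64 int8 columns moved as int32 vectors
constexpr int load_x_example =
        smem_load_x_example > warp_size_example ? warp_size_example : smem_load_x_example;
constexpr int load_y_example = nr_threads_example / load_x_example;
constexpr int reg_h_example = divup_example(smem_h_example, load_y_example);
constexpr int reg_w_example = divup_example(smem_load_x_example, load_x_example);
static_assert(reg_h_example * reg_w_example * nr_threads_example >=
                      smem_h_example * smem_load_x_example,
              "per-thread loads must cover the whole shared-memory tile");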
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/convolution_helper/block_tile_consumer/block_consumer.cuh" +#include "src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor.cuh" +#include "src/cuda/convolution_helper/global_memory_writer/global_memory_writer.cuh" +#include "src/cuda/convolution_helper/layout.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +#define COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM( \ + _src_dtype, _filter_dtype, _smem_storage_dtype, _input_layout, \ + _kern_layout, _output_layout, _conv_param) \ + using src_dtype = _src_dtype; \ + using filter_dtype = _filter_dtype; \ + using smem_storage_dtype = _smem_storage_dtype; \ + using InputLayout = _input_layout; \ + using KernLayout = _kern_layout; \ + using OutputLayout = _output_layout; \ + using Param = _conv_param; \ + static constexpr bool check_bounds = check_bounds_; +#define MEGDNN_COMMA , + +template +struct IConvIMMATrait { + COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, + Layout, + Layout, + Layout, + ConvParam); + using IMMAConfig = IMMAConfig_; + using WarpTileConfig = WarpTileConfig_; + using ThreadConfig = ThreadConfig_; + struct DataTileCount { + using IMMAConfig = IMMAConfig; + using WarpTileConfig = WarpTileConfig; + using ThreadConfig = ThreadConfig; + using copy_t = int32_t; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(src_dtype); + static int constexpr block_tile_batch = WarpTileConfig::warp_tile_n * + IMMAConfig::wmma_n * + ThreadConfig::nr_warp_x; + static int constexpr block_tile_in_channel = + WarpTileConfig::warp_tile_k * IMMAConfig::wmma_k; + + static int constexpr smem_load_x = block_tile_batch / load_width; + static int constexpr load_x = smem_load_x > 32 ? 
32 : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + // smem col major + static int constexpr smem_h = WarpTileConfig::warp_tile_k; + static int constexpr smem_w = IMMAConfig::tile_b_sizes_int * + WarpTileConfig::warp_tile_n * + ThreadConfig::nr_warp_x; + static int constexpr smem_stride = smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + static int constexpr reg_d = + IMMAConfig::wmma_k / WarpTileConfig::pack_size; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + struct FilterTileCount { + using IMMAConfig = IMMAConfig; + using WarpTileConfig = WarpTileConfig; + using ThreadConfig = ThreadConfig; + using copy_t = int32_t; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(filter_dtype); + static int constexpr block_tile_out_channel = + WarpTileConfig::warp_tile_m * IMMAConfig::wmma_m * + ThreadConfig::nr_warp_y; + static int constexpr block_tile_in_channel = + WarpTileConfig::warp_tile_k * IMMAConfig::wmma_k; + + static int constexpr smem_load_x = block_tile_out_channel / load_width; + static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + // smem col major + static int constexpr smem_h = WarpTileConfig::warp_tile_k; + static int constexpr smem_w = IMMAConfig::tile_a_sizes_int * + WarpTileConfig::warp_tile_m * + ThreadConfig::nr_warp_y; + static int constexpr smem_stride = smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + static int constexpr reg_d = + IMMAConfig::wmma_k / WarpTileConfig::pack_size; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + struct GlobalMemoryStoreCount { + using IMMAConfig = IMMAConfig; + using WarpTileConfig = WarpTileConfig; + using ThreadConfig = ThreadConfig; + using copy_t = int4; + static int constexpr smem_h = ThreadConfig::nr_warp_y; + static int constexpr smem_w = + (WarpTileConfig::warp_tile_n & 0x1) + ? ThreadConfig::nr_warp_x * IMMAConfig::wmma_m * + IMMAConfig::wmma_n + : 2 * ThreadConfig::nr_warp_x * IMMAConfig::wmma_m * + IMMAConfig::wmma_n; + static int constexpr store_width = sizeof(copy_t) / sizeof(int32_t); + static int constexpr smem_stride = smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr store_x = + (WarpTileConfig::warp_tile_n & 0x1) + ? 
IMMAConfig::wmma_n / store_width + : 2 * IMMAConfig::wmma_n / store_width; + static int constexpr store_y = ThreadConfig::warp_size / store_x; + }; + + using BlockTileIterator = + BlockTileIteratorBasic; + using DataGlobal2ShareMemVisitor = + Global2ShareMemVisitorIMMA_CIxN; + using FilterGlobal2ShareMemVisitor = + Global2ShareMemVisitorIMMA_CIxN; + static bool constexpr pipelined = WarpTileConfig::warp_tile_k > 1; + using BlockConsumer = IConvIMMABlockConsumer; + using GlobalMemoryWriter = + IConvIMMAGlobalMemoryWriter; +}; + +template +struct IConvIMMATraitReorderFilter { + COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, + Layout, + Layout, + Layout, + ConvParam); + using IMMAConfig = IMMAConfig_; + using WarpTileConfig = WarpTileConfig_; + using ThreadConfig = ThreadConfig_; + MEGDNN_STATIC_ASSERT( + std::is_same:: + src_dtype MEGDNN_COMMA src_dtype>::value == + true, + "data type of input tensor should be int8_t"); + using DataTileCount = + typename IConvIMMATrait::DataTileCount; + struct FilterTileCount { + using IMMAConfig = IMMAConfig; + using WarpTileConfig = WarpTileConfig; + using ThreadConfig = ThreadConfig; + using copy_t = int4; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(filter_dtype); + static int constexpr block_tile_out_channel = + WarpTileConfig::warp_tile_m * IMMAConfig::wmma_m * + ThreadConfig::nr_warp_y; + static int constexpr block_tile_in_channel = + WarpTileConfig::warp_tile_k * IMMAConfig::wmma_k; + + static int constexpr smem_load_x = block_tile_out_channel; + static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + // smem col major + static int constexpr smem_h = WarpTileConfig::warp_tile_k; + static int constexpr smem_w = IMMAConfig::tile_a_sizes_int * + WarpTileConfig::warp_tile_m * + ThreadConfig::nr_warp_y; + static int constexpr smem_stride = smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + using BlockTileIterator = + BlockTileIteratorBasic; + using DataGlobal2ShareMemVisitor = + Global2ShareMemVisitorIMMA_CIxN; + using FilterGlobal2ShareMemVisitor = + Global2ShareMemVisitorIMMA_CIxN; + static bool constexpr pipelined = WarpTileConfig::warp_tile_k > 1; + using BlockConsumer = IConvIMMABlockConsumer; + using GlobalMemoryStoreCount = + typename IConvIMMATrait::GlobalMemoryStoreCount; + using GlobalMemoryWriter = + IConvIMMAGlobalMemoryWriter; +}; + +template +struct IConvIMMATraitUnrollWidth { + COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, + Layout, + Layout, + Layout, + ConvParam); + using IMMAConfig = IMMAConfig_; + using WarpTileConfig = WarpTileConfig_; + using ThreadConfig = ThreadConfig_; + + struct DataTileCount { + using IMMAConfig = IMMAConfig; + using WarpTileConfig = WarpTileConfig; + using ThreadConfig = ThreadConfig; + using copy_t = int4; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(src_dtype); + + static int constexpr block_tile_batch 
= IMMAConfig::wmma_n; + static int constexpr block_tile_out_width = + WarpTileConfig::warp_tile_n * ThreadConfig::nr_warp_x; + static int constexpr block_tile_in_channel = + WarpTileConfig::warp_tile_k * IMMAConfig::wmma_k; + + static int constexpr smem_load_x = + block_tile_batch * block_tile_out_width / load_width; + static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + // smem col major + static int constexpr smem_h = WarpTileConfig::warp_tile_k; + static int constexpr smem_w = + IMMAConfig::tile_b_sizes_int * block_tile_out_width; + static int constexpr smem_stride = smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + static int constexpr reg_d = + IMMAConfig::wmma_k / WarpTileConfig::pack_size; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + MEGDNN_STATIC_ASSERT( + std::is_same::filter_dtype + MEGDNN_COMMA filter_dtype>::value == true, + "data type of filter tensor should be int8_t"); + using FilterTileCount = + typename IConvIMMATraitReorderFilter::FilterTileCount; + using BlockTileIterator = + BlockTileIteratorUnrollWidth; + using DataGlobal2ShareMemVisitor = + Global2ShareMemVisitorIMMA_CIxWOxN; + using FilterGlobal2ShareMemVisitor = + Global2ShareMemVisitorIMMA_CIxN; + static bool constexpr pipelined = WarpTileConfig::warp_tile_k > 1; + using BlockConsumer = IConvIMMABlockConsumer; + + struct GlobalMemoryStoreCount { + using IMMAConfig = IMMAConfig; + using WarpTileConfig = WarpTileConfig; + using ThreadConfig = ThreadConfig; + using copy_t = int4; + static int constexpr smem_h = ThreadConfig::nr_warp_y; + static int constexpr consecutive_width_tile = + !(WarpTileConfig::warp_tile_n & 0x1); + static int constexpr smem_w = + consecutive_width_tile + ? 2 * ThreadConfig::nr_warp_x * IMMAConfig::wmma_m * + IMMAConfig::wmma_n + : ThreadConfig::nr_warp_x * IMMAConfig::wmma_m * + IMMAConfig::wmma_n; + + static int constexpr store_width = sizeof(copy_t) / sizeof(int32_t); + static int constexpr smem_stride = smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr store_x = + consecutive_width_tile ? 
2 * IMMAConfig::wmma_n / store_width + : IMMAConfig::wmma_n / store_width; + static int constexpr store_y = ThreadConfig::warp_size / store_x; + }; + using GlobalMemoryWriter = + IConvIMMAGlobalMemoryWriterUnrollWidth; +}; + +template +struct IConvIMMATraitUnrollWidthV2 { + COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, + Layout, + Layout, + Layout, + ConvParam); + using Conv1dConfig = Conv1dConfig_; + using IMMAConfig = IMMAConfig_; + using WarpTileConfig = WarpTileConfig_; + using ThreadConfig = ThreadConfig_; + + struct DataTileCount { + using IMMAConfig = IMMAConfig; + using WarpTileConfig = WarpTileConfig; + using ThreadConfig = ThreadConfig; + using Conv1dConfig = Conv1dConfig; + + MEGDNN_STATIC_ASSERT(WarpTileConfig::warp_tile_k == 1, + "kernel unrolling along width axis assumes tile k " + "in warp-level must be 1"); + using copy_t = int4; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(src_dtype); + + static int constexpr block_tile_out_width = + WarpTileConfig::warp_tile_n * ThreadConfig::nr_warp_x; + static int constexpr block_tile_in_width = + (WarpTileConfig::warp_tile_n * ThreadConfig::nr_warp_x - 1) * + Conv1dConfig::sw + + Conv1dConfig::fw; + static int constexpr block_tile_batch = IMMAConfig::wmma_n; + static int constexpr block_tile_in_channel = + WarpTileConfig::warp_tile_k * IMMAConfig::wmma_k; + + static int constexpr smem_load_x = block_tile_batch / load_width; + static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + // smem col major + static int constexpr smem_h = + WarpTileConfig::warp_tile_k * block_tile_in_width; + static int constexpr smem_w = IMMAConfig::tile_b_sizes_int; + static int constexpr smem_stride = smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + static int constexpr reg_d = + IMMAConfig::wmma_k / WarpTileConfig::pack_size; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + struct FilterTileCount { + using IMMAConfig = IMMAConfig; + using WarpTileConfig = WarpTileConfig; + using ThreadConfig = ThreadConfig; + using Conv1dConfig = Conv1dConfig; + + MEGDNN_STATIC_ASSERT(WarpTileConfig::warp_tile_k == 1, + "kernel unrolling along width axis assumes tile k " + "in warp-level must be 1"); + using copy_t = int4; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(filter_dtype); + static int constexpr block_tile_out_channel = + WarpTileConfig::warp_tile_m * IMMAConfig::wmma_m * + ThreadConfig::nr_warp_y; + static int constexpr block_tile_in_channel = + WarpTileConfig::warp_tile_k * IMMAConfig::wmma_k; + + static int constexpr smem_load_x = block_tile_out_channel; + static int constexpr load_x = smem_load_x > 32 ? 
32 : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + // smem col major + static int constexpr smem_h = Conv1dConfig::fw; + static int constexpr smem_w = IMMAConfig::tile_a_sizes_int * + WarpTileConfig::warp_tile_m * + ThreadConfig::nr_warp_y; + static int constexpr smem_stride = smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + using BlockTileIterator = + BlockTileIteratorUnrollWidthV2; + using DataGlobal2ShareMemVisitor = + Global2ShareMemVisitorIMMA_CIxWIxN; + using FilterGlobal2ShareMemVisitor = + Global2ShareMemVisitorIMMA_FWxCO; + using BlockConsumer = + IConvIMMABlockConsumerUnrollWidth; + using GlobalMemoryStoreCount = typename IConvIMMATraitUnrollWidth< + check_bounds, IMMAConfig, WarpTileConfig, + ThreadConfig>::GlobalMemoryStoreCount; + using GlobalMemoryWriter = + IConvIMMAGlobalMemoryWriterUnrollWidth; +}; +#undef COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM +#undef MEGDNN_COMMA + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/conv_trait/iconv_trait.cuh b/dnn/src/cuda/convolution_helper/conv_trait/iconv_trait.cuh new file mode 100644 index 00000000..9493d584 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/conv_trait/iconv_trait.cuh @@ -0,0 +1,219 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
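// Worked example (hypothetical configuration, standalone) of how the IMMA traits above
// turn IMMAConfig, WarpTileConfig and ThreadConfig into block-level tile sizes.
constexpr int wmma_m_ex = 16, wmma_n_ex = 16, wmma_k_ex = 16;   // one int8 mma tile
constexpr int warp_tile_m_ex = 2, warp_tile_n_ex = 2, warp_tile_k_ex = 1;
constexpr int nr_warp_x_ex = 2, nr_warp_y_ex = 2;
constexpr int block_tile_out_channel_ex = warp_tile_m_ex * wmma_m_ex * nr_warp_y_ex;  // 64
constexpr int block_tile_batch_ex = warp_tile_n_ex * wmma_n_ex * nr_warp_x_ex;        // 64
constexpr int block_tile_in_channel_ex = warp_tile_k_ex * wmma_k_ex;                  // 16 per k step
static_assert(block_tile_out_channel_ex == 64 && block_tile_batch_ex == 64 &&
                      block_tile_in_channel_ex == 16,
              "this example block covers a 64 (oc) x 64 (batch) output tile and "
              "consumes 16 input channels per main-loop iteration");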
+ * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/conv_trait/iconv_trait.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/convolution_helper/block_tile_consumer/block_consumer.cuh" +#include "src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor.cuh" +#include "src/cuda/convolution_helper/global_memory_writer/global_memory_writer.cuh" +#include "src/cuda/convolution_helper/layout.cuh" +#include "src/cuda/convolution_helper/parameter.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +#define COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM( \ + _src_dtype, _filter_dtype, _smem_storage_dtype, _input_layout, \ + _kern_layout, _output_layout, _conv_param) \ + using src_dtype = _src_dtype; \ + using filter_dtype = _filter_dtype; \ + using smem_storage_dtype = _smem_storage_dtype; \ + using InputLayout = _input_layout; \ + using KernLayout = _kern_layout; \ + using OutputLayout = _output_layout; \ + using Param = _conv_param; \ + static constexpr bool check_bounds = check_bounds_; +#define MEGDNN_COMMA , + +template +struct IConvTrait { + COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, + Layout, + Layout, + Layout, + ConvParam); + using RegBlockConfig = RegBlockConfig_; + using ThreadConfig = ThreadConfig_; + struct DataTileCount { + using RegBlockConfig = RegBlockConfig; + using ThreadConfig = ThreadConfig; + using copy_t = ldg_dtype; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(src_dtype); + static int constexpr skew = load_width; + static int constexpr block_tile_batch = + RegBlockConfig::reg_n * ThreadConfig::nr_thread_x; + static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; + + static int constexpr smem_load_x = block_tile_batch / load_width; + static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + static int constexpr smem_h = RegBlockConfig::reg_k_packed; + static int constexpr smem_w = block_tile_batch; + static int constexpr smem_stride = + smem_w % 2 == 0 ? 
smem_w + skew : smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + struct FilterTileCount { + using RegBlockConfig = RegBlockConfig; + using ThreadConfig = ThreadConfig; + using copy_t = ldg_dtype; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(filter_dtype); + static int constexpr skew = load_width; + static int constexpr block_tile_out_channel = + RegBlockConfig::reg_m * ThreadConfig::nr_thread_y; + static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; + + static int constexpr smem_load_x = block_tile_out_channel / load_width; + static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + static int constexpr smem_h = RegBlockConfig::reg_k_packed; + static int constexpr smem_w = block_tile_out_channel; + static int constexpr smem_stride = + smem_w % 2 == 0 ? smem_w + skew : smem_w; + static int constexpr smem_tot = smem_h * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + + using BlockTileIterator = + BlockTileIteratorBasic; + using DataGlobal2ShareMemVisitor = + Global2ShareMemVisitor_CIxN; + using FilterGlobal2ShareMemVisitor = + Global2ShareMemVisitor_CIxN; + static bool constexpr pipelined = RegBlockConfig::reg_k_packed > 1; + using BlockConsumer = + IConvBlockConsumer; + using GlobalMemoryWriter = + IConvGlobalMemoryWriter; +}; + +template +struct IConvTraitUnrollWidth { + COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, + Layout, + Layout, + Layout, + ConvParam); + using RegBlockConfig = RegBlockConfig_; + using ThreadConfig = ThreadConfig_; + struct DataTileCount { + using RegBlockConfig = RegBlockConfig; + using ThreadConfig = ThreadConfig; + using copy_t = ldg_dtype; + using smem_storage_dtype = smem_storage_dtype; + static int constexpr load_width = + sizeof(copy_t) / sizeof(smem_storage_dtype); + static int constexpr ldg_load_width = + sizeof(copy_t) / sizeof(src_dtype); + static int constexpr skew = load_width; + static int constexpr block_tile_batch = + RegBlockConfig::reg_n * ThreadConfig::nr_thread_x; + static int constexpr block_tile_out_width = RegBlockConfig::reg_width; + static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; + + static int constexpr smem_load_x = block_tile_batch / load_width; + static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; + static int constexpr load_y = ThreadConfig::nr_threads / load_x; + + static int constexpr smem_h = RegBlockConfig::reg_k_packed; + static int constexpr smem_w = block_tile_batch; + static int constexpr img_cache = RegBlockConfig::reg_width; + static int constexpr smem_stride = + smem_w % 2 == 0 ? 
smem_w + skew : smem_w; + static int constexpr smem_tot = smem_h * img_cache * smem_stride; + + static int constexpr reg_h = (smem_h + load_y - 1) / load_y; + static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; + + static bool constexpr check_bounds_h = smem_h % load_y != 0; + static bool constexpr check_bounds_w = smem_load_x % load_x != 0; + }; + MEGDNN_STATIC_ASSERT( + std::is_same::filter_dtype + MEGDNN_COMMA filter_dtype>::value == true, + "data type of filter tensor should be int8_t"); + using FilterTileCount = + typename IConvTrait::FilterTileCount; + using BlockTileIterator = + BlockTileIteratorUnrollWidth; + using DataGlobal2ShareMemVisitor = + Global2ShareMemVisitor_CIxWOxN; + using FilterGlobal2ShareMemVisitor = + Global2ShareMemVisitor_CIxN; + static bool constexpr pipelined = RegBlockConfig::reg_k_packed > 1; + using BlockConsumer = + IConvBlockConsumerUnrollWidth; + using GlobalMemoryWriter = + IConvGlobalMemoryWriterUnrollWidth; +}; + +#undef COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM +#undef MEGDNN_COMMA + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/epilogue.cuh b/dnn/src/cuda/convolution_helper/epilogue.cuh new file mode 100644 index 00000000..2fc65687 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/epilogue.cuh @@ -0,0 +1,218 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/epilogue.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
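// Sketch (illustrative values, standalone) of the skew padding used by the TileCount
// structs above: when the logical row width is even, one extra vector load of padding
// is appended to each shared-memory row, presumably to stagger rows across banks and
// reduce shared-memory bank conflicts.
constexpr int load_width_ex = 4;                 // int8x4 moved as one int32
constexpr int skew_ex = load_width_ex;
constexpr int smem_w_ex = 64;                    // logical row width
constexpr int smem_stride_ex = smem_w_ex % 2 == 0 ? smem_w_ex + skew_ex : smem_w_ex;
static_assert(smem_stride_ex == 68, "even row widths are padded by one vector load");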
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/convolution_helper/activation.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +template +struct IConvEpilogue { + int8_t* __restrict__ dst; + const int8_t* __restrict__ z; + int batch_stride; + int channel_stride; + int height_stride; + int width_stride; + float gamma; + ActivationOp act; + MEGDNN_HOST MEGDNN_DEVICE IConvEpilogue(int8_t* __restrict__ dst, + const int8_t* __restrict__ z, + int batch_stride, + int channel_stride, + int height_stride, int width_stride, + float gamma, ActivationOp act) + : dst{dst}, + z{z}, + batch_stride{batch_stride}, + channel_stride{channel_stride}, + height_stride{height_stride}, + width_stride{width_stride}, + gamma{gamma}, + act{act} {} +#if MEGDNN_CC_CUDA + __device__ __forceinline__ void move(const int b_idx, const int ch_idx, + const int h_idx, const int w_idx) { + size_t offset = b_idx * batch_stride + ch_idx * channel_stride + + h_idx * height_stride + w_idx * width_stride; + dst += offset; + if (z != nullptr) + z += offset; + } + __device__ __forceinline__ void apply(float alpha, float4 f_conv, + float beta, float4 f_bias, + const int b_idx, const int ch_idx, + const int h_idx, const int w_idx) { + size_t idx = b_idx * batch_stride + ch_idx * channel_stride + + h_idx * height_stride + w_idx * width_stride; + float4 f_res = alpha * f_conv + beta * f_bias; + if (z != nullptr) { + int i_z = __ldg(reinterpret_cast(&z[idx])); + float4 f_z = transform_int8x4_to_float4(i_z); + f_res = f_res + gamma * f_z; + } + *(reinterpret_cast(&dst[idx])) = + act.apply_and_transform(f_res); + } + __device__ __forceinline__ void apply(float alpha, float4 f_conv, + float beta, float4 f_bias, + const int b_idx, const int ch_idx, + const int hw_idx) { + size_t idx = b_idx * batch_stride + ch_idx * channel_stride + + hw_idx * width_stride; + float4 f_res = alpha * f_conv + beta * f_bias; + if (z != nullptr) { + int i_z = __ldg(reinterpret_cast(&z[idx])); + float4 f_z = transform_int8x4_to_float4(i_z); + f_res = f_res + gamma * f_z; + } + *(reinterpret_cast(&dst[idx])) = + act.apply_and_transform(f_res); + } + __device__ __forceinline__ void apply(float alpha, float4 f_conv_x, + float4 f_conv_y, float beta, + float4 f_bias_x, float4 f_bias_y, + const int b_idx, const int ch_idx, + const int h_idx, const int w_idx) { + size_t idx = b_idx * batch_stride + ch_idx * channel_stride + + h_idx * height_stride + w_idx * width_stride; + float4 f_res_x = alpha * f_conv_x + beta * f_bias_x; + float4 f_res_y = alpha * f_conv_y + beta * f_bias_y; + if (z != nullptr) { + int2 i_z2 = __ldg(reinterpret_cast(&z[idx])); + float4 f_z_x = transform_int8x4_to_float4(i_z2.x); + float4 f_z_y = transform_int8x4_to_float4(i_z2.y); + f_res_x = f_res_x + gamma * f_z_x; + f_res_y = f_res_y + gamma * f_z_y; + } + int ix = act.apply_and_transform(f_res_x); + int iy = act.apply_and_transform(f_res_y); + *(reinterpret_cast(&dst[idx])) = ::make_int2(ix, iy); + } + __device__ __forceinline__ void apply(float alpha, float4 f_conv_x, + float4 f_conv_y, float beta, + float4 f_bias_x, float4 f_bias_y, + const int b_idx, const int ch_idx, + const int hw_idx) { + size_t idx = b_idx * batch_stride + ch_idx * channel_stride + + hw_idx * width_stride; + float4 f_res_x = alpha * f_conv_x + beta * f_bias_x; 
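+        // Same fused epilogue as the single-float4 overloads above, applied to two
+        // lanes at once: result = activation(alpha * conv + beta * bias [+ gamma * z]),
+        // then re-quantized to packed int8x4 by apply_and_transform().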
+ float4 f_res_y = alpha * f_conv_y + beta * f_bias_y; + if (z != nullptr) { + int2 i_z2 = __ldg(reinterpret_cast(&z[idx])); + float4 f_z_x = transform_int8x4_to_float4(i_z2.x); + float4 f_z_y = transform_int8x4_to_float4(i_z2.y); + f_res_x = f_res_x + gamma * f_z_x; + f_res_y = f_res_y + gamma * f_z_y; + } + int ix = act.apply_and_transform(f_res_x); + int iy = act.apply_and_transform(f_res_y); + *(reinterpret_cast(&dst[idx])) = ::make_int2(ix, iy); + } + + __device__ __forceinline__ void apply(float alpha, float4 f_conv_x, + float4 f_conv_y, float4 f_conv_z, + float4 f_conv_w, float beta, + float4 f_bias_x, float4 f_bias_y, + float4 f_bias_z, float4 f_bias_w, + const int b_idx, const int ch_idx, + const int h_idx, const int w_idx) { + size_t idx = b_idx * batch_stride + ch_idx * channel_stride + + h_idx * height_stride + w_idx * width_stride; + float4 f_res_x = alpha * f_conv_x + beta * f_bias_x; + float4 f_res_y = alpha * f_conv_y + beta * f_bias_y; + float4 f_res_z = alpha * f_conv_z + beta * f_bias_z; + float4 f_res_w = alpha * f_conv_w + beta * f_bias_w; + if (z != nullptr) { + int4 i_z4 = __ldg(reinterpret_cast(&z[idx])); + + float4 f_z_x = transform_int8x4_to_float4(i_z4.x); + float4 f_z_y = transform_int8x4_to_float4(i_z4.y); + float4 f_z_z = transform_int8x4_to_float4(i_z4.z); + float4 f_z_w = transform_int8x4_to_float4(i_z4.w); + + f_res_x = f_res_x + gamma * f_z_x; + f_res_y = f_res_y + gamma * f_z_y; + f_res_z = f_res_z + gamma * f_z_z; + f_res_w = f_res_w + gamma * f_z_w; + } + int ix = act.apply_and_transform(f_res_x); + int iy = act.apply_and_transform(f_res_y); + int iz = act.apply_and_transform(f_res_z); + int iw = act.apply_and_transform(f_res_w); + *(reinterpret_cast(&dst[idx])) = ::make_int4(ix, iy, iz, iw); + } + __device__ __forceinline__ void apply(float alpha, float4 f_conv_x, + float4 f_conv_y, float4 f_conv_z, + float4 f_conv_w, float beta, + float4 f_bias_x, float4 f_bias_y, + float4 f_bias_z, float4 f_bias_w, + const int b_idx, const int ch_idx, + const int hw_idx) { + size_t idx = b_idx * batch_stride + ch_idx * channel_stride + + hw_idx * width_stride; + float4 f_res_x = alpha * f_conv_x + beta * f_bias_x; + float4 f_res_y = alpha * f_conv_y + beta * f_bias_y; + float4 f_res_z = alpha * f_conv_z + beta * f_bias_z; + float4 f_res_w = alpha * f_conv_w + beta * f_bias_w; + if (z != nullptr) { + int4 i_z4 = __ldg(reinterpret_cast(&z[idx])); + + float4 f_z_x = transform_int8x4_to_float4(i_z4.x); + float4 f_z_y = transform_int8x4_to_float4(i_z4.y); + float4 f_z_z = transform_int8x4_to_float4(i_z4.z); + float4 f_z_w = transform_int8x4_to_float4(i_z4.w); + + f_res_x = f_res_x + gamma * f_z_x; + f_res_y = f_res_y + gamma * f_z_y; + f_res_z = f_res_z + gamma * f_z_z; + f_res_w = f_res_w + gamma * f_z_w; + } + int ix = act.apply_and_transform(f_res_x); + int iy = act.apply_and_transform(f_res_y); + int iz = act.apply_and_transform(f_res_z); + int iw = act.apply_and_transform(f_res_w); + *(reinterpret_cast(&dst[idx])) = ::make_int4(ix, iy, iz, iw); + } +#endif +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor.cuh new file mode 100644 index 00000000..7797507c --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor.cuh @@ -0,0 +1,45 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ +#pragma once +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixn.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixwoxn.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixn.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_fwxco.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwoxn.cuh" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwixn.cuh" +//#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_small_channel.cuh" +//#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_with_img_cache.cuh" + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixhw.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixhw.cuh new file mode 100644 index 00000000..6c23dfd3 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixhw.cuh @@ -0,0 +1,300 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixhw.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
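// Simplified, hypothetical helper equivalent to the bounds handling inside the
// precomputed-offset visitor defined below: offsets < 0 mark padded pixels and
// read back as zero instead of touching global memory.
__device__ __forceinline__ int32_t load_or_zero_example(const int32_t* __restrict__ g_ptr,
                                                        int channel_stride, int row,
                                                        int offset) {
    return offset >= 0 ? g_ptr[row * channel_stride + offset] : 0;
}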
+ */ +#pragma once +#include "megdnn/arch.h" +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh" +#include "src/cuda/convolution_helper/layout.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct Global2ShareMemVisitorBase_CIxHW { + using TileCount = TileCount_; + using copy_t = typename TileCount::copy_t; + using smem_storage_dtype = typename TileCount::smem_storage_dtype; + + using RegBlockConfig = typename TileCount::RegBlockConfig; + using ThreadConfig = typename TileCount::ThreadConfig; + + const copy_t* __restrict__ g_ptr; + int stride; + smem_storage_dtype* smem; + + __device__ Global2ShareMemVisitorBase_CIxHW(smem_storage_dtype* smem_) + : smem{smem_} {} + + __device__ __forceinline__ void init_stride(Layout layout) { + stride = layout.channel_stride / TileCount::ldg_load_width; + } + + __device__ __forceinline__ int32_t* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ copy_t* sh_ptr_as_copy_t(int y, int x) { + return reinterpret_cast(sh_ptr(y, x)); + } + + __device__ __forceinline__ void move_forward() { + g_ptr += RegBlockConfig::reg_k_packed * stride; + } +}; + +template +struct Global2ShareMemVisitor_CIxHW; + +#define DEF(_precomp_offset, _Layout) \ + template \ + struct Global2ShareMemVisitor_CIxHW \ + : public Global2ShareMemVisitorBase_CIxHW { \ + using Base = Global2ShareMemVisitorBase_CIxHW; \ + using TileCount = typename Base::TileCount; \ + using copy_t = typename Base::copy_t; \ + using smem_storage_dtype = typename Base::smem_storage_dtype; \ + using RegBlockConfig = typename TileCount::RegBlockConfig; \ + using ThreadConfig = typename TileCount::ThreadConfig; \ + using Base::g_ptr; \ + using Base::stride; \ + using Base::smem; \ + using Base::sh_ptr_as_copy_t; \ + static constexpr int load_width = TileCount::load_width; \ + static constexpr bool precomp_offset = _precomp_offset; \ + \ + const int tidx = threadIdx.x; \ + const int tidy = threadIdx.y; \ + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; \ + const int gl_load_y = tid / TileCount::load_x; \ + const int gl_load_x = tid - gl_load_y * TileCount::load_x; \ + \ + const int* __restrict__ offset; \ + int remain; + +DEF(true, Layout) + + copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d]; + MEGDNN_STATIC_ASSERT(load_width == 4, + "load four element from src tensor per time"); + + __device__ Global2ShareMemVisitor_CIxHW(smem_storage_dtype* smem_, + const int* __restrict__ offset_) + : Base{smem_}, offset{offset_} {} + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int out_offset = w_idx * load_width; + int4 in_offset = + *reinterpret_cast(&offset[out_offset]); + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + copy_t ix = make_zero(); + copy_t iy = ix; + copy_t iz = ix; + copy_t iw = ix; + if (in_offset.x >= 0) { + ix = g_ptr[h_idx * stride + in_offset.x]; + } + if (in_offset.y >= 0) { + iy = g_ptr[h_idx * stride + in_offset.y]; + } + if (in_offset.z >= 0) { + iz = g_ptr[h_idx * stride + in_offset.z]; + } + if (in_offset.w >= 0) { + iw = g_ptr[h_idx * stride + in_offset.w]; + } + *(sh_ptr_as_copy_t(h_idx, out_offset + 0)) = ix; + 
*(sh_ptr_as_copy_t(h_idx, out_offset + 1)) = iy; + *(sh_ptr_as_copy_t(h_idx, out_offset + 2)) = iz; + *(sh_ptr_as_copy_t(h_idx, out_offset + 3)) = iw; + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int out_offset = w_idx * load_width; + int4 in_offset = + *reinterpret_cast(&offset[out_offset]); + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + copy_t ix = make_zero(); + copy_t iy = ix; + copy_t iz = ix; + copy_t iw = ix; + if (in_offset.x >= 0) { + ix = g_ptr[h_idx * stride + in_offset.x]; + } + if (in_offset.y >= 0) { + iy = g_ptr[h_idx * stride + in_offset.y]; + } + if (in_offset.z >= 0) { + iz = g_ptr[h_idx * stride + in_offset.z]; + } + if (in_offset.w >= 0) { + iw = g_ptr[h_idx * stride + in_offset.w]; + } + reg[i][j][0] = ix; + reg[i][j][1] = iy; + reg[i][j][2] = iz; + reg[i][j][3] = iw; + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int out_offset = w_idx * load_width; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + *(sh_ptr_as_copy_t(h_idx, out_offset + 0)) = reg[i][j][0]; + *(sh_ptr_as_copy_t(h_idx, out_offset + 1)) = reg[i][j][1]; + *(sh_ptr_as_copy_t(h_idx, out_offset + 2)) = reg[i][j][2]; + *(sh_ptr_as_copy_t(h_idx, out_offset + 3)) = reg[i][j][3]; + } + } + } +}; + +DEF(false, Layout) + + copy_t reg[TileCount::reg_h][TileCount::reg_w]; + __device__ Global2ShareMemVisitor_CIxHW(smem_storage_dtype* smem_) + : Base{smem_} {} + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int spatial = w_idx * load_width; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (spatial < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr_as_copy_t(h_idx, spatial)) = val; + } else { + *(sh_ptr_as_copy_t(h_idx, spatial)) = + g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int spatial = w_idx * load_width; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (spatial < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < 
TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = reg[i][j]; + } + } + } +}; + +#undef DEF + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixn.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixn.cuh new file mode 100644 index 00000000..9744daf1 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixn.cuh @@ -0,0 +1,151 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixn.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
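// Hedged sketch of how a kernel is expected to drive these Global2ShareMemVisitor
// types together with a block consumer (the real drivers live in the conv kernels;
// `Consumer`, `consume_block` and `kern_loop_example` are assumed names):
template <typename DataVisitor, typename FilterVisitor, typename Consumer>
__device__ __forceinline__ void kern_loop_example(DataVisitor& gl2sh_src,
                                                  FilterVisitor& gl2sh_filter,
                                                  Consumer& consumer, int k_blocks) {
    gl2sh_src.first_copy();           // global -> shared for the first k slice
    gl2sh_filter.first_copy();
    __syncthreads();
    for (int k = 0; k < k_blocks - 1; ++k) {
        gl2sh_src.move_forward();     // advance global pointers to the next k slice
        gl2sh_filter.move_forward();
        gl2sh_src.copy();             // prefetch the next slice into registers
        gl2sh_filter.copy();
        consumer.consume_block(gl2sh_src, gl2sh_filter);  // compute on the slice in smem
        __syncthreads();
        gl2sh_src.commit();           // registers -> shared for the next iteration
        gl2sh_filter.commit();
        __syncthreads();
    }
    consumer.consume_block(gl2sh_src, gl2sh_filter);      // last slice
}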
+ */ +#pragma once +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh" +#include "src/cuda/convolution_helper/layout.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +template +struct Global2ShareMemVisitor_CIxN; + +DEF_GLOBAL_MEMORY_VISITOR(Global2ShareMemVisitor_CIxN, Layout) + using RegBlockConfig = typename TileCount::RegBlockConfig; + using ThreadConfig = typename TileCount::ThreadConfig; + int stride; + int remain; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_h][TileCount::reg_w]; + + __device__ __forceinline__ void init_stride(Layout layout) { + stride = layout.channel_stride / TileCount::ldg_load_width; + } + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int batch = w_idx * load_width; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (batch < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr_as_copy_t(h_idx, batch)) = val; + } else { + *(sh_ptr_as_copy_t(h_idx, batch)) = + g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int batch = w_idx * load_width; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (batch < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ int32_t* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ copy_t* sh_ptr_as_copy_t(int y, int x) { + return reinterpret_cast(sh_ptr(y, x)); + } + + __device__ __forceinline__ void move_forward() { + g_ptr += RegBlockConfig::reg_k_packed * stride; + } +}; + +} // namespace cuda +} // namespace megdnn +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixwoxn.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixwoxn.cuh new file mode 100644 index 00000000..8fdff7c8 --- /dev/null +++ 
b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixwoxn.cuh @@ -0,0 +1,187 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixwoxn.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
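The CIxN visitor above guards every vectorized load twice: the compile-time check_bounds flag selects whether predication is emitted at all, and the runtime `batch < remain` test falls back to make_zero() so out-of-range batches contribute zeros instead of reading past the tensor. A standalone sketch of the same zero-padding pattern follows; the kernel name and parameters are illustrative only.

#include <cuda_runtime.h>

// Minimal sketch of the zero-padded vectorized load used by the visitors:
// each thread reads one int4 (four int32 values) if its column is in range,
// otherwise it stores a zero vector, so downstream consumers never need
// their own bounds checks.
__global__ void load_tile_zero_padded(const int4* __restrict__ src, int4* dst,
                                      int stride, int ncols_valid) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int4 val = make_int4(0, 0, 0, 0);   // same role as make_zero<int4>()
    if (col < ncols_valid) {            // runtime "remain" check
        val = src[row * stride + col];  // one 128-bit global load
    }
    dst[row * gridDim.x * blockDim.x + col] = val;
}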
+ */ +#pragma once +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh" +#include "src/cuda/convolution_helper/layout.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +template +struct Global2ShareMemVisitor_CIxWOxN; + +DEF_GLOBAL_MEMORY_VISITOR(Global2ShareMemVisitor_CIxWOxN, + Layout) + using RegBlockConfig = typename TileCount::RegBlockConfig; + using ThreadConfig = typename TileCount::ThreadConfig; + int sw; + int stride; + int remain; + int img_stride; + int img_start; + int img_end; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_h][TileCount::img_cache][TileCount::reg_w]; + + __device__ __forceinline__ void init_stride(Layout layout) { + stride = layout.channel_stride / TileCount::ldg_load_width; + img_stride = layout.width_stride / TileCount::ldg_load_width; + } + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::img_cache; ++j) { + int jstride = j * sw; +#pragma unroll + for (int k = 0; k < TileCount::reg_w; ++k) { + int w_idx = gl_load_x + k * TileCount::load_x; + int batch = w_idx * load_width; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (jstride >= img_start && jstride < img_end && + batch < remain) { + val = g_ptr[h_idx * stride + jstride * img_stride + + w_idx]; + } + *(sh_ptr_as_copy_t(h_idx, j, batch)) = val; + } else { + copy_t val = make_zero(); + if (jstride >= img_start && jstride < img_end) { + val = g_ptr[h_idx * stride + jstride * img_stride + + w_idx]; + } + *(sh_ptr_as_copy_t(h_idx, j, batch)) = val; + } + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::img_cache; ++j) { + int jstride = j * sw; +#pragma unroll + for (int k = 0; k < TileCount::reg_w; ++k) { + int w_idx = gl_load_x + k * TileCount::load_x; + int batch = w_idx * load_width; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (jstride >= img_start && jstride < img_end && + batch < remain) { + val = g_ptr[h_idx * stride + jstride * img_stride + + w_idx]; + } + reg[i][j][k] = val; + } else { + copy_t val = make_zero(); + if (jstride >= img_start && jstride < img_end) { + val = g_ptr[h_idx * stride + jstride * img_stride + + w_idx]; + } + reg[i][j][k] = val; + } + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::img_cache; ++j) { +#pragma unroll + for (int k = 0; k < TileCount::reg_w; ++k) { + int w_idx = gl_load_x + k * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + 
*(sh_ptr_as_copy_t(h_idx, j, w_idx * load_width)) = + reg[i][j][k]; + } + } + } + } + + __device__ __forceinline__ int32_t* sh_ptr(int z, int y, int x) { + return &smem[(z * TileCount::img_cache + y) * TileCount::smem_stride + + x]; + } + + __device__ __forceinline__ copy_t* sh_ptr_as_copy_t(int z, int y, int x) { + return reinterpret_cast(sh_ptr(z, y, x)); + } + + __device__ __forceinline__ void move_forward() { + g_ptr += RegBlockConfig::reg_k_packed * stride; + } + + __device__ __forceinline__ void set_range(const int start, const int end) { + img_start = start, img_end = end; + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh new file mode 100644 index 00000000..2443b317 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh @@ -0,0 +1,74 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
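The CIxWOxN visitor above addresses its shared-memory tile with a three-level index, sh_ptr(z, y, x) = smem[(z * img_cache + y) * smem_stride + x]: channel row first, then the cached width position, then the batch offset. The host-side sketch below just walks that flattening with made-up tile constants; img_cache and smem_stride are illustrative values, not the real TileCount parameters.

#include <cassert>

// Host-side sketch of the 3-D -> 1-D flattening behind sh_ptr(z, y, x) above.
constexpr int img_cache   = 4;   // cached width positions per channel row (illustrative)
constexpr int smem_stride = 32;  // int32 elements per (channel, position) row (illustrative)

constexpr int sh_index(int z, int y, int x) {
    return (z * img_cache + y) * smem_stride + x;
}

int main() {
    // Batch index x is innermost, so consecutive threads of a warp touch
    // consecutive shared-memory words for one (channel, width) pair.
    assert(sh_index(0, 0, 31) == 31);                       // within one row
    assert(sh_index(0, 1, 0)  == smem_stride);              // next cached width position
    assert(sh_index(1, 0, 0)  == img_cache * smem_stride);  // next channel row
    return 0;
}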
+ */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +template +__device__ __forceinline__ static T make_zero(); + +template <> +__device__ __forceinline__ int32_t make_zero() { + return 0; +} + +template <> +__device__ __forceinline__ int2 make_zero() { + return ::make_int2(0, 0); +} + +template <> +__device__ __forceinline__ int4 make_zero() { + return ::make_int4(0, 0, 0, 0); +} + +#define DEF_GLOBAL_MEMORY_VISITOR(_cls, _Layout) \ + template \ + struct _cls { \ + using TileCount = TileCount_; \ + using copy_t = typename TileCount::copy_t; \ + using smem_storage_dtype = typename TileCount::smem_storage_dtype; \ + const copy_t* __restrict__ g_ptr; \ + smem_storage_dtype* smem; \ + static constexpr int load_width = TileCount::load_width; \ + __device__ _cls(smem_storage_dtype* smem_) : smem{smem_} {} + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_coxci.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_coxci.cuh new file mode 100644 index 00000000..c592f37a --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_coxci.cuh @@ -0,0 +1,149 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_coxci.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
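The DEF_GLOBAL_MEMORY_VISITOR macro above opens a visitor definition and injects the members every visitor shares (g_ptr, smem, copy_t, load_width, the constructor); the struct body written after each macro use supplies the layout-specific loads and the closing brace. Assuming the macro's template header takes a compile-time check_bounds flag and a TileCount_ policy type (both are used by every visitor body in these files), a use of it expands roughly as sketched below; this expansion is an illustration, not verbatim code from the tree.

// Illustrative expansion of
//   DEF_GLOBAL_MEMORY_VISITOR(Global2ShareMemVisitor_Example, LayoutX)
// In the tree each use is a partial specialization on a concrete Layout;
// the plain template below keeps the sketch self-contained.
template <bool check_bounds, typename TileCount_>
struct Global2ShareMemVisitor_Example {
    using TileCount = TileCount_;
    using copy_t = typename TileCount::copy_t;
    using smem_storage_dtype = typename TileCount::smem_storage_dtype;
    const copy_t* __restrict__ g_ptr;   // current global-memory tile pointer
    smem_storage_dtype* smem;           // destination shared-memory tile
    static constexpr int load_width = TileCount::load_width;
    __device__ Global2ShareMemVisitor_Example(smem_storage_dtype* smem_)
            : smem{smem_} {}
    // ...the body following the macro use adds init_stride(), first_copy(),
    // copy(), commit(), move_forward() and the closing brace.
};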
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh" +#include "src/cuda/convolution_helper/layout.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +template +struct Global2ShareMemVisitor_COxCI; + +DEF_GLOBAL_MEMORY_VISITOR(Global2ShareMemVisitor_COxCI, Layout) + using RegBlockConfig = typename TileCount::RegBlockConfig; + using ThreadConfig = typename TileCount::ThreadConfig; + int stride; + int remain; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_h][TileCount::reg_w]; + + __device__ __forceinline__ void init_stride(Layout layout) { + stride = layout.batch_stride / TileCount::ldg_load_width; + } + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (h_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = val; + } else { + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = + g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (h_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ int32_t* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ copy_t* sh_ptr_as_copy_t(int y, int x) { + return reinterpret_cast(sh_ptr(y, x)); + } + + __device__ __forceinline__ void move_forward() { + g_ptr += RegBlockConfig::reg_k_packed / load_width; + } +}; + +} // namespace cuda +} // namespace megdnn +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git 
a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixn.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixn.cuh new file mode 100644 index 00000000..95b42699 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixn.cuh @@ -0,0 +1,267 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixn.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
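Every visitor in these headers, including the COxCI filter visitor above, branches on the template parameter check_bounds rather than on a runtime flag, so the unchecked instantiation carries no predication at all and the checked one zero-fills out-of-range rows. A minimal sketch of that idiom with a hypothetical loader:

// Sketch of the compile-time bounds-check idiom used throughout these visitors.
// With check_bounds == false the branch is taken on a compile-time constant
// (an ordinary `if`, as in the diff) and the compiler emits a plain,
// unpredicated load.
template <bool check_bounds>
__device__ __forceinline__ int4 load_or_zero(const int4* __restrict__ ptr,
                                             int idx, int remain) {
    if (check_bounds) {
        int4 val = make_int4(0, 0, 0, 0);
        if (idx < remain) val = ptr[idx];
        return val;
    } else {
        return ptr[idx];  // boundary handled by the launch configuration
    }
}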
+ */ +#pragma once +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh" +#include "src/cuda/convolution_helper/layout.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +#define MEGDNN_COMMA , +template +struct Global2ShareMemVisitorIMMA_CIxN; + +DEF_GLOBAL_MEMORY_VISITOR(Global2ShareMemVisitorIMMA_CIxN, + Layout) + using IMMAConfig = typename TileCount::IMMAConfig; + using WarpTileConfig = typename TileCount::WarpTileConfig; + using ThreadConfig = typename TileCount::ThreadConfig; + int stride; + int remain; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d]; + + __device__ __forceinline__ void init_stride(Layout layout) { + stride = layout.channel_stride / TileCount::ldg_load_width; + } + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int batch = w_idx * load_width; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; +#pragma unroll + for (int k = 0; k < TileCount::reg_d; ++k) { + int channel = ((h_idx * TileCount::reg_d + k)); + if (check_bounds) { + copy_t val = make_zero(); + if (batch < remain) { + val = g_ptr[channel * stride + w_idx]; + } + *(sh_ptr(h_idx, batch * TileCount::reg_d + k)) = val; + } else { + *(sh_ptr(h_idx, batch * TileCount::reg_d + k)) = + g_ptr[channel * stride + w_idx]; + } + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int batch = w_idx * load_width; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; +#pragma unroll + for (int k = 0; k < TileCount::reg_d; ++k) { + int channel = (h_idx * TileCount::reg_d + k); + if (check_bounds) { + copy_t val = make_zero(); + if (batch < remain) { + val = g_ptr[channel * stride + w_idx]; + } + reg[i][j][k] = val; + } else { + reg[i][j][k] = g_ptr[channel * stride + w_idx]; + } + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; +#pragma unroll + for (int k = 0; k < TileCount::reg_d; ++k) { + *(sh_ptr(h_idx, w_idx * load_width * TileCount::reg_d + + k)) = reg[i][j][k]; + } + } + } + } + + __device__ __forceinline__ int32_t* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += WarpTileConfig::warp_tile_k * IMMAConfig::wmma_k / 4 * stride; + } +}; + 
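The first IMMA visitor above stages the reg_d channel fragments of one batch group contiguously: the element at (channel = h_idx * reg_d + k, batch) lands in shared-memory column batch * reg_d + k of row h_idx, so the wmma fragment for one batch block is a dense run of shared memory. The host-side sketch below only prints that index mapping; reg_d = 4 is an illustrative value.

#include <cstdio>

// Host-side sketch of the channel interleaving used by the IMMA CIxN visitor.
constexpr int reg_d = 4;  // channel fragments per (row, batch) pair (illustrative)

int smem_column(int batch, int k) { return batch * reg_d + k; }

int main() {
    for (int batch = 0; batch < 2; ++batch)
        for (int k = 0; k < reg_d; ++k)
            std::printf("batch %d, channel fragment %d -> smem column %d\n",
                        batch, k, smem_column(batch, k));
    // The reg_d fragments of one batch sit back to back, which is the layout
    // the subsequent wmma loads expect.
    return 0;
}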
+DEF_GLOBAL_MEMORY_VISITOR(Global2ShareMemVisitorIMMA_CIxN, + Layout) + using IMMAConfig = typename TileCount::IMMAConfig; + using WarpTileConfig = typename TileCount::WarpTileConfig; + using ThreadConfig = typename TileCount::ThreadConfig; + int stride; + int remain; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_h][TileCount::reg_w]; + MEGDNN_STATIC_ASSERT(std::is_same::value == true, + "ldg data type must be int4 for this memory visitor"); + + + __device__ __forceinline__ void init_stride(Layout layout) { + stride = layout.channel_stride / TileCount::ldg_load_width; + } + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = val; + } else { + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = + g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ int32_t* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ copy_t* sh_ptr_as_copy_t(int y, int x) { + return reinterpret_cast(sh_ptr(y, x)); + } + + __device__ __forceinline__ void move_forward() { + g_ptr += WarpTileConfig::warp_tile_k * stride; + } +}; +#undef MEGDNN_COMMA + +} // namespace cuda +} // namespace megdnn +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwixn.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwixn.cuh new file mode 100644 index 00000000..2008e6b9 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwixn.cuh @@ -0,0 +1,221 @@ 
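The int4 specialization above pins the copy type at compile time so that every ldg is a full 128-bit transaction; MEGDNN_STATIC_ASSERT is MegEngine's wrapper around the same idea. A standalone sketch with the standard static_assert, where ExampleTileCount is a made-up policy type used only to show the check firing (or not):

#include <cuda_runtime.h>
#include <type_traits>

// Sketch of the compile-time check the int4 specialization relies on:
// reject any TileCount whose copy type is not int4, so each global load is a
// single 128-bit transaction.
template <typename TileCount>
struct RequireInt4Copy {
    static_assert(std::is_same<typename TileCount::copy_t, int4>::value,
                  "ldg data type must be int4 for this memory visitor");
};

struct ExampleTileCount {
    using copy_t = int4;  // changing this to int2 makes the assert fire
};

template struct RequireInt4Copy<ExampleTileCount>;  // compiles only for int4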
+/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwixn.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh" +#include "src/cuda/convolution_helper/layout.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +#define MEGDNN_COMMA , +template +struct Global2ShareMemVisitorIMMA_CIxWIxN; + +DEF_GLOBAL_MEMORY_VISITOR(Global2ShareMemVisitorIMMA_CIxWIxN, + Layout) + using IMMAConfig = typename TileCount::IMMAConfig; + using WarpTileConfig = typename TileCount::WarpTileConfig; + using ThreadConfig = typename TileCount::ThreadConfig; + int stride; + int remain; + int width_stride; + int width_start; + int width_end; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d]; + MEGDNN_STATIC_ASSERT(std::is_same::value == true, + "ldg data type must be int4 for this memory visitor"); + + __device__ __forceinline__ void init_stride(Layout layout) { + stride = layout.channel_stride / TileCount::ldg_load_width; + width_stride = layout.width_stride / TileCount::ldg_load_width; + } + + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + int batch = (w_idx << 2); + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t c0 = make_zero(); + copy_t c1 = make_zero(); + copy_t c2 = make_zero(); + copy_t c3 = make_zero(); + if (h_idx >= width_start && h_idx < width_end && + batch < remain) { + c0 = g_ptr[0 * stride + h_idx * width_stride + w_idx]; + c1 = g_ptr[1 * stride + h_idx * width_stride + w_idx]; + c2 = g_ptr[2 * stride + h_idx * width_stride + w_idx]; + c3 = g_ptr[3 * stride + h_idx * width_stride + w_idx]; + } + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4))) = + make_int4(c0.x, c1.x, c2.x, c3.x); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 4)) = + make_int4(c0.y, c1.y, c2.y, c3.y); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 8)) = + make_int4(c0.z, c1.z, c2.z, c3.z); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 12)) = + make_int4(c0.w, c1.w, c2.w, c3.w); + } else { + copy_t c0 = make_zero(); + copy_t c1 = make_zero(); + copy_t c2 = make_zero(); + copy_t c3 = make_zero(); + if (h_idx >= width_start && h_idx < width_end) { + c0 = g_ptr[0 * stride + h_idx * width_stride + w_idx]; + c1 = g_ptr[1 * stride + h_idx * width_stride + w_idx]; + c2 = g_ptr[2 * stride + h_idx * width_stride + w_idx]; + c3 = g_ptr[3 * stride + h_idx * width_stride + w_idx]; + } + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4))) = + make_int4(c0.x, c1.x, c2.x, c3.x); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 4)) = + make_int4(c0.y, c1.y, c2.y, c3.y); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 8)) = + make_int4(c0.z, c1.z, c2.z, c3.z); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 12)) = + make_int4(c0.w, c1.w, c2.w, c3.w); + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = 
gl_load_x + j * TileCount::load_x; + int batch = (w_idx << 2); + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t c0 = make_zero(); + copy_t c1 = make_zero(); + copy_t c2 = make_zero(); + copy_t c3 = make_zero(); + if (h_idx >= width_start && h_idx < width_end && + batch < remain) { + c0 = g_ptr[0 * stride + h_idx * width_stride + w_idx]; + c1 = g_ptr[1 * stride + h_idx * width_stride + w_idx]; + c2 = g_ptr[2 * stride + h_idx * width_stride + w_idx]; + c3 = g_ptr[3 * stride + h_idx * width_stride + w_idx]; + } + reg[i][j][0] = make_int4(c0.x, c1.x, c2.x, c3.x); + reg[i][j][1] = make_int4(c0.y, c1.y, c2.y, c3.y); + reg[i][j][2] = make_int4(c0.z, c1.z, c2.z, c3.z); + reg[i][j][3] = make_int4(c0.w, c1.w, c2.w, c3.w); + } else { + copy_t c0 = make_zero(); + copy_t c1 = make_zero(); + copy_t c2 = make_zero(); + copy_t c3 = make_zero(); + if (h_idx >= width_start && h_idx < width_end) { + c0 = g_ptr[0 * stride + h_idx * width_stride + w_idx]; + c1 = g_ptr[1 * stride + h_idx * width_stride + w_idx]; + c2 = g_ptr[2 * stride + h_idx * width_stride + w_idx]; + c3 = g_ptr[3 * stride + h_idx * width_stride + w_idx]; + } + reg[i][j][0] = make_int4(c0.x, c1.x, c2.x, c3.x); + reg[i][j][1] = make_int4(c0.y, c1.y, c2.y, c3.y); + reg[i][j][2] = make_int4(c0.z, c1.z, c2.z, c3.z); + reg[i][j][3] = make_int4(c0.w, c1.w, c2.w, c3.w); + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4))) = reg[i][j][0]; + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 4)) = reg[i][j][1]; + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 8)) = reg[i][j][2]; + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 12)) = reg[i][j][3]; + } + } + } + + __device__ __forceinline__ copy_t* sh_ptr_as_copy_t(int y, int x) { + return reinterpret_cast(sh_ptr(y, x)); + } + __device__ __forceinline__ int32_t* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += WarpTileConfig::warp_tile_k * IMMAConfig::wmma_k / 4 * stride; + } + + __device__ __forceinline__ void set_range(const int start, const int end) { + width_start = start, width_end = end; + } +}; +#undef MEGDNN_COMMA + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwoxn.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwoxn.cuh new file mode 100644 index 00000000..b351538f --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwoxn.cuh @@ -0,0 +1,245 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwoxn.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
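The CIxWIxN visitor above reads four int4 values (c0..c3) from four consecutive channel rows and re-packs them component-wise before the shared-memory store, which amounts to a 4x4 int32 transpose done entirely in registers. A standalone sketch of that register transpose:

// Sketch of the register-level 4x4 transpose used by the CIxWIxN visitor:
// each output int4 collects one component (x, y, z or w) from every input row,
// so data loaded channel-major is stored batch-major.
__device__ __forceinline__ void transpose4x4(const int4 in[4], int4 out[4]) {
    out[0] = make_int4(in[0].x, in[1].x, in[2].x, in[3].x);
    out[1] = make_int4(in[0].y, in[1].y, in[2].y, in[3].y);
    out[2] = make_int4(in[0].z, in[1].z, in[2].z, in[3].z);
    out[3] = make_int4(in[0].w, in[1].w, in[2].w, in[3].w);
}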
+ */ +#pragma once +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh" +#include "src/cuda/convolution_helper/layout.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +#define MEGDNN_COMMA , +template +struct Global2ShareMemVisitorIMMA_CIxWOxN; + +DEF_GLOBAL_MEMORY_VISITOR(Global2ShareMemVisitorIMMA_CIxWOxN, + Layout) + using IMMAConfig = typename TileCount::IMMAConfig; + using WarpTileConfig = typename TileCount::WarpTileConfig; + using ThreadConfig = typename TileCount::ThreadConfig; + int stride; + int remain; + int sw; + int width_stride; + int width_start; + int width_end; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d]; + MEGDNN_STATIC_ASSERT(std::is_same::value == true, + "ldg data type must be int4 for this memory visitor"); + + __device__ __forceinline__ void init_stride(Layout layout) { + stride = layout.channel_stride / TileCount::ldg_load_width; + width_stride = layout.width_stride / TileCount::ldg_load_width; + } + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + int width = (w_idx >> (IMMAConfig::wmma_n_bit - 2)) * sw; + int batch = (w_idx & ((IMMAConfig::wmma_n >> 2) - 1)); + if (check_bounds) { + copy_t c0 = make_zero(); + copy_t c1 = make_zero(); + copy_t c2 = make_zero(); + copy_t c3 = make_zero(); + if (width >= width_start && width < width_end && + (batch << 2) < remain) { + c0 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 0) * stride]; + c1 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 1) * stride]; + c2 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 2) * stride]; + c3 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 3) * stride]; + } + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4))) = + make_int4(c0.x, c1.x, c2.x, c3.x); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 4)) = + make_int4(c0.y, c1.y, c2.y, c3.y); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 8)) = + make_int4(c0.z, c1.z, c2.z, c3.z); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 12)) = + make_int4(c0.w, c1.w, c2.w, c3.w); + } else { + copy_t c0 = make_zero(); + copy_t c1 = make_zero(); + copy_t c2 = make_zero(); + copy_t c3 = make_zero(); + if (width >= width_start && width < width_end) { + c0 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 0) * stride]; + c1 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 1) * stride]; + c2 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 2) * stride]; + c3 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 3) * stride]; + } + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4))) = + make_int4(c0.x, c1.x, c2.x, c3.x); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 4)) = + make_int4(c0.y, c1.y, c2.y, c3.y); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 8)) = + make_int4(c0.z, c1.z, c2.z, c3.z); + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 12)) = + make_int4(c0.w, c1.w, c2.w, c3.w); + } + } + } + } + + __device__ __forceinline__ void copy() { 
+#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + int width = (w_idx >> (IMMAConfig::wmma_n_bit - 2)) * sw; + int batch = (w_idx & ((IMMAConfig::wmma_n >> 2) - 1)); + if (check_bounds) { + copy_t c0 = make_zero(); + copy_t c1 = make_zero(); + copy_t c2 = make_zero(); + copy_t c3 = make_zero(); + if (width >= width_start && width < width_end && + (batch << 2) < remain) { + c0 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 0) * stride]; + c1 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 1) * stride]; + c2 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 2) * stride]; + c3 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 3) * stride]; + } + reg[i][j][0] = make_int4(c0.x, c1.x, c2.x, c3.x); + reg[i][j][1] = make_int4(c0.y, c1.y, c2.y, c3.y); + reg[i][j][2] = make_int4(c0.z, c1.z, c2.z, c3.z); + reg[i][j][3] = make_int4(c0.w, c1.w, c2.w, c3.w); + } else { + copy_t c0 = make_zero(); + copy_t c1 = make_zero(); + copy_t c2 = make_zero(); + copy_t c3 = make_zero(); + if (width >= width_start && width < width_end) { + c0 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 0) * stride]; + c1 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 1) * stride]; + c2 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 2) * stride]; + c3 = g_ptr[width * width_stride + batch + + ((h_idx << 2) + 3) * stride]; + } + reg[i][j][0] = make_int4(c0.x, c1.x, c2.x, c3.x); + reg[i][j][1] = make_int4(c0.y, c1.y, c2.y, c3.y); + reg[i][j][2] = make_int4(c0.z, c1.z, c2.z, c3.z); + reg[i][j][3] = make_int4(c0.w, c1.w, c2.w, c3.w); + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4))) = reg[i][j][0]; + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 4)) = reg[i][j][1]; + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 8)) = reg[i][j][2]; + *(sh_ptr_as_copy_t(h_idx, (w_idx << 4) + 12)) = reg[i][j][3]; + } + } + } + + template + __device__ __forceinline__ T* sh_ptr_as(int y, int x) { + return reinterpret_cast(sh_ptr(y, x)); + } + + __device__ __forceinline__ copy_t* sh_ptr_as_copy_t(int y, int x) { + return reinterpret_cast(sh_ptr(y, x)); + } + + __device__ __forceinline__ int32_t* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += WarpTileConfig::warp_tile_k * IMMAConfig::wmma_k / 4 * stride; + } + + __device__ __forceinline__ void set_range(const int start, const int end) { + width_start = start, width_end = end; + } +}; +#undef MEGDNN_COMMA + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_fwxco.cuh 
b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_fwxco.cuh new file mode 100644 index 00000000..99ec2fd5 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_fwxco.cuh @@ -0,0 +1,157 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_fwxco.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
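The CIxWOxN IMMA visitor above packs an (output-width, batch-group) pair into one linear w_idx and unpacks it with shifts and masks: width = (w_idx >> (wmma_n_bit - 2)) * sw and batch = w_idx & ((wmma_n >> 2) - 1). The host-side sketch below walks that decomposition with wmma_n = 16 (wmma_n_bit = 4) and sw = 2 as purely illustrative values, so each group of four consecutive w_idx values shares one output column.

#include <cstdio>

// Host-side sketch of the (width, batch) unpacking in the CIxWOxN IMMA visitor.
constexpr int wmma_n_bit = 4;               // illustrative
constexpr int wmma_n     = 1 << wmma_n_bit; // 16
constexpr int sw         = 2;               // convolution stride along width (illustrative)

int main() {
    for (int w_idx = 0; w_idx < 8; ++w_idx) {
        int width = (w_idx >> (wmma_n_bit - 2)) * sw;  // output column * stride
        int batch = (w_idx & ((wmma_n >> 2) - 1));     // batch group within wmma_n
        std::printf("w_idx %d -> width %d, batch group %d\n", w_idx, width, batch);
    }
    return 0;
}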
+ */ +#pragma once +#include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_common.cuh" +#include "src/cuda/convolution_helper/layout.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +#define MEGDNN_COMMA , +template +struct Global2ShareMemVisitorIMMA_FWxCO; + +DEF_GLOBAL_MEMORY_VISITOR(Global2ShareMemVisitorIMMA_FWxCO, + Layout) + using IMMAConfig = typename TileCount::IMMAConfig; + using WarpTileConfig = typename TileCount::WarpTileConfig; + using ThreadConfig = typename TileCount::ThreadConfig; + int stride; + int remain; + int ch_stride; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_h][TileCount::reg_w]; + MEGDNN_STATIC_ASSERT(std::is_same::value == true, + "ldg data type must be int4 for this memory visitor"); + + __device__ __forceinline__ void init_stride(Layout layout) { + stride = layout.width_stride / TileCount::ldg_load_width; + ch_stride = layout.channel_stride / TileCount::ldg_load_width; + } + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = val; + } else { + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = + g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + if (check_bounds) { + copy_t val = make_zero(); + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_h; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; +#pragma unroll + for (int j = 0; j < TileCount::reg_w; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_w && + w_idx >= TileCount::smem_load_x) + continue; + *(sh_ptr_as_copy_t(h_idx, w_idx * load_width)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ int32_t* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ copy_t* sh_ptr_as_copy_t(int y, int x) { + return reinterpret_cast(sh_ptr(y, x)); + } + + __device__ __forceinline__ void move_forward() { + g_ptr += WarpTileConfig::warp_tile_k * ch_stride; + } +}; +#undef MEGDNN_COMMA + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker 
foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_writer/global_memory_writer.cuh b/dnn/src/cuda/convolution_helper/global_memory_writer/global_memory_writer.cuh new file mode 100644 index 00000000..5d6a6150 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_writer/global_memory_writer.cuh @@ -0,0 +1,41 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_writer/global_memory_writer.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer.cuh" +#include "src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_unroll_width.cuh" +#include "src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer.cuh" +#include "src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer_unroll_width.cuh" + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer.cuh b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer.cuh new file mode 100644 index 00000000..285962fa --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer.cuh @@ -0,0 +1,146 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct IConvGlobalMemoryWriter { + using RegBlockConfig = RegBlockConfig_; + using ThreadConfig = ThreadConfig_; + + float alpha; + float beta; + int block_batch_remain; + int block_out_channel_remain; + + __device__ __forceinline__ void init(int32_t* /* smem */, + const float alpha_, + const float beta_) { + alpha = alpha_, beta = beta_; + } + + template + __device__ __forceinline__ void write(BiasVisitor bias, Epilogue epilogue, + BlockConsumer block_consumer) { + static constexpr bool use_wide_store = !(RegBlockConfig::reg_n & 0x1); + static constexpr int pack_size_bit = RegBlockConfig::pack_size_bit; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_m_packed; ++i) { +#pragma unroll + for (int j = 0; j < (RegBlockConfig::reg_n >> 1); ++j) { + int j2 = (j << 1); + int out_channel = ((tidy + i * ThreadConfig::nr_thread_y) + << pack_size_bit); + int batch = (tidx << 1) + j2 * ThreadConfig::nr_thread_x; + int ipack = (i << pack_size_bit); + float4 f_conv0 = + make_float4(block_consumer.reg_acc[j2][ipack], + block_consumer.reg_acc[j2][ipack + 1], + block_consumer.reg_acc[j2][ipack + 2], + block_consumer.reg_acc[j2][ipack + 3]); + float4 f_conv1 = make_float4( + block_consumer.reg_acc[j2 + 1][ipack], + block_consumer.reg_acc[j2 + 1][ipack + 1], + block_consumer.reg_acc[j2 + 1][ipack + 2], + block_consumer.reg_acc[j2 + 1][ipack + 3]); + if (!check_bounds) { + float4 f_bias0 = bias.at(batch, out_channel, 0, 0); + float4 f_bias1 = bias.at(batch + 1, out_channel, 0, 0); + epilogue.apply(alpha, f_conv0, f_conv1, beta, f_bias0, + f_bias1, batch, out_channel, 0, 0); + } else if (out_channel < block_out_channel_remain) { + if (((block_batch_remain & 0x1) == 0) && + batch + 2 <= block_batch_remain) { + float4 f_bias0 = bias.at(batch, out_channel, 0, 0); + float4 f_bias1 = + bias.at(batch + 1, out_channel, 0, 0); + epilogue.apply(alpha, f_conv0, f_conv1, beta, + f_bias0, f_bias1, batch, out_channel, + 0, 0); + } else { +#define store(_i) \ + if (batch + (_i) < block_batch_remain) { \ + float4 f_bias##_i = bias.at(batch + (_i), out_channel, 0, 0); \ + epilogue.apply(alpha, f_conv##_i, beta, f_bias##_i, batch + (_i), \ + out_channel, 0, 0); \ + } + store(0); + store(1); +#undef store + } + } + } + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_m_packed; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_n; ++j) { + int out_channel = ((tidy + i * ThreadConfig::nr_thread_y) + << pack_size_bit); + int batch = tidx + j * ThreadConfig::nr_thread_x; + int ipack = (i << pack_size_bit); + if (check_bounds && + (out_channel >= block_out_channel_remain || + batch >= block_batch_remain)) { + } else { + float4 f_conv = make_float4( + block_consumer.reg_acc[j][ipack], + block_consumer.reg_acc[j][ipack + 1], + block_consumer.reg_acc[j][ipack + 2], + block_consumer.reg_acc[j][ipack + 3]); + float4 f_bias = bias.at(batch, out_channel, 0, 0); + epilogue.apply(alpha, f_conv, beta, f_bias, batch, + out_channel, 0, 0); + } + } + } + } + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_coxhw.cuh b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_coxhw.cuh new file mode 
100644 index 00000000..82d26649 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_coxhw.cuh @@ -0,0 +1,158 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_coxhw.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
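A note on the wide-store path of IConvGlobalMemoryWriter above: when RegBlockConfig::reg_n is even, each thread emits two adjacent batch positions per iteration as a pair of float4 stores, and falls back to the per-element store(_i) macro only when the block's remaining batch count is odd or the pair would cross the boundary. The host-side sketch below reproduces just that coordinate and remainder logic for the bounds-checked (check_bounds) path; every tile parameter here is a hypothetical placeholder, not the real RegBlockConfig/ThreadConfig value, and the sketch is illustrative only.

#include <cstdio>

int main() {
    // Hypothetical stand-ins for RegBlockConfig / ThreadConfig members.
    const int nr_thread_x = 8, nr_thread_y = 4;
    const int reg_n = 4, reg_m_packed = 2, pack_size_bit = 2;  // pack_size = 4
    const int block_batch_remain = 29;          // ragged last block in the batch dim (odd)
    const int block_out_channel_remain = 64;

    const int tidx = 3, tidy = 1;               // one example thread
    for (int i = 0; i < reg_m_packed; ++i) {
        for (int j = 0; j < (reg_n >> 1); ++j) {
            int j2 = j << 1;
            int out_channel = (tidy + i * nr_thread_y) << pack_size_bit;
            int batch = (tidx << 1) + j2 * nr_thread_x;
            bool in_range = out_channel < block_out_channel_remain;
            // Paired float4 store only when the remainder is even and both batches fit.
            bool paired = ((block_batch_remain & 0x1) == 0) &&
                          (batch + 2 <= block_batch_remain);
            printf("i=%d j=%d -> batch=%2d oc=%2d %s\n", i, j, batch, out_channel,
                   !in_range ? "skip" : (paired ? "paired float4 store" : "scalar tail"));
        }
    }
    return 0;
}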
+ */ +#pragma once + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct IConvGlobalMemoryWriter_COxHW { + using RegBlockConfig = RegBlockConfig_; + using ThreadConfig = ThreadConfig_; + + float alpha; + float beta; + int block_out_height_width_remain; + int block_out_channel_remain; + + __device__ __forceinline__ void init(int32_t* /* smem */, + const float alpha_, + const float beta_) { + alpha = alpha_, beta = beta_; + } + + template + __device__ __forceinline__ void write(BiasVisitor bias, Epilogue epilogue, + BlockConsumer block_consumer) { + static constexpr bool use_wide_store = + !(RegBlockConfig::reg_width & 0x1); + static constexpr int pack_size_bit = RegBlockConfig::pack_size_bit; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_m_packed; ++i) { +#pragma unroll + for (int j = 0; j < (RegBlockConfig::reg_width >> 1); ++j) { + int j2 = (j << 1); + int out_channel = ((tidy + i * ThreadConfig::nr_thread_y) + << pack_size_bit); + int out_height_width = + (tidx << 1) + j2 * ThreadConfig::nr_thread_x; + int ipack = (i << pack_size_bit); + float4 f_conv0 = + make_float4(block_consumer.reg_acc[j2][ipack], + block_consumer.reg_acc[j2][ipack + 1], + block_consumer.reg_acc[j2][ipack + 2], + block_consumer.reg_acc[j2][ipack + 3]); + float4 f_conv1 = make_float4( + block_consumer.reg_acc[j2 + 1][ipack], + block_consumer.reg_acc[j2 + 1][ipack + 1], + block_consumer.reg_acc[j2 + 1][ipack + 2], + block_consumer.reg_acc[j2 + 1][ipack + 3]); +// if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && i == 0 && j == 0) { +// printf("acc = %f, %f, %f, %f\n", f_conv0.x, f_conv0.y, f_conv0.z, f_conv0.w); +// } + + if (!check_bounds) { + float4 f_bias0 = + bias.at(0, out_channel, out_height_width); + float4 f_bias1 = + bias.at(0, out_channel, out_height_width + 1); + epilogue.apply(alpha, f_conv0, f_conv1, beta, f_bias0, + f_bias1, 0, out_channel, + out_height_width); + } else if (out_channel < block_out_channel_remain) { + if (((block_out_height_width_remain & 0x1) == 0) && + out_height_width + 2 <= + block_out_height_width_remain) { + float4 f_bias0 = + bias.at(0, out_channel, out_height_width); + float4 f_bias1 = bias.at(0, out_channel, + out_height_width + 1); + epilogue.apply(alpha, f_conv0, f_conv1, beta, + f_bias0, f_bias1, 0, out_channel, + out_height_width); + } else { +#define store(_i) \ + if (out_height_width + (_i) < block_out_height_width_remain) { \ + float4 f_bias##_i = bias.at(0, out_channel, out_height_width); \ + epilogue.apply(alpha, f_conv##_i, beta, f_bias##_i, 0, out_channel, \ + out_height_width + (_i)); \ + } + store(0); + store(1); +#undef store + } + } + } + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_m_packed; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { + int out_channel = ((tidy + i * ThreadConfig::nr_thread_y) + << pack_size_bit); + int out_height_width = tidx + j * ThreadConfig::nr_thread_x; + int ipack = (i << pack_size_bit); + if (check_bounds && + (out_channel >= block_out_channel_remain || + out_height_width >= block_out_height_width_remain)) { + } else { + float4 f_conv = make_float4( + block_consumer.reg_acc[j][ipack], + block_consumer.reg_acc[j][ipack + 1], + block_consumer.reg_acc[j][ipack + 2], + block_consumer.reg_acc[j][ipack + 3]); + float4 f_bias = + bias.at(0, out_channel, out_height_width); + epilogue.apply(alpha, f_conv, 
beta, f_bias, 0, + out_channel, out_height_width); + } + } + } + } + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_unroll_width.cuh b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_unroll_width.cuh new file mode 100644 index 00000000..ad5ec13a --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_unroll_width.cuh @@ -0,0 +1,158 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_writer/iconv_global_memory_writer_unroll_width.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
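The _COxHW variant above differs from the batched writer only in its coordinate frame: accumulators map to (out_channel, out_height*out_width) positions, the bias visitor is addressed as bias.at(0, out_channel, out_height_width), and the tail handling tests block_out_height_width_remain instead of block_batch_remain. (Note that its tail macro samples the bias at out_height_width without the +(_i) offset, unlike its batched counterpart, which looks like an oversight.) All of these writers share the same token-pasting store(_i) macro for the tail; the minimal sketch below, with a hypothetical Vec4/apply stand-in for float4 and the Epilogue, shows the expansion pattern in isolation.

#include <cstdio>

struct Vec4 { float x, y, z, w; };   // stand-in for CUDA's float4 on the host

// Stands in for Epilogue::apply(); just blends the first component for illustration.
static void apply(float alpha, const Vec4& conv, float beta, const Vec4& bias, int pos) {
    printf("store at %d: %g\n", pos, alpha * conv.x + beta * bias.x);
}

int main() {
    const int remain = 1;            // only one valid position left in this block
    const int pos = 0;
    const float alpha = 1.f, beta = 0.f;
    Vec4 f_conv0{1, 2, 3, 4}, f_conv1{5, 6, 7, 8};
    Vec4 f_bias0{0, 0, 0, 0}, f_bias1{0, 0, 0, 0};
#define store(_i)                                                 \
    if (pos + (_i) < remain) {                                    \
        apply(alpha, f_conv##_i, beta, f_bias##_i, pos + (_i));   \
    }
    store(0);   // expands to the f_conv0 / f_bias0 pair and executes
    store(1);   // expands to the f_conv1 / f_bias1 pair but is skipped past the remainder
#undef store
    return 0;
}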
+ */ +#pragma once + +namespace megdnn { +namespace cuda { +namespace convolution { +template +struct IConvGlobalMemoryWriterUnrollWidth { + using RegBlockConfig = RegBlockConfig_; + using ThreadConfig = ThreadConfig_; + + float alpha; + float beta; + int block_batch_remain; + int block_out_channel_remain; + + __device__ __forceinline__ void init(int32_t* /* smem */, + const float alpha_, + const float beta_) { + alpha = alpha_, beta = beta_; + } + + template + __device__ __forceinline__ void write(BiasVisitor bias, Epilogue epilogue, + BlockConsumer block_consumer) { + static constexpr bool use_wide_store = !(RegBlockConfig::reg_n & 0x1); + static constexpr int pack_size_bit = RegBlockConfig::pack_size_bit; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + if (use_wide_store) { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_m_packed; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { +#pragma unroll + for (int k = 0; k < (RegBlockConfig::reg_n >> 1); ++k) { + int k2 = (k << 1); + int out_channel = + ((tidy + i * ThreadConfig::nr_thread_y) + << pack_size_bit); + int batch = + (tidx << 1) + k2 * ThreadConfig::nr_thread_x; + int ipack = (i << pack_size_bit); + float4 f_conv0 = make_float4( + block_consumer.reg_acc[k2][j][ipack], + block_consumer.reg_acc[k2][j][ipack + 1], + block_consumer.reg_acc[k2][j][ipack + 2], + block_consumer.reg_acc[k2][j][ipack + 3]); + float4 f_conv1 = make_float4( + block_consumer.reg_acc[k2 + 1][j][ipack], + block_consumer.reg_acc[k2 + 1][j][ipack + 1], + block_consumer.reg_acc[k2 + 1][j][ipack + 2], + block_consumer.reg_acc[k2 + 1][j][ipack + 3]); + if (!check_bounds) { + float4 f_bias0 = bias.at(batch, out_channel, 0, j); + float4 f_bias1 = + bias.at(batch + 1, out_channel, 0, j); + epilogue.apply(alpha, f_conv0, f_conv1, beta, + f_bias0, f_bias1, batch, out_channel, + 0, j); + } else if (out_channel < block_out_channel_remain) { + if (((block_batch_remain & 0x1) == 0) && + batch + 2 <= block_batch_remain) { + float4 f_bias0 = + bias.at(batch, out_channel, 0, j); + float4 f_bias1 = + bias.at(batch + 1, out_channel, 0, j); + epilogue.apply(alpha, f_conv0, f_conv1, beta, + f_bias0, f_bias1, batch, + out_channel, 0, j); + } else { +#define store(_i) \ + if (batch + (_i) < block_batch_remain) { \ + float4 f_bias##_i = bias.at(batch + (_i), out_channel, 0, j); \ + epilogue.apply(alpha, f_conv##_i, beta, f_bias##_i, batch + (_i), \ + out_channel, 0, j); \ + } + store(0); + store(1); +#undef store + } + } + } + } + } + } else { +#pragma unroll + for (int i = 0; i < RegBlockConfig::reg_m_packed; ++i) { +#pragma unroll + for (int j = 0; j < RegBlockConfig::reg_width; ++j) { +#pragma unroll + for (int k = 0; k < RegBlockConfig::reg_n; ++k) { + int out_channel = + ((tidy + i * ThreadConfig::nr_thread_y) + << pack_size_bit); + int batch = tidx + k * ThreadConfig::nr_thread_x; + int ipack = (i << pack_size_bit); + if (check_bounds && + (out_channel >= block_out_channel_remain || + batch >= block_batch_remain)) { + } else { + float4 f_conv = make_float4( + block_consumer.reg_acc[k][j][ipack], + block_consumer.reg_acc[k][j][ipack + 1], + block_consumer.reg_acc[k][j][ipack + 2], + block_consumer.reg_acc[k][j][ipack + 3]); + float4 f_bias = bias.at(batch, out_channel, 0, j); + epilogue.apply(alpha, f_conv, beta, f_bias, batch, + out_channel, 0, j); + } + } + } + } + } + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker 
foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer.cuh b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer.cuh new file mode 100644 index 00000000..ff5b1de1 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer.cuh @@ -0,0 +1,274 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
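IConvGlobalMemoryWriterUnrollWidth above keeps a three-dimensional accumulator reg_acc[batch][width][pack] and passes the width register index j straight through to bias.at(..., 0, j) and epilogue.apply(..., 0, j). None of these writers combine alpha, beta, the accumulator and the bias themselves; that is delegated to the Epilogue policy. A hypothetical epilogue matching the scalar apply() shape used here might look like the sketch below, assuming the conventional dst = alpha*conv + beta*bias blend; the actual MegEngine epilogues live elsewhere in this commit and may additionally fold in activations and quantization.

#include <cstdio>

// Hypothetical stand-ins; the real kernels use CUDA's float4 and device-side tensors.
struct Vec4 { float x, y, z, w; };

struct NaiveEpilogue {
    float* dst;          // dense output buffer, one float per (batch, channel) in this toy
    int nr_channel;      // leading dimension of the toy layout below

    // Mirrors the scalar apply(alpha, conv, beta, bias, batch, oc, oh, ow) shape,
    // with the usual alpha/beta blend; spatial coordinates are ignored in this toy.
    void apply(float alpha, const Vec4& conv, float beta, const Vec4& bias,
               int batch, int oc, int /*oh*/, int /*ow*/) {
        const float c[4] = {conv.x, conv.y, conv.z, conv.w};
        const float b[4] = {bias.x, bias.y, bias.z, bias.w};
        for (int p = 0; p < 4; ++p)
            dst[batch * nr_channel + oc + p] = alpha * c[p] + beta * b[p];
    }
};

int main() {
    float out[8] = {0};
    NaiveEpilogue epi{out, 8};
    epi.apply(2.f, Vec4{1, 2, 3, 4}, 1.f, Vec4{10, 10, 10, 10}, 0, 0, 0, 0);
    printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  // 12 14 16 18
    return 0;
}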
+ */ +#pragma once +#include +#if CUDA_VERSION >= 10000 +#include +#endif + +namespace megdnn { +namespace cuda { +namespace convolution { +#if __CUDA_ARCH__ >= 730 +using namespace nvcuda; +#endif + +template +struct IConvIMMAGlobalMemoryWriter { + using IMMAConfig = typename GlobalMemoryStoreCount::IMMAConfig; + using WarpTileConfig = typename GlobalMemoryStoreCount::WarpTileConfig; + using ThreadConfig = typename GlobalMemoryStoreCount::ThreadConfig; + using st_type = typename GlobalMemoryStoreCount::copy_t; + static constexpr bool use_wide_store = !(WarpTileConfig::warp_tile_n & 0x1); + static constexpr int pack_size = WarpTileConfig::pack_size; + + int32_t* smem; + float alpha; + float beta; + int block_batch_remain; + int block_out_channel_remain; + + __device__ __forceinline__ void init(int32_t* smem_, const float alpha_, + const float beta_) { + smem = smem_; + alpha = alpha_, beta = beta_; + } + + template + __device__ __forceinline__ void write(BiasVisitor bias, Epilogue epilogue, + BlockConsumer block_consumer) { +#if __CUDA_ARCH__ >= 730 + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int warpx = tidx / ThreadConfig::warp_size; + const int warpy = tidy; + const int idx_intra_warp = tidx & (ThreadConfig::warp_size - 1); + + // store fragment to share memory + if (use_wide_store) { + const int warpx2 = (warpx << 1); + int32_t* st_sh_frag_ptr = + smem + + (warpy * ThreadConfig::nr_warp_x + warpx) * + (IMMAConfig::wmma_m * IMMAConfig::wmma_n << 1); +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma urnoll + for (int j = 0; j < (WarpTileConfig::warp_tile_n >> 1); ++j) { + int j2 = (j << 1); + static int const wmma_n2 = (IMMAConfig::wmma_n << 1); + wmma::store_matrix_sync(st_sh_frag_ptr, + block_consumer.frag_acc[i][j2], + wmma_n2, wmma::mem_row_major); + wmma::store_matrix_sync(st_sh_frag_ptr + IMMAConfig::wmma_n, + block_consumer.frag_acc[i][j2 + 1], + wmma_n2, wmma::mem_row_major); + + const int sh_st_y = + idx_intra_warp / GlobalMemoryStoreCount::store_x; + const int sh_st_x = + idx_intra_warp - + sh_st_y * GlobalMemoryStoreCount::store_x; + const int wmma_tile_h_base = (sh_st_y << 2); + const int wmma_tile_w = + sh_st_x * GlobalMemoryStoreCount::store_width; + if (wmma_tile_h_base + 4 <= IMMAConfig::wmma_m) { + int const b0 = (warpx2 + j2 * ThreadConfig::nr_warp_x) * + IMMAConfig::wmma_n + + wmma_tile_w; + int const ch = (warpy + i * ThreadConfig::nr_warp_y) * + IMMAConfig::wmma_m + + wmma_tile_h_base; + int const b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3; + + st_type lane0 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 0) * + wmma_n2 + + wmma_tile_w])); + st_type lane1 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 1) * + wmma_n2 + + wmma_tile_w])); + st_type lane2 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 2) * + wmma_n2 + + wmma_tile_w])); + st_type lane3 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 3) * + wmma_n2 + + wmma_tile_w])); + + float4 f_conv0 = ::make_float4(lane0.x, lane1.x, + lane2.x, lane3.x); + float4 f_conv1 = ::make_float4(lane0.y, lane1.y, + lane2.y, lane3.y); + float4 f_conv2 = ::make_float4(lane0.z, lane1.z, + lane2.z, lane3.z); + float4 f_conv3 = ::make_float4(lane0.w, lane1.w, + lane2.w, lane3.w); + + // store to global memory + if (!check_bounds) { + float4 f_bias0 = bias.at(b0, ch, 0, 0); + float4 f_bias1 = bias.at(b1, ch, 0, 0); + float4 f_bias2 = bias.at(b2, ch, 0, 0); + float4 f_bias3 = bias.at(b3, ch, 0, 0); + + epilogue.apply(alpha, 
f_conv0, f_conv1, f_conv2, + f_conv3, beta, f_bias0, f_bias1, + f_bias2, f_bias3, b0, ch, 0, 0); + } else if (ch < block_out_channel_remain) { + if (((block_batch_remain & 0x3) == 0) && + b0 + 4 <= block_batch_remain) { + float4 f_bias0 = bias.at(b0, ch, 0, 0); + float4 f_bias1 = bias.at(b1, ch, 0, 0); + float4 f_bias2 = bias.at(b2, ch, 0, 0); + float4 f_bias3 = bias.at(b3, ch, 0, 0); + + epilogue.apply(alpha, f_conv0, f_conv1, f_conv2, + f_conv3, beta, f_bias0, f_bias1, + f_bias2, f_bias3, b0, ch, 0, 0); + } else { +#define store(_idx) \ + if (b0 + _idx < block_batch_remain) { \ + float4 f_bias = bias.at(b##_idx, ch, 0, 0); \ + epilogue.apply(alpha, f_conv##_idx, beta, f_bias, b##_idx, ch, 0, 0); \ + } + store(0); + store(1); + store(2); + store(3); + } + } // end if check bounds + } // end if store bound + } // end j + } // end i + } else { + int32_t* st_sh_frag_ptr = + smem + (warpy * ThreadConfig::nr_warp_x + warpx) * + IMMAConfig::wmma_m * IMMAConfig::wmma_n; + +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma urnoll + for (int j = 0; j < WarpTileConfig::warp_tile_n; ++j) { + wmma::store_matrix_sync( + st_sh_frag_ptr, block_consumer.frag_acc[i][j], + IMMAConfig::wmma_n, wmma::mem_row_major); + const int sh_st_y = + idx_intra_warp / GlobalMemoryStoreCount::store_x; + const int sh_st_x = + idx_intra_warp - + sh_st_y * GlobalMemoryStoreCount::store_x; + const int wmma_tile_h_base = (sh_st_y << 2); + const int wmma_tile_w = + sh_st_x * GlobalMemoryStoreCount::store_width; + if (wmma_tile_h_base + 4 <= IMMAConfig::wmma_m) { + int const b0 = (warpx + j * ThreadConfig::nr_warp_x) * + IMMAConfig::wmma_n + + wmma_tile_w; + int const ch = (warpy + i * ThreadConfig::nr_warp_y) * + IMMAConfig::wmma_m + + wmma_tile_h_base; + int const b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3; + + st_type lane0 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 0) * + IMMAConfig::wmma_n + + wmma_tile_w])); + st_type lane1 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 1) * + IMMAConfig::wmma_n + + wmma_tile_w])); + st_type lane2 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 2) * + IMMAConfig::wmma_n + + wmma_tile_w])); + st_type lane3 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 3) * + IMMAConfig::wmma_n + + wmma_tile_w])); + + float4 f_conv0 = ::make_float4(lane0.x, lane1.x, + lane2.x, lane3.x); + float4 f_conv1 = ::make_float4(lane0.y, lane1.y, + lane2.y, lane3.y); + float4 f_conv2 = ::make_float4(lane0.z, lane1.z, + lane2.z, lane3.z); + float4 f_conv3 = ::make_float4(lane0.w, lane1.w, + lane2.w, lane3.w); + + // store to global memory + if (!check_bounds) { + float4 f_bias0 = bias.at(b0, ch, 0, 0); + float4 f_bias1 = bias.at(b1, ch, 0, 0); + float4 f_bias2 = bias.at(b2, ch, 0, 0); + float4 f_bias3 = bias.at(b3, ch, 0, 0); + epilogue.apply(alpha, f_conv0, f_conv1, f_conv2, + f_conv3, beta, f_bias0, f_bias1, + f_bias2, f_bias3, b0, ch, 0, 0); + } else if (ch < block_out_channel_remain) { + if ((block_batch_remain & 0x3) == 0 && + b0 + 4 <= block_batch_remain) { + float4 f_bias0 = bias.at(b0, ch, 0, 0); + float4 f_bias1 = bias.at(b1, ch, 0, 0); + float4 f_bias2 = bias.at(b2, ch, 0, 0); + float4 f_bias3 = bias.at(b3, ch, 0, 0); + epilogue.apply(alpha, f_conv0, f_conv1, f_conv2, + f_conv3, beta, f_bias0, f_bias1, + f_bias2, f_bias3, b0, ch, 0, 0); + } else { + store(0); + store(1); + store(2); + store(3); +#undef store + } + } // end if check bounds + } // end if store bound + } // end j + } // end i + } +#endif + } +}; + +} // namespace 
convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer_unroll_width.cuh b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer_unroll_width.cuh new file mode 100644 index 00000000..04cf044f --- /dev/null +++ b/dnn/src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer_unroll_width.cuh @@ -0,0 +1,280 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/global_memory_writer/iconv_imma_global_memory_writer_unroll_width.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
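The IMMA writer above cannot read individual accumulator elements straight out of a wmma fragment (the fragment layout is opaque), so it first spills each wmma_m x wmma_n (typically 16x16) int32 tile to shared memory with wmma::store_matrix_sync, then has each lane load four consecutive rows at one column group. Gathering the same vector component across those four rows is effectively a 4x4 transpose that turns row-major (channel, batch) data into one float4 of four packed channels per batch, matching the CHWN4 output layout. The host-side sketch below, with a plain array standing in for the staged shared-memory tile, shows just that repack step.

#include <cstdio>

int main() {
    // Toy stand-in for one staged accumulator tile: rows are output channels,
    // columns are batches (8x8 here instead of wmma_m x wmma_n).
    const int rows = 8, cols = 8;
    int tile[rows][cols];
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            tile[r][c] = r * 100 + c;   // encodes (channel, batch) for easy inspection

    const int row_base = 4;   // like wmma_tile_h_base: 4 consecutive channel rows
    const int col_base = 2;   // like wmma_tile_w: 4 consecutive batch columns

    // lane0..lane3 in the writer are 4-wide loads of rows row_base..row_base+3;
    // each f_conv gathers one column of them, i.e. 4 packed channels for one batch.
    for (int b = 0; b < 4; ++b) {
        printf("batch %d -> channels:", col_base + b);
        for (int r = 0; r < 4; ++r)
            printf(" %d", tile[row_base + r][col_base + b]);
        printf("\n");
    }
    return 0;
}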
+ */ +#pragma once +#include +#if CUDA_VERSION >= 10000 +#include +#endif + +namespace megdnn { +namespace cuda { +namespace convolution { +#if __CUDA_ARCH__ >= 730 +using namespace nvcuda; +#endif + +template +struct IConvIMMAGlobalMemoryWriterUnrollWidth { + using IMMAConfig = typename GlobalMemoryStoreCount::IMMAConfig; + using WarpTileConfig = typename GlobalMemoryStoreCount::WarpTileConfig; + using ThreadConfig = typename GlobalMemoryStoreCount::ThreadConfig; + using st_type = typename GlobalMemoryStoreCount::copy_t; + static constexpr bool consecutive_width_tile = + GlobalMemoryStoreCount::consecutive_width_tile; + static constexpr int pack_size = WarpTileConfig::pack_size; + + int32_t* smem; + float alpha; + float beta; + int block_batch_remain; + int block_out_channel_remain; + + __device__ __forceinline__ void init(int32_t* smem_, const float alpha_, + const float beta_) { + smem = smem_; + alpha = alpha_, beta = beta_; + } + + template + __device__ __forceinline__ void write(BiasVisitor bias, Epilogue epilogue, + BlockConsumer block_consumer) { +#if __CUDA_ARCH__ >= 730 + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int warpx = tidx / ThreadConfig::warp_size; + const int warpy = tidy; + const int idx_intra_warp = tidx & (ThreadConfig::warp_size - 1); + + // store fragment to share memory + if (consecutive_width_tile) { + const int warpx2 = (warpx << 1); + int32_t* st_sh_frag_ptr = + smem + + (warpy * ThreadConfig::nr_warp_x + warpx) * + (IMMAConfig::wmma_m * IMMAConfig::wmma_n << 1); +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma urnoll + for (int j = 0; j < (WarpTileConfig::warp_tile_n >> 1); ++j) { + int j2 = (j << 1); + static int const wmma_n2 = (IMMAConfig::wmma_n << 1); + wmma::store_matrix_sync(st_sh_frag_ptr, + block_consumer.frag_acc[i][j2], + wmma_n2, wmma::mem_row_major); + wmma::store_matrix_sync(st_sh_frag_ptr + IMMAConfig::wmma_n, + block_consumer.frag_acc[i][j2 + 1], + wmma_n2, wmma::mem_row_major); + + const int sh_st_y = + idx_intra_warp / GlobalMemoryStoreCount::store_x; + const int sh_st_x = + idx_intra_warp - + sh_st_y * GlobalMemoryStoreCount::store_x; + const int wmma_tile_h_base = (sh_st_y << 2); + const int wmma_tile_w = + sh_st_x * GlobalMemoryStoreCount::store_width; + if (wmma_tile_h_base + 4 <= IMMAConfig::wmma_m) { + int const b0 = wmma_tile_w & (IMMAConfig::wmma_n - 1); + int const width = + (warpx2 + j2 * ThreadConfig::nr_warp_x) + + (wmma_tile_w >> IMMAConfig::wmma_n_bit); + int const ch = (warpy + i * ThreadConfig::nr_warp_y) * + IMMAConfig::wmma_m + + wmma_tile_h_base; + int const b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3; + + st_type lane0 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 0) * + wmma_n2 + + wmma_tile_w])); + st_type lane1 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 1) * + wmma_n2 + + wmma_tile_w])); + st_type lane2 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 2) * + wmma_n2 + + wmma_tile_w])); + st_type lane3 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 3) * + wmma_n2 + + wmma_tile_w])); + + float4 f_conv0 = ::make_float4(lane0.x, lane1.x, + lane2.x, lane3.x); + float4 f_conv1 = ::make_float4(lane0.y, lane1.y, + lane2.y, lane3.y); + float4 f_conv2 = ::make_float4(lane0.z, lane1.z, + lane2.z, lane3.z); + float4 f_conv3 = ::make_float4(lane0.w, lane1.w, + lane2.w, lane3.w); + + // store to global memory + if (!check_bounds) { + float4 f_bias0 = bias.at(b0, ch, 0, width); + float4 f_bias1 = bias.at(b1, ch, 0, width); + 
float4 f_bias2 = bias.at(b2, ch, 0, width); + float4 f_bias3 = bias.at(b3, ch, 0, width); + + epilogue.apply(alpha, f_conv0, f_conv1, f_conv2, + f_conv3, beta, f_bias0, f_bias1, + f_bias2, f_bias3, b0, ch, 0, width); + } else if (ch < block_out_channel_remain) { + if ((block_batch_remain & 0x3) == 0 && + b0 + 4 <= block_batch_remain) { + float4 f_bias0 = bias.at(b0, ch, 0, width); + float4 f_bias1 = bias.at(b1, ch, 0, width); + float4 f_bias2 = bias.at(b2, ch, 0, width); + float4 f_bias3 = bias.at(b3, ch, 0, width); + + epilogue.apply(alpha, f_conv0, f_conv1, f_conv2, + f_conv3, beta, f_bias0, f_bias1, + f_bias2, f_bias3, b0, ch, 0, + width); + } else { +#define store(_idx) \ + if (b0 + _idx < block_batch_remain) { \ + float4 f_bias = bias.at(b##_idx, ch, 0, width); \ + epilogue.apply(alpha, f_conv##_idx, beta, f_bias, b##_idx, ch, 0, \ + width); \ + } + store(0); + store(1); + store(2); + store(3); + } + } // end if check bounds + } // end if store bound + } // end j + } // end i + } else { + int32_t* st_sh_frag_ptr = + smem + (warpy * ThreadConfig::nr_warp_x + warpx) * + IMMAConfig::wmma_m * IMMAConfig::wmma_n; + +#pragma unroll + for (int i = 0; i < WarpTileConfig::warp_tile_m; ++i) { +#pragma urnoll + for (int j = 0; j < WarpTileConfig::warp_tile_n; ++j) { + wmma::store_matrix_sync( + st_sh_frag_ptr, block_consumer.frag_acc[i][j], + IMMAConfig::wmma_n, wmma::mem_row_major); + const int sh_st_y = + idx_intra_warp / GlobalMemoryStoreCount::store_x; + const int sh_st_x = + idx_intra_warp - + sh_st_y * GlobalMemoryStoreCount::store_x; + const int wmma_tile_h_base = (sh_st_y << 2); + const int wmma_tile_w = + sh_st_x * GlobalMemoryStoreCount::store_width; + if (wmma_tile_h_base + 4 <= IMMAConfig::wmma_m) { + int const b0 = wmma_tile_w; + int const width = warpx + j * ThreadConfig::nr_warp_x; + int const ch = (warpy + i * ThreadConfig::nr_warp_y) * + IMMAConfig::wmma_m + + wmma_tile_h_base; + int const b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3; + + st_type lane0 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 0) * + IMMAConfig::wmma_n + + wmma_tile_w])); + st_type lane1 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 1) * + IMMAConfig::wmma_n + + wmma_tile_w])); + st_type lane2 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 2) * + IMMAConfig::wmma_n + + wmma_tile_w])); + st_type lane3 = *(reinterpret_cast( + &st_sh_frag_ptr[(wmma_tile_h_base + 3) * + IMMAConfig::wmma_n + + wmma_tile_w])); + + float4 f_conv0 = ::make_float4(lane0.x, lane1.x, + lane2.x, lane3.x); + float4 f_conv1 = ::make_float4(lane0.y, lane1.y, + lane2.y, lane3.y); + float4 f_conv2 = ::make_float4(lane0.z, lane1.z, + lane2.z, lane3.z); + float4 f_conv3 = ::make_float4(lane0.w, lane1.w, + lane2.w, lane3.w); + + // store to global memory + if (!check_bounds) { + float4 f_bias0 = bias.at(b0, ch, 0, width); + float4 f_bias1 = bias.at(b1, ch, 0, width); + float4 f_bias2 = bias.at(b2, ch, 0, width); + float4 f_bias3 = bias.at(b3, ch, 0, width); + + epilogue.apply(alpha, f_conv0, f_conv1, f_conv2, + f_conv3, beta, f_bias0, f_bias1, + f_bias2, f_bias3, b0, ch, 0, width); + } else if (ch < block_out_channel_remain) { + if ((block_batch_remain & 0x3) == 0 && + b0 + 4 <= block_batch_remain) { + float4 f_bias0 = bias.at(b0, ch, 0, width); + float4 f_bias1 = bias.at(b1, ch, 0, width); + float4 f_bias2 = bias.at(b2, ch, 0, width); + float4 f_bias3 = bias.at(b3, ch, 0, width); + + epilogue.apply(alpha, f_conv0, f_conv1, f_conv2, + f_conv3, beta, f_bias0, f_bias1, + f_bias2, f_bias3, b0, ch, 0, + width); + } 
else { + store(0); + store(1); + store(2); + store(3); +#undef store + } + } // end if check bounds + } // end if store bound + } // end j + } // end i + } +#endif + } +}; + +} // namespace cuda +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/kernel.cuh b/dnn/src/cuda/convolution_helper/kernel.cuh new file mode 100644 index 00000000..72b598fe --- /dev/null +++ b/dnn/src/cuda/convolution_helper/kernel.cuh @@ -0,0 +1,165 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/kernel.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
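In the consecutive-width path of the unroll-width IMMA writer above, the staged tile packs two width tiles side by side, so the flattened column index wmma_tile_w is split with a mask and a shift: the low bits (wmma_tile_w & (wmma_n - 1)) recover the batch within the tile and the high bits (wmma_tile_w >> wmma_n_bit) select the width tile. A minimal sketch of that decode, assuming wmma_n = 16 and therefore wmma_n_bit = 4 (assumed IMMAConfig values):

#include <cstdio>

int main() {
    const int wmma_n = 16, wmma_n_bit = 4;   // assumed IMMAConfig values
    // Flattened column indices covering two side-by-side width tiles (0..31).
    for (int wmma_tile_w = 0; wmma_tile_w < 2 * wmma_n; wmma_tile_w += 12) {
        int batch_in_tile = wmma_tile_w & (wmma_n - 1);   // low 4 bits
        int width_tile    = wmma_tile_w >> wmma_n_bit;    // 0 or 1
        printf("col %2d -> batch_in_tile %2d, width_tile %d\n",
               wmma_tile_w, batch_in_tile, width_tile);
    }
    return 0;
}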
+ */ +#pragma once +#include "src/cuda/convolution_helper/bias_visitor.cuh" +#include "src/cuda/convolution_helper/config.cuh" +#include "src/cuda/convolution_helper/conv_trait/conv_trait.cuh" +#include "src/cuda/convolution_helper/epilogue.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +template +__global__ void convolution_kernel( + const typename ConvTrait::src_dtype* __restrict__ src, + const typename ConvTrait::filter_dtype* __restrict__ filter, + BiasVisitor bias, Epilogue epilogue, typename ConvTrait::Param param, + float alpha, float beta) { + static bool constexpr check_bounds = ConvTrait::check_bounds; + using BlockTileIterator = typename ConvTrait::BlockTileIterator; + BlockTileIterator block_iterator; + // determine batch, out_channel, out_height, out_width of current thread + // block + block_iterator.template init_with_param(param); + + using DataTileCount = typename ConvTrait::DataTileCount; + using FilterTileCount = typename ConvTrait::FilterTileCount; + + using DataGlobal2ShareMemVisitor = + typename ConvTrait::DataGlobal2ShareMemVisitor; + using FilterGlobal2ShareMemVisitor = + typename ConvTrait::FilterGlobal2ShareMemVisitor; + + using smem_storage_dtype = typename ConvTrait::smem_storage_dtype; + extern __shared__ smem_storage_dtype smem[]; + smem_storage_dtype* smem_src = smem; + smem_storage_dtype* smem_filter = smem + DataTileCount::smem_tot; + smem_storage_dtype* smem_dst = smem_filter + FilterTileCount::smem_tot; + + DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src}; + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; + if (check_bounds) { + block_iterator.template set_remain(src_gl2sh_visitor, + filter_gl2sh_visitor); + } + + using BlockConsumer = typename ConvTrait::BlockConsumer; + BlockConsumer block_consumer; + block_consumer.init_accumulator(); + + block_iterator.template iterate_with_param( + src, filter, param, src_gl2sh_visitor, filter_gl2sh_visitor, + block_consumer); + + using GlobalMemoryWriter = typename ConvTrait::GlobalMemoryWriter; + GlobalMemoryWriter global_memory_writer; + global_memory_writer.init(smem_dst, alpha, beta); + if (check_bounds) { + block_iterator.template set_remain(global_memory_writer); + } + bias.move(block_iterator.block_batch, block_iterator.block_out_channel, + block_iterator.block_out_height, block_iterator.block_out_width); + epilogue.move(block_iterator.block_batch, block_iterator.block_out_channel, + block_iterator.block_out_height, + block_iterator.block_out_width); + global_memory_writer.template write(bias, epilogue, + block_consumer); +} + +template +__global__ void convolution_kernel_precomp_offset( + const typename ConvTrait::src_dtype* __restrict__ src, + const typename ConvTrait::filter_dtype* __restrict__ filter, + const int* __restrict__ offset, BiasVisitor bias, Epilogue epilogue, + typename ConvTrait::Param param, float alpha, float beta) { + static bool constexpr check_bounds = ConvTrait::check_bounds; + using BlockTileIterator = typename ConvTrait::BlockTileIterator; + BlockTileIterator block_iterator; + // determine batch, out_channel, out_height, out_width of current thread + // block + block_iterator.template init_with_param(param); + + using DataTileCount = typename ConvTrait::DataTileCount; + using FilterTileCount = typename ConvTrait::FilterTileCount; + + using DataGlobal2ShareMemVisitor = + typename ConvTrait::DataGlobal2ShareMemVisitor; + using FilterGlobal2ShareMemVisitor = + typename ConvTrait::FilterGlobal2ShareMemVisitor; + + using smem_storage_dtype = 
typename ConvTrait::smem_storage_dtype; + extern __shared__ smem_storage_dtype smem[]; + smem_storage_dtype* smem_src = smem; + smem_storage_dtype* smem_filter = smem + DataTileCount::smem_tot; + smem_storage_dtype* smem_dst = smem_filter + FilterTileCount::smem_tot; + + DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src, offset}; + FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; + if (check_bounds) { + block_iterator.template set_remain(src_gl2sh_visitor, + filter_gl2sh_visitor); + } + + using BlockConsumer = typename ConvTrait::BlockConsumer; + BlockConsumer block_consumer; + block_consumer.init_accumulator(); + + block_iterator.template iterate_with_param( + src, filter, param, src_gl2sh_visitor, filter_gl2sh_visitor, + block_consumer); + + using GlobalMemoryWriter = typename ConvTrait::GlobalMemoryWriter; + GlobalMemoryWriter global_memory_writer; + global_memory_writer.init(smem_dst, alpha, beta); + if (check_bounds) { + block_iterator.template set_remain(global_memory_writer); + } + bias.move(block_iterator.block_batch, block_iterator.block_out_channel, + block_iterator.block_out_height, block_iterator.block_out_width); + epilogue.move(block_iterator.block_batch, block_iterator.block_out_channel, + block_iterator.block_out_height, + block_iterator.block_out_width); + global_memory_writer.template write(bias, epilogue, + block_consumer); +} + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/layout.cuh b/dnn/src/cuda/convolution_helper/layout.cuh new file mode 100644 index 00000000..015c3090 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/layout.cuh @@ -0,0 +1,129 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
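Both kernels in kernel.cuh above carve one dynamically sized extern __shared__ buffer into three regions, source tile, filter tile, and a staging area for the global-memory writer, using the smem_tot constants of the tile-count traits. A stripped-down, self-contained illustration of that partitioning pattern follows; the tile sizes are arbitrary placeholders and partition_demo is a made-up kernel, not part of the tree.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void partition_demo(int data_tot, int filter_tot) {
    extern __shared__ int smem[];
    int* smem_src = smem;                      // [0, data_tot)
    int* smem_filter = smem + data_tot;        // [data_tot, data_tot + filter_tot)
    int* smem_dst = smem_filter + filter_tot;  // staging for the global-memory writer
    if (threadIdx.x == 0) {
        smem_src[0] = 1;
        smem_filter[0] = 2;
        smem_dst[0] = 3;
        printf("src=%d filter=%d dst=%d\n", smem_src[0], smem_filter[0], smem_dst[0]);
    }
}

int main() {
    const int data_tot = 512, filter_tot = 256, dst_tot = 256;   // placeholder tile sizes
    size_t smem_bytes = (data_tot + filter_tot + dst_tot) * sizeof(int);
    partition_demo<<<1, 32, smem_bytes>>>(data_tot, filter_tot);
    cudaDeviceSynchronize();
    return 0;
}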
+ * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/layout.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { + +enum Format { CHWN4, CHWN16, NCHW4 }; + +template +struct Layout; + +template <> +struct Layout { + static constexpr Format format = Format::CHWN4; + int batch_stride; + int channel_stride; + int height_stride; + int width_stride; + + __host__ __device__ __forceinline__ void init(const int batch, + const int /* channel */, + const int height, + const int width) { + batch_stride = 4; + channel_stride = height * width * batch * 4; + height_stride = width * batch * 4; + width_stride = batch * 4; + } + + __device__ __forceinline__ size_t offset(const int batch, const int channel, + const int height, + const int width) { + return batch * batch_stride + (channel >> 2) * channel_stride + + height * height_stride + width * width_stride; + } +}; + +template <> +struct Layout { + static constexpr Format format = Format::CHWN16; + int batch_stride; + int channel_stride; + int height_stride; + int width_stride; + + __host__ __device__ __forceinline__ void init(const int batch, + const int /* channel */, + const int height, + const int width) { + batch_stride = 16; + channel_stride = height * width * batch * 16; + height_stride = width * batch * 16; + width_stride = batch * 16; + } + + __device__ __forceinline__ size_t offset(const int batch, const int channel, + const int height, + const int width) { + return batch * batch_stride + (channel >> 4) * channel_stride + + height * height_stride + width * width_stride; + } +}; + +template <> +struct Layout { + static constexpr Format format = Format::NCHW4; + int batch_stride; + int channel_stride; + int height_stride; + int width_stride; + + __host__ __device__ __forceinline__ void init(const int /* batch */, + const int channel, + const int height, + const int width) { + batch_stride = channel * height * width; + channel_stride = height * width * 4; + height_stride = width * 4; + width_stride = 4; + } + + __device__ __forceinline__ size_t offset(const int batch, const int channel, + const int height, + const int width) { + return batch * batch_stride + (channel >> 2) * channel_stride + + height * height_stride + width * width_stride; + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/parameter.cuh b/dnn/src/cuda/convolution_helper/parameter.cuh new file mode 100644 index 00000000..9ea422c4 --- /dev/null +++ b/dnn/src/cuda/convolution_helper/parameter.cuh @@ -0,0 +1,49 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/parameter.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +namespace megdnn { +namespace cuda { +namespace convolution { + +struct ConvParam { + int n, co, ci, hi, wi, ho, wo, ph, pw, sh, sw, fh, fw; +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convolution_helper/prologue.cuh b/dnn/src/cuda/convolution_helper/prologue.cuh new file mode 100644 index 00000000..91d5539d --- /dev/null +++ b/dnn/src/cuda/convolution_helper/prologue.cuh @@ -0,0 +1,66 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/convolution_helper/prologue.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace convolution { +struct ConvPrologue { + template + static __device__ __forceinline__ void prologue( + const src_dtype* __restrict__& /* src */, + const filter_dtype* __restrict__& /* filter */, + const Param& /* param */, const int /* batch */, + const int /* channel */, const int /* height */, + const int /* width */) {} +}; + +struct BatchConvPrologue { + template + static __device__ __forceinline__ void prologue( + const src_dtype* __restrict__& /* src */, + const filter_dtype* __restrict__& filter, const Param& param, + const int batch, const int /* channel */, const int /* height */, + const int /* width */) { + filter += batch * param.co * param.ci * param.fh * param.fw; + } +}; + +} // namespace convolution +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convpooling/conv_pooling.cuh b/dnn/src/cuda/convpooling/conv_pooling.cuh new file mode 100644 index 00000000..f7d19cfc --- /dev/null +++ b/dnn/src/cuda/convpooling/conv_pooling.cuh @@ -0,0 +1,33 @@ +/** + * \file dnn/src/cuda/convpooling/conv_pooling.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
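For reference, the Layout specializations introduced a little earlier (layout.cuh) reduce to two stride recipes: CHWN4/CHWN16 keep a pack of 4 (or 16) channels innermost, then batch, then width, height and channel pack, while NCHW4 packs 4 channels under each spatial position. The host-side sketch below re-derives the CHWN4 and NCHW4 element offsets with the same arithmetic as the offset() methods, for a made-up shape.

#include <cstddef>
#include <cstdio>

// Same arithmetic as Layout<Format::CHWN4>::offset(): batch varies fastest,
// channels are consumed in packs of 4.
static size_t chwn4_offset(int n, int c, int h, int w,
                           int N, int /*C*/, int H, int W) {
    size_t batch_stride = 4;
    size_t channel_stride = (size_t)H * W * N * 4;
    size_t height_stride = (size_t)W * N * 4;
    size_t width_stride = (size_t)N * 4;
    return n * batch_stride + (c >> 2) * channel_stride +
           h * height_stride + w * width_stride;
}

// Same arithmetic as Layout<Format::NCHW4>::offset(): 4 channels per spatial cell.
static size_t nchw4_offset(int n, int c, int h, int w,
                           int /*N*/, int C, int H, int W) {
    size_t batch_stride = (size_t)C * H * W;
    size_t channel_stride = (size_t)H * W * 4;
    size_t height_stride = (size_t)W * 4;
    size_t width_stride = 4;
    return n * batch_stride + (c >> 2) * channel_stride +
           h * height_stride + w * width_stride;
}

int main() {
    const int N = 2, C = 8, H = 3, W = 3;   // made-up shape
    printf("CHWN4 (n=1,c=5,h=2,w=1): %zu\n", chwn4_offset(1, 5, 2, 1, N, C, H, W));
    printf("NCHW4 (n=1,c=5,h=2,w=1): %zu\n", nchw4_offset(1, 5, 2, 1, N, C, H, W));
    return 0;
}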
+ */ +#pragma once +#include +#include "./conv_pooling.h" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +template +__global__ void kern_xcorr_smallkern_pool( + float *input, + const float *filter, + float *output, + const float *output_bias, + cudaTextureObject_t m_tex, + int IC, int IH, int IW, + int OH, int OW); + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/conv_pooling.h b/dnn/src/cuda/convpooling/conv_pooling.h new file mode 100644 index 00000000..f9658688 --- /dev/null +++ b/dnn/src/cuda/convpooling/conv_pooling.h @@ -0,0 +1,62 @@ +/** + * \file dnn/src/cuda/convpooling/conv_pooling.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +#define NR_PXL_PER_THREAD 4 +#define NR_THREAD_PER_BLOCK 192 +#define MAX_SHARED_MEM_SIZE 32768 //32 * 1024 +#define MAX_TEX_OBJ_SIZE 134217728 //2^27 +#define HEIGHT_EQUALS_WITH_WEIGHT + +enum PoolModeCu { + AVERAGE = 0, + MAX = 1 +}; + +enum ConvModeCu { + CROSS_CORRELATION = 0, + CONVOLUTION = 1 +}; + +enum NonlineModeCu{ + IDENTITY = 0, + RELU = 1, + SIGMOID = 2 +}; + +void start_gpu_xcorr_pool_with_texture_obj( + cudaStream_t stream, + float *input, + const float *kernel, + float *output, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t /*PH*/, size_t /*PW*/, + size_t /*SH*/, size_t /*SW*/, + size_t pool_shape_h, + size_t pool_shape_w, + PoolModeCu poolMode, + ConvModeCu convMode, + NonlineModeCu nonlineMode, + const float *bias); + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/conv_pooling_tex.cu b/dnn/src/cuda/convpooling/conv_pooling_tex.cu new file mode 100644 index 00000000..c31c63b9 --- /dev/null +++ b/dnn/src/cuda/convpooling/conv_pooling_tex.cu @@ -0,0 +1,254 @@ +/** + * \file dnn/src/cuda/convpooling/conv_pooling_tex.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
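The conv_pooling launcher declared above bakes kernel size, pooling window, convolution mode, pooling mode and nonlinearity into template parameters through nested switch/#define dispatch in the conv_pooling_tex.cu translation unit that follows, so every supported combination becomes a separate instantiation of kern_xcorr_smallkern_pool. Two runtime limits gate the launch: the per-output-channel filter slice (FH*FW*IC floats) must fit in the 32 KB shared-memory budget, and the input is processed in batch chunks once it exceeds the texture-object limit. The host-side sketch below recomputes just those two checks for a made-up shape; the constants are copied from conv_pooling.h and the chunking mirrors the launcher's input_stride arithmetic.

#include <cstddef>
#include <cstdio>

int main() {
    // Constants from conv_pooling.h.
    const size_t MAX_SHARED_MEM_SIZE = 32768;      // 32 * 1024
    const size_t MAX_TEX_OBJ_SIZE = 134217728;     // 2^27

    // Made-up problem size.
    size_t N = 64, IC = 32, IH = 128, IW = 128, FH = 3, FW = 3;

    size_t share_size = FH * FW * IC * sizeof(float);
    printf("filter slice in shared memory: %zu bytes (%s)\n", share_size,
           share_size < MAX_SHARED_MEM_SIZE ? "ok" : "too large");

    size_t input_size = N * IC * IH * IW;          // element count, as in the launcher
    size_t input_stride = IC * IH * IW;
    if (input_size < MAX_TEX_OBJ_SIZE) {
        printf("single texture object covers the whole input\n");
    } else {
        size_t batch_chunk = MAX_TEX_OBJ_SIZE / input_stride;
        printf("input split into chunks of %zu batches\n", batch_chunk);
    }
    return 0;
}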
+ */ + +#include "./conv_pooling.cuh" +//#include "./kernel_impl/kernel_impl.h" +#include "./conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +#define NR_PXL_PER_THREAD 4 +#define NR_THREAD_PER_BLOCK 192 +#define MAX_SHARED_MEM_SIZE 32768 //32 * 1024 +#define MAX_TEX_OBJ_SIZE 134217728 //2^27 +#define HEIGHT_EQUALS_WITH_WEIGHT + + + __host__ void create_cuda_tex(float *input, cudaTextureObject_t& tex, + size_t N, size_t IC, size_t IH, size_t IW) { + + struct cudaResourceDesc res_desc; + memset(&res_desc, 0, sizeof(res_desc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = (void*)input; + res_desc.res.linear.sizeInBytes = N * IC * IH * IW * sizeof(float); + res_desc.res.linear.desc = cudaCreateChannelDesc(); + + cudaTextureDesc tex_desc; + memset(&tex_desc, 0, sizeof(tex_desc)); + tex_desc.addressMode[0] = cudaAddressModeClamp; + tex_desc.addressMode[1] = cudaAddressModeClamp; + tex_desc.addressMode[2] = cudaAddressModeClamp; + tex_desc.readMode = cudaReadModeElementType; + CUDA_CHKERR(cudaCreateTextureObject( + &tex, &res_desc, &tex_desc, NULL)); + +} + +void start_gpu_xcorr_pool_with_texture_obj( + cudaStream_t stream, + float *input, + const float *kernel, + float *output, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t /*PH*/, size_t /*PW*/, + size_t /*SH*/, size_t /*SW*/, + size_t pool_shape_h, + size_t pool_shape_w, + PoolModeCu poolMode, + ConvModeCu convMode, + NonlineModeCu nonlineMode, + const float *bias) { + + int nr_batch = N, nr_oc = OC, + output_area2d = OH * OW, + kern_h = FH, kern_w = FW, + nr_thread_per_block = std::min(NR_THREAD_PER_BLOCK, + align_to_warp(output_area2d)), + oplane_nr_split = std::max(1, + output_area2d / (nr_thread_per_block * NR_PXL_PER_THREAD)), + share_size = kern_h * kern_w * IC * sizeof(float); + megdnn_assert(share_size < MAX_SHARED_MEM_SIZE, "kernel too large: " + "total %d bytes per output channel allowed, got %d", + MAX_SHARED_MEM_SIZE, share_size); + + void (*f) (float *input, + const float *filter, + float *output, + const float *output_bias, + cudaTextureObject_t m_tex, + int IC, int IH, int IW, + int OH, int OW) = NULL; + +#define DISPATCH_POOLMODE(nonlin, kh, kw, ph, pw, convMode) \ + do { \ + switch (poolMode) { \ + case AVERAGE: \ + f = kern_xcorr_smallkern_pool; \ + break; \ + case MAX: \ + f = kern_xcorr_smallkern_pool; \ + break; \ + } \ + } while(0) + +#define DISPATCH_CONVMODE(nonlin, kh, kw, ph, pw) \ + do { \ + switch (convMode) { \ + case CONVOLUTION: DISPATCH_POOLMODE \ + (nonlin, kh, kw, ph, pw, IdxGetterConvolution); break; \ + case CROSS_CORRELATION: DISPATCH_POOLMODE\ + (nonlin, kh, kw, ph, pw, IdxGetterCorrRel); break; \ + } \ + } while(0) + +#ifdef HEIGHT_EQUALS_WITH_WEIGHT + +#define DISPATCH_POOLSHAPE(nonlin, kh, kw) \ + do { \ + switch (pool_shape_h) { \ + case 1: DISPATCH_CONVMODE(nonlin, kh, kw, 1, 1); break; \ + case 2: DISPATCH_CONVMODE(nonlin, kh, kw, 2, 2); break; \ + case 3: DISPATCH_CONVMODE(nonlin, kh, kw, 3, 3); break; \ + case 4: DISPATCH_CONVMODE(nonlin, kh, kw, 4, 4); break; \ + } \ + } while(0) + +#define DISPATCH_KERN_H(nonlin) \ + do { \ + switch(kern_h) { \ + case 1: DISPATCH_POOLSHAPE(nonlin, 1, 1); break;\ + case 2: DISPATCH_POOLSHAPE(nonlin, 2, 2); break;\ + case 3: DISPATCH_POOLSHAPE(nonlin, 3, 3); break;\ + case 4: DISPATCH_POOLSHAPE(nonlin, 4, 4); break;\ + case 5: DISPATCH_POOLSHAPE(nonlin, 5, 5); break;\ + case 6: DISPATCH_POOLSHAPE(nonlin, 6, 6); break;\ + case 
7: DISPATCH_POOLSHAPE(nonlin, 7, 7); break;\ + } \ + } while(0) + +#else //HEIGHT_EQUALS_WITH_WEIGHT + +#define DISPATCH_POOLSHAPE_W(nonlin, kh, kw, ph) \ + do { \ + switch (pool_shape_w) { \ + case 1: DISPATCH_CONVMODE(nonlin, kh, kw, ph, 1); break; \ + case 2: DISPATCH_CONVMODE(nonlin, kh, kw, ph, 2); break; \ + case 3: DISPATCH_CONVMODE(nonlin, kh, kw, ph, 3); break; \ + case 4: DISPATCH_CONVMODE(nonlin, kh, kw, ph, 4); break; \ + } \ + } while(0) + +#define DISPATCH_POOLSHAPE_H(nonlin, kern_h, kern_w) \ + do { \ + switch (pool_shape_h) { \ + case 1: DISPATCH_POOLSHAPE_W(nonlin, kern_h, kern_w, 1); break; \ + case 2: DISPATCH_POOLSHAPE_W(nonlin, kern_h, kern_w, 2); break; \ + case 3: DISPATCH_POOLSHAPE_W(nonlin, kern_h, kern_w, 3); break; \ + case 4: DISPATCH_POOLSHAPE_W(nonlin, kern_h, kern_w, 4); break; \ + } \ + } while(0) + +#define DISPATCH_KERN_W(nonlin, kern_h) \ + do { \ + switch(kern_w) { \ + case 1: DISPATCH_POOLSHAPE_H(nonlin, kern_h, 1); break;\ + case 2: DISPATCH_POOLSHAPE_H(nonlin, kern_h, 2); break;\ + case 3: DISPATCH_POOLSHAPE_H(nonlin, kern_h, 3); break;\ + case 4: DISPATCH_POOLSHAPE_H(nonlin, kern_h, 4); break;\ + case 5: DISPATCH_POOLSHAPE_H(nonlin, kern_h, 5); break;\ + case 6: DISPATCH_POOLSHAPE_H(nonlin, kern_h, 6); break;\ + case 7: DISPATCH_POOLSHAPE_H(nonlin, kern_h, 7); break;\ + case 8: DISPATCH_POOLSHAPE_H(nonlin, kern_h, 8); break;\ + } \ + } while(0) + +#define DISPATCH_KERN_H(nonlin) \ + do { \ + switch(kern_h) { \ + case 1: DISPATCH_KERN_W(nonlin, 1); break;\ + case 2: DISPATCH_KERN_W(nonlin, 2); break;\ + case 3: DISPATCH_KERN_W(nonlin, 3); break;\ + case 4: DISPATCH_KERN_W(nonlin, 4); break;\ + case 5: DISPATCH_KERN_W(nonlin, 5); break;\ + case 6: DISPATCH_KERN_W(nonlin, 6); break;\ + case 7: DISPATCH_KERN_W(nonlin, 7); break;\ + case 8: DISPATCH_KERN_W(nonlin, 8); break;\ + } \ + } while(0) + +#endif //HEIGHT_EQUALS_WITH_WEIGHT + switch(nonlineMode) { + case IDENTITY: + DISPATCH_KERN_H(Identity); + break; + case RELU: + DISPATCH_KERN_H(Relu); + break; + + case SIGMOID: + DISPATCH_KERN_H(Sigmoid); + break; + } + + megdnn_assert(f, "Start_gpu_xcorr_pool: unsupported conv-pooling configuration. \ + pool_shape_h %zu, pool_shape_w %zu, kern_h %d, kern_w %d\n", + pool_shape_h, pool_shape_w, kern_h, kern_w); + + cudaTextureObject_t m_tex = 0; + size_t input_size = N * IC * IH * IW; + + // Case 1: Size of input data is less than + // the limit of cudaTextureObject_t. + if(input_size < MAX_TEX_OBJ_SIZE) { + dim3 grid_dim(nr_batch, nr_oc, oplane_nr_split), + block_dim(nr_thread_per_block); + create_cuda_tex(input, m_tex, N, IC, IH, IW); + f<<>>( + input, kernel, output, bias, m_tex, + IC, IH, IW, OH, OW); + } + // Case 2: Size of input data reached + // the limit of cudaTextureObject_t (2^27 Bytes). + else { + size_t input_stride = IC * IH * IW, + output_stride = OC * OH * OW; + int batch_size = MAX_TEX_OBJ_SIZE / input_stride; + float *input_base = input; + float *output_base = output; + for(; nr_batch > 0; nr_batch -= batch_size) { + int cur_batch = nr_batch < batch_size ? 
nr_batch : batch_size; + dim3 grid_dim(cur_batch, nr_oc, oplane_nr_split), + block_dim(nr_thread_per_block); + create_cuda_tex(input_base, m_tex, N, IC, IH, IW); + f<<>>( + input_base, kernel, output_base, bias, m_tex, + IC, IH, IW, OH, OW); + + input_base += batch_size * input_stride; + output_base += batch_size * output_stride; + } + } + CUDA_CHKERR(cudaPeekAtLastError()); + CUDA_CHK_KERN_ERR; + + CUDA_CHKERR(cudaDestroyTextureObject(m_tex)); + m_tex = 0; + //texinput.destory(); +} + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +#undef CUDA_CHKERR +#undef CUDA_CHK_KERN_ERR +#undef NR_PXL_PER_THREAD +#undef NR_THREAD_PER_BLOCK +#undef MAX_SHARED_MEM_SIZE +#undef MAX_TEX_OBJ_SIZE +// vim: syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/convpooling/conv_pooling_utils.cuh b/dnn/src/cuda/convpooling/conv_pooling_utils.cuh new file mode 100644 index 00000000..0d1ea97f --- /dev/null +++ b/dnn/src/cuda/convpooling/conv_pooling_utils.cuh @@ -0,0 +1,192 @@ +/** + * \file dnn/src/cuda/convpooling/conv_pooling_utils.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "src/cuda/utils.cuh" +#include +#include +#include + +//#include "./helper.cuh" + + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +#define CUDA_CHKERR(call) \ + do { \ + cudaError_t code = (call); \ + megdnn_assert(code == cudaSuccess, "cuda err %d: %s (call %s at %s:%s:%d)", \ + int(code), cudaGetErrorString(code), # call, \ + __FILE__, __func__, __LINE__); \ + } while(0) + +#define CUDA_CHK_KERN_ERR CUDA_CHKERR(cudaDeviceSynchronize()); + +static inline int __host__ align_to_warp(int n) { + int x = n / 32 * 32; + if (!x) + x = n; + return x; +} + +// --- Nonline --- +struct Relu { + static __device__ float apply(float x) { + return x > 0 ? 
x : 0; + } +}; + +struct Sigmoid { + static __device__ float apply(float x) { + float exp_value = exp((double) -x); + return 1 / (1 + exp_value); + } +}; + +struct Identity { + static __device__ float apply(float x) { + return x; + } +}; + +// --- Static Reduce --- +template +struct StaticReduce { + static __device__ float apply(const float *val) { + const int half = size / 2; + return Op::apply( + StaticReduce::apply(val), + StaticReduce::apply(val + half)); + } +}; + +template +struct StaticReduce<1, Op> { + static __device__ float apply(const float *val) { + return val[0]; + } +}; + +template +struct StaticReduce<2, Op> { + static __device__ float apply(const float *val) { + return Op::apply(val[0], val[1]); + } +}; + +struct OpAdd { + static __device__ float apply(float a, float b) { + return a + b; + } +}; + +struct OpMax { + static __device__ float apply(float a, float b) { + return max(a, b); + } +}; + +struct IdxGetterConvolution { + static inline __device__ int apply(int kern, int i, int p) { + return kern - i - 1 + p; + } + +}; + +struct IdxGetterCorrRel { + static inline __device__ int apply(int kern, int i, int p) { + return i - p; + } +}; + + +// --- Pooling --- +struct MeanPooler { + template + static __device__ float apply(const float *val) { + const int size = pool_shape_h * pool_shape_w; + return StaticReduce::apply(val) / size; + } +}; + +struct MaxPooler { + template + static __device__ float apply(const float *val) { + return StaticReduce::apply(val); + } +}; + + + // --- Reader --- +class Tex1DReader { + cudaTextureObject_t m_tex; + int m_base_offset, m_chl_stride, m_row_stride, m_row_offset; + //size_t batch_, chal_, height_, weight_; + + public: + // Set attributes of texture Object + /*__device__ void init(cudaTextureObject_t& tex, + size_t batch, size_t chal, size_t height, size_t weight) { + batch_ = batch; + chal_ = chal; + height_ = height; + weight_ = weight; + m_chl_stride = height * weight; + m_row_stride = weight; + } + + __device__ void set_pos(cudaTextureObject_t& tex, + // Current position + size_t n, size_t c, size_t h, size_t w) { + m_tex = tex; + m_base_offset = ((n * chal_ + c) * height_ + h) * weight_ + w; + } + */ + __device__ void set_pos(cudaTextureObject_t& tex, + // Current position + int chal, int height, int weight, int n, int c, int h, int w) { + m_chl_stride = height * weight; + m_row_stride = weight; + m_tex = tex; + m_base_offset = ((n * chal + c) * height + h) * weight + w; + } + + __device__ void reset_row() { + m_row_offset = m_base_offset; + } + + __device__ void next_row() { + m_row_offset += m_row_stride; + } + + __device__ void next_channel() { + m_base_offset += m_chl_stride; + } + + __device__ float get(int /*dr*/, int dc) { + return tex1Dfetch(m_tex, dc + m_row_offset); + } + + __device__ float get(int idx) { + return tex1Dfetch(m_tex, idx + m_base_offset); + } +}; + + extern __host__ void create_cuda_tex(float *input, cudaTextureObject_t& tex, + size_t N, size_t IC, size_t IH, size_t IW); + + + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/config.json b/dnn/src/cuda/convpooling/kernel_impl/config.json new file mode 100644 index 00000000..9a4bd15f --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/config.json @@ -0,0 +1,7 @@ +{ + "templateFile":"kernel_impl.template", + "fileNamePrefix":"kernel_impl", + "kernelSize":"1", + "nonlineType":"Identity", + "nonlineTypeLower":"identity" +} diff --git 
a/dnn/src/cuda/convpooling/kernel_impl/generate_kernel.py b/dnn/src/cuda/convpooling/kernel_impl/generate_kernel.py new file mode 100755 index 00000000..67cd00d4 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/generate_kernel.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import json +import codecs + +def generate_code_file(): + # read config + config = {} + with codecs.open("config.json","rb","UTF-8") as f: + config = json.loads(f.read()) + if not config: + return + + # read template file + s = "" + template = config.get("templateFile") + with codecs.open(template, "rb", "UTF-8") as f: + s = f.read() + if not s: + return + s = s % config + + # save to file + fn = config["fileNamePrefix"] + with codecs.open(fn, "wb", "UTF-8") as f: + f.write(s) + f.flush() + +def generate_a_batch_of_code_file(): + # read config + config = {} + with codecs.open("config.json","rb","UTF-8") as f: + config = json.loads(f.read()) + if not config: + return + + # read template file + s_template = "" + template = config.get("templateFile") + with codecs.open(template, "rb", "UTF-8") as f: + s_template = f.read() + if not s_template: + return + + for i in range(1, 8): + config["kernelSize"] = str(i) + s = s_template % config + + # save to file + fn = config["fileNamePrefix"] + "_" +\ + config["nonlineTypeLower"] +\ + "_ksize" + str(i) + ".cu" + + print('generating {}...'.format(fn)) + + with codecs.open(fn, "wb", "UTF-8") as f: + f.write(s) + f.flush() +if __name__ == '__main__': + generate_a_batch_of_code_file() + try: + generate_code_file() + except Exception, ex: + print(ex) diff --git a/dnn/src/cuda/convpooling/kernel_impl/kern_corr_func_macro.inc b/dnn/src/cuda/convpooling/kernel_impl/kern_corr_func_macro.inc new file mode 100644 index 00000000..b1b5d8aa --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kern_corr_func_macro.inc @@ -0,0 +1,93 @@ +/* + * Format the definition of cuda kernel function "kern_xcorr_smallkern_pool" as + * a macro in order to generate a batch of definition-files. + * The original version of function "kern_xcorr_smallkern_pool" is in the file + * "src/cuda/convpooling/conv_pooling_tex.cu.bak" + */ + +#ifdef _MSC_VER +#define _Pragma __pragma +#endif + +#define KERN_CORR_DEFINE(Nonlin, kern_h, kern_w, pool_shape_h, pool_shape_w, \ + IdxGetter, Pooler) template<> \ + __global__ void kern_xcorr_smallkern_pool \ + ( \ + float *input, \ + const float *filter, \ + float *output, \ + const float *output_bias, \ + cudaTextureObject_t m_tex, \ + int IC, int IH, int IW, \ + int OH, int OW) { \ + const int\ + batch = blockIdx.x,\ + out_chl = blockIdx.y,\ + out_area2d = OH * OW,\ + out_pxl_start = (long long)blockIdx.z * out_area2d / gridDim.z,\ + out_pxl_end = (long long)(blockIdx.z + 1) * out_area2d / gridDim.z,\ + kern_volume = IC * (kern_h * kern_w),\ + thread_id = threadIdx.x,\ + nr_thread = blockDim.x,\ + pool_area = pool_shape_h * pool_shape_w;\ + const float bias = output_bias ? 
output_bias[out_chl] : 0; \ + const float* kernel_global = filter + out_chl * kern_volume;\ + extern __shared__ float kern[];\ + \ +\ + for (int i = thread_id; i < kern_volume; i += nr_thread)\ + kern[i] = kernel_global[i];\ + __syncthreads();\ +\ + float *output_ptr = output + (batch * gridDim.y + out_chl) \ + * out_area2d; \ +\ + Tex1DReader tex_reader;\ + for (int cur_out_pxl = out_pxl_start + thread_id;\ + cur_out_pxl < out_pxl_end;\ + cur_out_pxl += nr_thread) {\ + int ir_base = cur_out_pxl / OW * pool_shape_h,\ + ic_base = cur_out_pxl % OW * pool_shape_w;\ + tex_reader.set_pos(m_tex, IC, IH, IW, batch, 0, ir_base, ic_base);\ + float conv_sum[pool_area];\ +\ +_Pragma("unroll")\ + for (int i = 0; i < pool_area; i ++)\ + conv_sum[i] = bias;\ +\ + const float *kern_ptr = kern;\ + for (int ichl = 0; ichl < IC; ichl ++) {\ + tex_reader.reset_row();\ +_Pragma("unroll")\ + for (int ir = 0; ir < kern_h + pool_shape_h - 1; ir ++) {\ +_Pragma("unroll")\ + for (int ic = 0; ic < kern_w + pool_shape_w - 1; ic ++) {\ + float cur_input = tex_reader.get(ir, ic);\ +_Pragma("unroll")\ + for (int pr = 0; pr < pool_shape_h; pr ++) {\ +_Pragma("unroll")\ + for (int pc = 0; pc < pool_shape_w; pc ++) { \ + int kr = IdxGetter::apply(kern_h, ir, pr);\ + int kc = IdxGetter::apply(kern_w, ic, pc);\ +\ + if (kr >= 0 && kr < kern_h &&\ + kc >= 0 && kc < kern_w)\ + conv_sum[pr * pool_shape_w + pc] += \ + cur_input * kern_ptr[kr * kern_w + kc];\ +\ + } \ + }\ + }\ + tex_reader.next_row();\ + }\ + kern_ptr += kern_h * kern_w;\ + tex_reader.next_channel();\ + }\ + \ +_Pragma("unroll")\ + for (int i = 0; i < pool_area; i ++) {\ + conv_sum[i] = Nonlin::apply(conv_sum[i]);\ + }\ + output_ptr[cur_out_pxl] = Pooler::apply(conv_sum);\ + } \ +} diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl new file mode 100644 index 00000000..0352018c --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl @@ -0,0 +1,13 @@ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Identity, 1) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl.h b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl.h new file mode 100644 index 00000000..f56a339e --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl.h @@ -0,0 +1,49 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
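[Editor's note] The KERN_CORR_DEFINE macro above is hard to read through the line continuations, so here is an independent CPU reference for what one fused output pixel computes in the cross-correlation case (IdxGetterCorrRel) with a ReLU non-linearity and MAX pooling: a pool_h x pool_w patch of biased convolution results is produced, activated, and reduced. This sketch was written for this note and is not code from the repository; it assumes zero padding and a pooling stride equal to the pooling window, which is what the kernel's ir_base/ic_base arithmetic implies.

// CPU reference for one fused conv+pool output pixel (cross-correlation,
// zero padding, pooling stride == pooling window). Illustrative only; the
// caller must pick out_r/out_c so that the window stays inside the input.
#include <algorithm>
#include <vector>

float fused_xcorr_pool_pixel(const std::vector<float>& in, int IC, int IH, int IW,
                             const std::vector<float>& kern, int KH, int KW,
                             int POOL_H, int POOL_W, int out_r, int out_c, float bias) {
    std::vector<float> conv(POOL_H * POOL_W, bias);   // pool window of conv results
    for (int c = 0; c < IC; ++c)
        for (int pr = 0; pr < POOL_H; ++pr)
            for (int pc = 0; pc < POOL_W; ++pc)
                for (int kr = 0; kr < KH; ++kr)
                    for (int kc = 0; kc < KW; ++kc) {
                        int ir = out_r * POOL_H + pr + kr;   // input row, cf. ir_base + ir
                        int ic = out_c * POOL_W + pc + kc;   // input col, cf. ic_base + ic
                        conv[pr * POOL_W + pc] +=
                                in[(c * IH + ir) * IW + ic] * kern[(c * KH + kr) * KW + kc];
                    }
    for (float& v : conv) v = std::max(v, 0.f);              // Relu non-linearity
    return *std::max_element(conv.begin(), conv.end());      // MaxPooler reduction
}

Replacing the final max_element reduction with a mean over the window corresponds to the MeanPooler/AVERAGE path.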
+ */ +#pragma once +#include "../conv_pooling.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +typedef void (*kern_corr_pointer) (float *input, + const float *filter, + float *output, + const float *output_bias, + cudaTextureObject_t m_tex, + int IC, int IH, int IW, + int OH, int OW); + +#include "./kern_corr_func_macro.inc" + +#define DISPATCH_POOLMODE(nonlin, kern_size, pool_size, idx_getter) \ + KERN_CORR_DEFINE(nonlin, kern_size, kern_size, pool_size, pool_size, \ + idx_getter, MeanPooler) \ + KERN_CORR_DEFINE(nonlin, kern_size, kern_size, pool_size, pool_size, \ + idx_getter, MaxPooler) \ + + +#define DISPATCH_CONVMODE(nonlin, kern_size, pool_size) \ + DISPATCH_POOLMODE(nonlin, kern_size, pool_size, IdxGetterConvolution) \ + DISPATCH_POOLMODE(nonlin, kern_size, pool_size, IdxGetterCorrRel) \ + +#define DISPATCH_POOLSHAPE(nonlin, kern_size) \ + DISPATCH_CONVMODE(nonlin, kern_size, 1) \ + DISPATCH_CONVMODE(nonlin, kern_size, 2) \ + DISPATCH_CONVMODE(nonlin, kern_size, 3) \ + DISPATCH_CONVMODE(nonlin, kern_size, 4) + + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl.template b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl.template new file mode 100644 index 00000000..945427fb --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl.template @@ -0,0 +1,13 @@ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(%(nonlineType)s, %(kernelSize)s) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize1.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize1.cu new file mode 100644 index 00000000..084f54e2 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize1.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize1.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Identity, 1) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize2.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize2.cu new file mode 100644 index 00000000..a6cff432 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize2.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize2.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
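[Editor's note] kernel_impl.h above turns KERN_CORR_DEFINE into a small instantiation matrix: DISPATCH_POOLSHAPE(nonlin, ksize) expands through DISPATCH_CONVMODE and DISPATCH_POOLMODE into 4 pooling sizes x 2 index getters x 2 poolers = 16 kernel specializations, and generate_kernel.py stamps one such expansion into each kernel_impl_<nonlin>_ksize<k>.cu so that the instantiations are spread over separate translation units and can be compiled in parallel. The toy below illustrates the same pattern with explicit instantiations instead of the full specializations the macro writes out; every name in it is made up.

// Toy version of the macro-driven "one slice per .cu file" pattern.
// Compile as a .cu file with nvcc; names are illustrative only.
template <int KSIZE, typename Nonlin>
__global__ void toy_kernel(float* out) {
    out[threadIdx.x] = Nonlin::apply(float(KSIZE));
}

struct ToyRelu {
    static __device__ float apply(float x) { return x > 0.f ? x : 0.f; }
};

// One line per instantiation, mirroring what KERN_CORR_DEFINE produces.
#define TOY_DEFINE(nonlin, ksize) \
    template __global__ void toy_kernel<ksize, nonlin>(float*);

#define TOY_DISPATCH_KSIZE(nonlin) \
    TOY_DEFINE(nonlin, 1)          \
    TOY_DEFINE(nonlin, 2)          \
    TOY_DEFINE(nonlin, 3)

// A generated file such as "toy_impl_relu.cu" would then contain just:
TOY_DISPATCH_KSIZE(ToyRelu)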
+ */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Identity, 2) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize3.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize3.cu new file mode 100644 index 00000000..80e5b9e2 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize3.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize3.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Identity, 3) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize4.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize4.cu new file mode 100644 index 00000000..fdb4e2f5 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize4.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize4.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Identity, 4) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize5.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize5.cu new file mode 100644 index 00000000..22bc80e6 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize5.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize5.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Identity, 5) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize6.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize6.cu new file mode 100644 index 00000000..cdc291ce --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize6.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize6.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Identity, 6) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize7.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize7.cu new file mode 100644 index 00000000..68a1f044 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize7.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_identity_ksize7.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Identity, 7) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize1.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize1.cu new file mode 100644 index 00000000..fcfa88b8 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize1.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize1.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Relu, 1) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize2.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize2.cu new file mode 100644 index 00000000..908df0db --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize2.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize2.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Relu, 2) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize3.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize3.cu new file mode 100644 index 00000000..79142abb --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize3.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize3.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Relu, 3) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize4.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize4.cu new file mode 100644 index 00000000..88256e69 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize4.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize4.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Relu, 4) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize5.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize5.cu new file mode 100644 index 00000000..eb14348b --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize5.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize5.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Relu, 5) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize6.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize6.cu new file mode 100644 index 00000000..1efdc713 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize6.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize6.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Relu, 6) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize7.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize7.cu new file mode 100644 index 00000000..d61455c4 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize7.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_relu_ksize7.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Relu, 7) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize1.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize1.cu new file mode 100644 index 00000000..3b24da14 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize1.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize1.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Sigmoid, 1) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize2.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize2.cu new file mode 100644 index 00000000..dab6d3bc --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize2.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize2.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Sigmoid, 2) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize3.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize3.cu new file mode 100644 index 00000000..9501d314 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize3.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize3.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Sigmoid, 3) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize4.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize4.cu new file mode 100644 index 00000000..88487912 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize4.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize4.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Sigmoid, 4) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize5.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize5.cu new file mode 100644 index 00000000..25b04e05 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize5.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize5.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Sigmoid, 5) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize6.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize6.cu new file mode 100644 index 00000000..86121cb5 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize6.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize6.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Sigmoid, 6) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize7.cu b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize7.cu new file mode 100644 index 00000000..58245640 --- /dev/null +++ b/dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize7.cu @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/convpooling/kernel_impl/kernel_impl_sigmoid_ksize7.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./kernel_impl.h" +#include "../conv_pooling_utils.cuh" + +namespace megdnn { +namespace cuda { +namespace conv_pool { + +DISPATCH_POOLSHAPE(Sigmoid, 7) + +} // namespace conv_pool +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/opr_impl.cpp b/dnn/src/cuda/convpooling/opr_impl.cpp new file mode 100644 index 00000000..9ffa6e97 --- /dev/null +++ b/dnn/src/cuda/convpooling/opr_impl.cpp @@ -0,0 +1,216 @@ +/** + * \file dnn/src/cuda/convpooling/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/convpooling/opr_impl.h" +#include "src/cuda/convpooling/conv_pooling.h" +#include "src/cuda/utils.h" +#include "src/cuda/handle.h" + +namespace megdnn { +namespace cuda { +using namespace conv_pool; + +void get_dest_shape(size_t ih, size_t iw, size_t fh, size_t fw, + size_t sh, size_t sw, size_t ph, size_t pw, + size_t &oh, size_t &ow, bool is_floor = true) +{ + megdnn_assert(ih+2*ph >= fh, "input height=%zu, padding height=%zu, " + "filter height=%zu", ih, ph, fh); + megdnn_assert(iw+2*pw >= fw, "input width=%zu, padding width=%zu, " + "filter width=%zu", iw, pw, fw); + megdnn_assert(sh && sw, "invalid stride setting: (%zu, %zu)", sh, sw); + if (is_floor) { + oh = (ih+2*ph-fh)/sh + 1; + ow = (iw+2*pw-fw)/sw + 1; + } else { + oh = (ih+2*ph-fh+sh-1)/sh + 1; + ow = (iw+2*pw-fw+sw-1)/sw + 1; + } +} + +ConvPoolingForwardImpl::ConvPoolingForwardImpl(Handle *handle): + ConvPoolingForward(handle) { + return; +} + +size_t ConvPoolingForwardImpl::get_workspace_in_bytes(const TensorLayout & /*src*/, + const TensorLayout & /*filter*/, + const TensorLayout & /*bias*/, + const TensorLayout & /*dst*/) { + return 0; +} + +void ConvPoolingForwardImpl::deduce_layout( + const TensorLayout & srcl, + const TensorLayout & filterl, + const TensorLayout & /*bias*/, + TensorLayout & dstl) { + + megdnn_assert_contiguous(srcl); + megdnn_assert_contiguous(filterl); + auto &src = srcl.shape; + auto &filter = filterl.shape; + //auto &wsp = workspace.shape; + //wsp = TensorShape({0, 0, 0, 0}); + //megdnn_assert(src.ndim == 4_z, "%s", errmsg_c); + //megdnn_assert(filter.ndim == 4_z, "%s", errmsg_c); + megdnn_assert(srcl.ndim == 4_z, "%s", "src.ndim != 4"); + megdnn_assert(filterl.ndim == 4_z, "%s", "filter.ndim != 4"); + size_t n = src[0]; + size_t ic = src[1]; + size_t ih = src[2]; + size_t iw = src[3]; + size_t oc = filter[0]; + megdnn_assert(filter[1] == ic, "%s", "filter[1] != ic"); + size_t fh = filter[2]; + size_t fw = filter[3]; + size_t conv_sh = this->param().conv_stride_h; + size_t conv_sw = this->param().conv_stride_w; + size_t pool_sh = this->param().pool_stride_h; + size_t pool_sw = this->param().pool_stride_w; + size_t conv_ph = this->param().conv_pad_h; + size_t conv_pw = this->param().conv_pad_w; + size_t pool_ph = this->param().pool_pad_h; + size_t pool_pw = this->param().pool_pad_w; + size_t poolh = this->param().pool_shape_h; + size_t poolw = this->param().pool_shape_w; + size_t conv_oh, conv_ow, oh, ow; + // Shape of the output of convoluation. + get_dest_shape(ih, iw, fh, fw, conv_sh, conv_sw, + conv_ph, conv_pw, conv_oh, conv_ow); + // Shape of the output of pooling. 
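[Editor's note] As a side note on get_dest_shape above: it is the usual sliding-window output-size rule, oh = (ih + 2*ph - fh)/sh + 1 rounded down (or rounded up when is_floor is false), applied first to the convolution and then again to the pooling stage. A standalone check with made-up numbers:

// Standalone check of the output-shape rule used by get_dest_shape above.
#include <cstdio>

size_t dest_size(size_t i, size_t f, size_t s, size_t p, bool is_floor = true) {
    return is_floor ? (i + 2 * p - f) / s + 1
                    : (i + 2 * p - f + s - 1) / s + 1;
}

int main() {
    // 32x32 input, 3x3 filter, stride 1, pad 0 -> 30x30 convolution output.
    size_t conv_oh = dest_size(32, 3, 1, 0);
    // 2x2 pooling window, stride 2, pad 0 -> 15x15 final output.
    size_t oh = dest_size(conv_oh, 2, 2, 0);
    std::printf("conv_oh=%zu oh=%zu\n", conv_oh, oh);  // prints conv_oh=30 oh=15
    return 0;
}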
+ get_dest_shape(conv_oh, conv_ow, poolh, poolw, + pool_sh, pool_sw, pool_ph, pool_pw, oh, ow); + + dstl = TensorLayout(TensorShape{n, oc, oh, ow}, srcl.dtype); + //workspace = Workspace(NULL, 0); + //workspace.gen_default_stride(); +} + +void ConvPoolingForwardImpl::check_layout ( + const TensorLayout & src, + const TensorLayout & filter, + const TensorLayout & bias, + TensorLayout & dst, + size_t /* workspace_limit_in_bytes */ + ) { + + TensorLayout dst_expected; + deduce_layout(src, filter, bias, dst_expected); + megdnn_assert_eq_layout(dst_expected, dst); + + megdnn_assert(bias.shape[1] == dst.shape[1]); + megdnn_assert(dst.shape[1] == filter.shape[0]); +} + +void ConvPoolingForwardImpl::exec(const _megdnn_in TensorND src, + const _megdnn_in TensorND filter, + const _megdnn_in TensorND bias, + _megdnn_out TensorND dst, + _megdnn_out Workspace workspace) { + check_layout(src.layout, filter.layout, bias.layout, dst.layout, workspace.size); + auto stream = cuda_stream(this->handle()); + size_t N = src.layout.shape[0]; + size_t IC = src.layout.shape[1]; + size_t IH = src.layout.shape[2]; + size_t IW = src.layout.shape[3]; + size_t OC = dst.layout.shape[1]; + size_t OH = dst.layout.shape[2]; + size_t OW = dst.layout.shape[3]; + + size_t FH = filter.layout.shape[2]; + size_t FW = filter.layout.shape[3]; + size_t CONV_PH = this->param().conv_stride_h; + size_t CONV_PW = this->param().conv_stride_w; + size_t CONV_SH = this->param().conv_stride_h; + size_t CONV_SW = this->param().conv_stride_w; + size_t POOL_H = this->param().pool_shape_h; + size_t POOL_W = this->param().pool_shape_w; + + PoolModeCu poolMode; + switch(this->param().poolMode) { + case Param::PoolMode::AVERAGE: + poolMode = AVERAGE; + break; + case Param::PoolMode::MAX: + poolMode = MAX; + break; + default: + poolMode = AVERAGE; + } + + ConvModeCu convMode; + switch(this->param().convMode) { + case Param::ConvMode::CROSS_CORRELATION: + convMode = CROSS_CORRELATION; + break; + case Param::ConvMode::CONVOLUTION: + convMode = CONVOLUTION; + break; + default: + convMode = CROSS_CORRELATION; + } + + NonlineModeCu nonlineMode; + switch(this->param().nonlineMode) { + case Param::NonlineMode::IDENTITY: + nonlineMode = IDENTITY; + break; + case Param::NonlineMode::RELU: + nonlineMode = RELU; + break; + case Param::NonlineMode::SIGMOID: + nonlineMode = SIGMOID; + break; + default: + nonlineMode = IDENTITY; + } + + float *src_ptr = static_cast(src.raw_ptr), + *filter_ptr = static_cast(filter.raw_ptr), + *bias_ptr = static_cast(bias.raw_ptr), + *dst_ptr = static_cast(dst.raw_ptr); + + switch (this->param().method) { + case Param::Method::WITH_SHARED_MEM: + // This method is out-of-date. 
+ /* + start_gpu_xcorr_pool_with_shared_mem(stream, src_ptr, filter_ptr, dst_ptr, + N, IC, IH, IW, OC, OH, OW, + FH, FW, CONV_PH, CONV_PW, CONV_SH, CONV_SW, + this->param().pool_shape_w, + poolMode, + this->param().relu, + bias_ptr); + + break; + */ + case Param::Method::WITH_TEXTURE_OBJ: + start_gpu_xcorr_pool_with_texture_obj(stream, src_ptr, filter_ptr, dst_ptr, + N, IC, IH, IW, OC, OH, OW, + FH, FW, CONV_PH, CONV_PW, CONV_SH, CONV_SW, + POOL_H, POOL_W, + poolMode, convMode, nonlineMode, bias_ptr); + break; + + default: + start_gpu_xcorr_pool_with_texture_obj(stream, src_ptr, filter_ptr, dst_ptr, + N, IC, IH, IW, OC, OH, OW, + FH, FW, CONV_PH, CONV_PW, CONV_SH, CONV_SW, + POOL_H, POOL_W, + poolMode, convMode, nonlineMode, bias_ptr); + } +} + + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convpooling/opr_impl.h b/dnn/src/cuda/convpooling/opr_impl.h new file mode 100644 index 00000000..bd2d3c57 --- /dev/null +++ b/dnn/src/cuda/convpooling/opr_impl.h @@ -0,0 +1,64 @@ +/** + * \file dnn/src/cuda/convpooling/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +// This method is out-of-date. +// Use shared memory to store (a part of) the input data. +/* +void start_gpu_xcorr_pool_with_shared_mem( + cudaStream_t stream, + float *input, + const float *kernel, + float *output, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t PH, size_t PW, + size_t SH, size_t SW, + size_t pool_shape, + PoolModeCu poolMode = AVERAGE, + bool relu = true, + const float *bias = NULL); +*/ + +class ConvPoolingForwardImpl final: public ConvPoolingForward { + public: + ConvPoolingForwardImpl(Handle *handle); + void exec( const _megdnn_in TensorND src, + const _megdnn_in TensorND filter, + const _megdnn_in TensorND bias, + _megdnn_out TensorND dst, + _megdnn_out Workspace workspace) override; + void deduce_layout( + const TensorLayout & src, + const TensorLayout & filter, + const TensorLayout & bias, + TensorLayout & dst) override; + void check_layout( + const TensorLayout & src, + const TensorLayout & filter, + const TensorLayout & bias, + TensorLayout & dst, + size_t workspace_limit_in_bytes) override; + size_t get_workspace_in_bytes(const TensorLayout & src, + const TensorLayout & filter, + const TensorLayout & bias, + const TensorLayout & dst) override; +}; + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/cuda/cub/LICENCE b/dnn/src/cuda/cub/LICENCE new file mode 100644 index 00000000..6aeea8da --- /dev/null +++ b/dnn/src/cuda/cub/LICENCE @@ -0,0 +1,24 @@ +Copyright (c) 2010-2011, Duane Merrill. All rights reserved. +Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/dnn/src/cuda/cub/agent/agent_histogram.cuh b/dnn/src/cuda/cub/agent/agent_histogram.cuh new file mode 100644 index 00000000..37b1ec97 --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_histogram.cuh @@ -0,0 +1,787 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . 
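[Editor's note] AgentHistogramPolicy, defined just below, bundles all of AgentHistogram's tuning knobs into a single compile-time type. A hypothetical instantiation makes the parameter order explicit; the values are illustrative rather than cub's tuned defaults, and the include path is only a guess at how this vendored header would be reached.

// Hypothetical tuning policy: 256 threads/block, 8 pixels/thread, blocked
// loads, default cache modifier, RLE compression on, shared-memory bins,
// no work stealing. Values are illustrative only.
#include "agent/agent_histogram.cuh"  // assumes dnn/src/cuda/cub is on the include path

using ExamplePolicy = cub::AgentHistogramPolicy<
        256,                     // _BLOCK_THREADS
        8,                       // _PIXELS_PER_THREAD
        cub::BLOCK_LOAD_DIRECT,  // _LOAD_ALGORITHM
        cub::LOAD_DEFAULT,       // _LOAD_MODIFIER
        true,                    // _RLE_COMPRESS
        cub::SMEM,               // _MEM_PREFERENCE
        false>;                  // _WORK_STEALING

static_assert(ExamplePolicy::BLOCK_THREADS == 256,
              "the policy is a pure compile-time parameter bundle");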
+ */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_load.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * + */ +enum BlockHistogramMemoryPreference +{ + GMEM, + SMEM, + BLEND +}; + + +/** + * Parameterizable tuning policy type for AgentHistogram + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue +struct AgentHistogramPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . + */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. + int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. 
+ int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading samples + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT, ///< Signed integer type for global offsets + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentHistogram +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + /// The pixel type of SampleT + typedef typename CubVector::Type PixelT; + + /// The quad type of SampleT + typedef typename CubVector::Type QuadT; + + /// Constants + enum + { + BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, + + PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, + SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, + QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, + + TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, + TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, + + IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, + + MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? + AgentHistogramPolicyT::MEM_PREFERENCE : + GMEM, + + IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, + }; + + /// Cache load modifier for reading input elements + static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; + + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + SampleIteratorT>::Type // Directly use the supplied input iterator type + WrappedSampleIteratorT; + + /// Pixel input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedPixelIteratorT; + + /// Qaud input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedQuadIteratorT; + + /// Parameterized BlockLoad type for samples + typedef BlockLoad< + SampleT, + BLOCK_THREADS, + SAMPLES_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadSampleT; + + /// Parameterized BlockLoad type for pixels + typedef BlockLoad< + PixelT, + BLOCK_THREADS, + PIXELS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadPixelT; + + /// Parameterized BlockLoad type for quads + typedef BlockLoad< + QuadT, + BLOCK_THREADS, + QUADS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadQuadT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) + + int tile_idx; + + // Aliasable storage layout + union Aliasable + { + typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples + typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels + typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads + + } aliasable; + }; + + + /// 
Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to temp_storage + _TempStorage &temp_storage; + + /// Sample input iterator (with cache modifier applied, if possible) + WrappedSampleIteratorT d_wrapped_samples; + + /// Native pointer for input samples (possibly NULL if unavailable) + SampleT* d_native_samples; + + /// The number of output bins for each channel + int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; + + /// The number of privatized bins for each channel + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; + + /// Reference to gmem privatized histograms for each channel + CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; + + /// Reference to final output histograms (gmem) + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining privatized counter indices from samples, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// Whether to prefer privatized smem counters vs privatized global counters + bool prefer_smem; + + + //--------------------------------------------------------------------- + // Initialize privatized bin counters + //--------------------------------------------------------------------- + + // Initialize privatized bin counters + __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) + { + privatized_histograms[CHANNEL][privatized_bin] = 0; + } + } + + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + } + + + // Initialize privatized bin counters. Specialized for privatized shared-memory counters + __device__ __forceinline__ void InitSmemBinCounters() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + InitBinCounters(privatized_histograms); + } + + + // Initialize privatized bin counters. 
Specialized for privatized global-memory counters + __device__ __forceinline__ void InitGmemBinCounters() + { + InitBinCounters(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Update final output histograms + //--------------------------------------------------------------------- + + // Update final output histograms from privatized histograms + __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + + // Apply privatized bin counts to output bin counts + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_bins = num_privatized_bins[CHANNEL]; + for (int privatized_bin = threadIdx.x; + privatized_bin < channel_bins; + privatized_bin += BLOCK_THREADS) + { + int output_bin = -1; + CounterT count = privatized_histograms[CHANNEL][privatized_bin]; + bool is_valid = count > 0; + + output_decode_op[CHANNEL].template BinSelect((SampleT) privatized_bin, output_bin, is_valid); + + if (output_bin >= 0) + { + atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); + } + + } + } + } + + + // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters + __device__ __forceinline__ void StoreSmemOutput() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + StoreOutput(privatized_histograms); + } + + + // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters + __device__ __forceinline__ void StoreGmemOutput() + { + StoreOutput(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Tile accumulation + //--------------------------------------------------------------------- + + // Accumulate pixels. Specialized for RLE compression. + __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + // Bin pixels + int bins[PIXELS_PER_THREAD]; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + bins[PIXEL] = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); + } + + CounterT accumulator = 1; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) + { + if (bins[PIXEL] != bins[PIXEL + 1]) + { + if (bins[PIXEL] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); + + accumulator = 0; + } + accumulator++; + } + + // Last pixel + if (bins[PIXELS_PER_THREAD - 1] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); + } + } + + + // Accumulate pixels. Specialized for individual accumulation of each pixel. 
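+    //
+    // (The overload above first run-length-compresses each thread's pixels:
+    // consecutive pixels that decode to the same bin are counted locally and
+    // flushed with a single atomicAdd when the bin changes.  For example, a
+    // thread whose four pixels decode to bins {3, 3, 3, 7} issues two atomics,
+    // +3 to bin 3 and +1 to bin 7, instead of four.  The overload below is the
+    // non-compressing case and issues one atomicAdd per valid sample.)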
+ __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int bin = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); + if (bin >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bin, 1); + } + } + } + + + /** + * Accumulate pixel, specialized for smem privatized histogram + */ + __device__ __forceinline__ void AccumulateSmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type()); + } + + + /** + * Accumulate pixel, specialized for gmem privatized histogram + */ + __device__ __forceinline__ void AccumulateGmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type()); + } + + + + //--------------------------------------------------------------------- + // Tile loading + //--------------------------------------------------------------------- + + // Load full, aligned tile using pixel iterator (multi-channel) + template + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); + + // Load using a wrapped pixel iterator + BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples)); + } + + // Load full, aligned tile using quad iterator (single-channel) + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<1> num_active_channels) + { + typedef QuadT AliasedQuads[QUADS_PER_THREAD]; + + WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); + + // Load using a wrapped quad iterator + BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( + d_wrapped_quads, + reinterpret_cast(samples)); + } + + // Load full, aligned tile + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type()); + } + + // Load full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + // Load using sample iterator + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples)); + } + + // Load partially-full, aligned tile using the pixel iterator + 
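+    //
+    // (The LoadTile overloads in this group cover the {full, partial} x
+    // {aligned, mis-aligned} cases; ConsumeTile below picks one at compile
+    // time via tag dispatch, roughly
+    //
+    //     LoadTile(block_offset, valid_samples, samples,
+    //              Int2Type<IS_FULL_TILE>(), Int2Type<IS_ALIGNED>());
+    //
+    // so aligned tiles can be read as vectorized pixels/quads while the
+    // unused load paths are compiled out.)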
__device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); + + int valid_pixels = valid_samples / NUM_CHANNELS; + + // Load using a wrapped pixel iterator + BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples), + valid_pixels); + } + + // Load partially-full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples), + valid_samples); + } + + + //--------------------------------------------------------------------- + // Tile processing + //--------------------------------------------------------------------- + + // Consume a tile of data samples + template < + bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) + bool IS_FULL_TILE> // Whether the tile is full + __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) + { + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; + bool is_valid[PIXELS_PER_THREAD]; + + // Load tile + LoadTile( + block_offset, + valid_samples, + samples, + Int2Type(), + Int2Type()); + + // Set valid flags + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); + + // Accumulate samples +#if CUB_PTX_ARCH >= 120 + if (prefer_smem) + AccumulateSmemPixels(samples, is_valid); + else + AccumulateGmemPixels(samples, is_valid); +#else + AccumulateGmemPixels(samples, is_valid); +#endif + + } + + + // Consume row tiles. 
Specialized for work-stealing from queue + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + + int num_tiles = num_rows * tiles_per_row; + int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; + OffsetT num_even_share_tiles = gridDim.x * gridDim.y; + + while (tile_idx < num_tiles) + { + int row = tile_idx / tiles_per_row; + int col = tile_idx - (row * tiles_per_row); + OffsetT row_offset = row * row_stride_samples; + OffsetT col_offset = (col * TILE_SAMPLES); + OffsetT tile_offset = row_offset + col_offset; + + if (col == tiles_per_row - 1) + { + // Consume a partially-full tile at the end of the row + OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; + ConsumeTile(tile_offset, num_remaining); + } + else + { + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + } + + CTA_SYNC(); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; + + CTA_SYNC(); + + tile_idx = temp_storage.tile_idx; + } + } + + + // Consume row tiles. Specialized for even-share (striped across thread blocks) + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + for (int row = blockIdx.y; row < num_rows; row += gridDim.y) + { + OffsetT row_begin = row * row_stride_samples; + OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); + OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); + + while (tile_offset < row_end) + { + OffsetT num_remaining = row_end - tile_offset; + + if (num_remaining < TILE_SAMPLES) + { + // Consume partial tile + ConsumeTile(tile_offset, num_remaining); + break; + } + + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + tile_offset += gridDim.x * TILE_SAMPLES; + } + } + } + + + //--------------------------------------------------------------------- + // Parameter extraction + //--------------------------------------------------------------------- + + // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) + template < + CacheLoadModifier _MODIFIER, + typename _ValueT, + typename _OffsetT> + __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) + { + return itr.ptr; + } + + // Return a native pixel pointer (specialized for other types) + template + __device__ __forceinline__ SampleT* NativePointer(IteratorT itr) + { + return NULL; + } + + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + + /** + * Constructor + */ + __device__ __forceinline__ AgentHistogram( + TempStorage &temp_storage, ///< Reference to temp_storage + SampleIteratorT d_samples, ///< Input data to reduce + 
int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms + CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel + : + temp_storage(temp_storage.Alias()), + d_wrapped_samples(d_samples), + num_output_bins(num_output_bins), + num_privatized_bins(num_privatized_bins), + d_output_histograms(d_output_histograms), + privatized_decode_op(privatized_decode_op), + output_decode_op(output_decode_op), + d_native_samples(NativePointer(d_wrapped_samples)), + prefer_smem((MEM_PREFERENCE == SMEM) ? + true : // prefer smem privatized histograms + (MEM_PREFERENCE == GMEM) ? + false : // prefer gmem privatized histograms + blockIdx.x & 1) // prefer blended privatized histograms + { + int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; + + // Initialize the locations of this block's privatized histograms + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); + } + + + /** + * Consume image + */ + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks + { + // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) + int quad_mask = AlignBytes::ALIGN_BYTES - 1; + int pixel_mask = AlignBytes::ALIGN_BYTES - 1; + size_t row_bytes = sizeof(SampleT) * row_stride_samples; + + bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel + ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned + ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad + + bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel + ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned + ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel + + // Whether rows are aligned and can be vectorized + if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + else + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + } + + + /** + * Initialize privatized bin counters. 
Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void InitBinCounters() + { + if (prefer_smem) + InitSmemBinCounters(); + else + InitGmemBinCounters(); + } + + + /** + * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void StoreOutput() + { + if (prefer_smem) + StoreSmemOutput(); + else + StoreGmemOutput(); + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/agent_radix_sort_downsweep.cuh b/dnn/src/cuda/cub/agent/agent_radix_sort_downsweep.cuh new file mode 100644 index 00000000..faea8813 --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_radix_sort_downsweep.cuh @@ -0,0 +1,789 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
+ */ + + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_radix_rank.cuh" +#include "../block/block_exchange.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Radix ranking algorithm + */ +enum RadixRankAlgorithm +{ + RADIX_RANK_BASIC, + RADIX_RANK_MEMOIZE, + RADIX_RANK_MATCH +}; + +/** + * Parameterizable tuning policy type for AgentRadixSortDownsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) + RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortDownsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) + static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + + + + +/** + * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
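+ *
+ * (Per tile, the block loads and bit-twiddles its keys, ranks them by the
+ * current digit with BlockRadixRank, exchanges them through shared memory,
+ * and scatters keys (and any associated values) to the global bin offsets
+ * produced by the upsweep/scan passes.  If the spine shows that every item
+ * already belongs to a single digit bin, the block short-circuits to a
+ * plain copy.)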
+ */ +template < + typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< KeyT type + typename ValueT, ///< ValueT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortDownsweep +{ + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + // Appropriate unsigned-bits representation of KeyT + typedef typename Traits::UnsignedBits UnsignedBits; + + static const UnsignedBits LOWEST_KEY = Traits::LOWEST_KEY; + static const UnsignedBits MAX_KEY = Traits::MAX_KEY; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM; + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER; + static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM; + static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM; + + enum + { + BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD, + RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Input iterator wrapper type (for applying cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + typedef CacheModifiedInputIterator ValuesItr; + + // Radix ranking type to use + typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC), + BlockRadixRank, + typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + BlockRadixRank, + BlockRadixRankMatch + >::Type + >::Type BlockRadixRankT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD + }; + + // BlockLoad type (keys) + typedef BlockLoad< + UnsignedBits, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadKeysT; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadValuesT; + + // Value exchange array type + typedef ValueT ValueExchangeT[TILE_ITEMS]; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + typename BlockLoadKeysT::TempStorage load_keys; + typename BlockLoadValuesT::TempStorage load_values; + typename BlockRadixRankT::TempStorage radix_rank; + + struct + { + UnsignedBits exchange_keys[TILE_ITEMS]; + OffsetT relative_bin_offsets[RADIX_DIGITS]; + }; + + Uninitialized exchange_values; + + OffsetT exclusive_digit_prefix[RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Input and output device pointers + KeysItr d_keys_in; + ValuesItr d_values_in; + UnsignedBits *d_keys_out; + ValueT *d_values_out; + + // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // 
Number of bits in current digit + int num_bits; + + // Whether to short-cirucit + int short_circuit; + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + + /** + * Scatter ranked keys through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; + UnsignedBits digit = BFE(key, current_bit, num_bits); + relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit]; + + // Un-twiddle + key = Traits::TwiddleOut(key); + + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key; + } + } + } + + + /** + * Scatter ranked values through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + CTA_SYNC(); + + ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + exchange_values[ranks[ITEM]] = values[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; + + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; + } + } + } + + /** + * Load a tile of keys (specialized for full tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys, valid_items, oob_item); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + 
Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); + } + + + /** + * Load a tile of keys (specialized for partial tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); + } + + + /** + * Load a tile of values (specialized for full tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values); + + CTA_SYNC(); + } + + + /** + * Load a tile of values (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values, valid_items); + + CTA_SYNC(); + } + + + /** + * Load a tile of items (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); + } + + + /** + * Load a tile of items (specialized for partial tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); + } + + + /** + * Truck along associated values + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type /*is_keys_only*/) + { + ValueT values[ITEMS_PER_THREAD]; + + CTA_SYNC(); + + LoadValues( + values, + block_offset, + valid_items, + Int2Type(), + Int2Type()); + + ScatterValues( + values, + relative_bin_offsets, + ranks, + valid_items); + } + + + /** + * Truck along associated values (specialized for key-only sorting) + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + OffsetT /*block_offset*/, + OffsetT /*valid_items*/, + Int2Type /*is_keys_only*/) + {} + + + 
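+    //
+    // Keys-only versus key-value sorting is resolved at compile time: the
+    // KEYS_ONLY constant (true when ValueT is cub::NullType) selects one of
+    // the two GatherScatterValues overloads above via Int2Type tag dispatch,
+    // roughly
+    //
+    //     GatherScatterValues<FULL_TILE>(relative_bin_offsets, ranks,
+    //                                    block_offset, valid_items,
+    //                                    Int2Type<KEYS_ONLY>());
+    //
+    // so all value movement is compiled out of keys-only sorts.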
/** + * Process tile + */ + template + __device__ __forceinline__ void ProcessTile( + OffsetT block_offset, + const OffsetT &valid_items = TILE_ITEMS) + { + UnsignedBits keys[ITEMS_PER_THREAD]; + int ranks[ITEMS_PER_THREAD]; + OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; + + // Assign default (min/max) value to all keys + UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY; + + // Load tile of keys + LoadKeys( + keys, + block_offset, + valid_items, + default_key, + Int2Type(), + Int2Type()); + + // Twiddle key bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + keys[KEY] = Traits::TwiddleIn(keys[KEY]); + } + + // Rank the twiddled keys + int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + BlockRadixRankT(temp_storage.radix_rank).RankKeys( + keys, + ranks, + current_bit, + num_bits, + exclusive_digit_prefix); + + CTA_SYNC(); + + // Share exclusive digit prefix + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Store exclusive prefix + temp_storage.exclusive_digit_prefix[bin_idx] = + exclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Get inclusive digit prefix + int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + { + // Get inclusive digit prefix from exclusive prefix (higher bins come first) + inclusive_digit_prefix[track] = (bin_idx == 0) ? + (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx - 1]; + } + else + { + // Get inclusive digit prefix from exclusive prefix (lower bins come first) + inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ? 
+ (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx + 1]; + } + } + } + + CTA_SYNC(); + + // Update global scatter base offsets for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_offset[track] -= exclusive_digit_prefix[track]; + temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; + bin_offset[track] += inclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Scatter keys + ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); + + // Gather/scatter values + GatherScatterValues(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type()); + } + + //--------------------------------------------------------------------- + // Copy shortcut + //--------------------------------------------------------------------- + + /** + * Copy tiles within the range of input + */ + template < + typename InputIteratorT, + typename T> + __device__ __forceinline__ void Copy( + InputIteratorT d_in, + T *d_out, + OffsetT block_offset, + OffsetT block_end) + { + // Simply copy the input + while (block_offset + TILE_ITEMS <= block_end) + { + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items); + + block_offset += TILE_ITEMS; + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + OffsetT valid_items = block_end - block_offset; + + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); + } + } + + + /** + * Copy tiles within the range of input (specialized for NullType) + */ + template + __device__ __forceinline__ void Copy( + InputIteratorT /*d_in*/, + NullType * /*d_out*/, + OffsetT /*block_offset*/, + OffsetT /*block_end*/) + {} + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD], + OffsetT num_items, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + this->bin_offset[track] = bin_offset[track]; + + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Short circuit if the histogram has only bin counts of only zeros or problem-size + short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items)); + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT num_items, + OffsetT *d_spine, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT 
*d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size + OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx]; + short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); + + // Load my block's bin offset for my bin + bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Distribute keys from a segment of input tiles. + */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + OffsetT block_end) + { + if (short_circuit) + { + // Copy keys + Copy(d_keys_in, d_keys_out, block_offset, block_end); + + // Copy values + Copy(d_values_in, d_values_out, block_offset, block_end); + } + else + { + // Process full tiles of tile_items + #pragma unroll 1 + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessTile(block_offset); + block_offset += TILE_ITEMS; + + CTA_SYNC(); + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + ProcessTile(block_offset, block_end - block_offset); + } + + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/agent_radix_sort_upsweep.cuh b/dnn/src/cuda/cub/agent/agent_radix_sort_upsweep.cuh new file mode 100644 index 00000000..2081cefb --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_radix_sort_upsweep.cuh @@ -0,0 +1,526 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . + */ + +#pragma once + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_load.cuh" +#include "../warp/warp_reduce.cuh" +#include "../block/block_load.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRadixSortUpsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortUpsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
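+ *
+ * (In outline: each block builds a histogram of the current radix digit over
+ * its assigned tiles.  Digit counts are kept as packed 8-bit counters in
+ * shared memory and are periodically aggregated back into wider registers so
+ * the 8-bit counters cannot overflow; with ITEMS_PER_THREAD = 4, for example,
+ * a batch is at most min(64, 255/4) = 63 tiles, i.e. at most 252 increments
+ * per counter between aggregation steps.)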
+ */ +template < + typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type + typename KeyT, ///< KeyT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortUpsweep +{ + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + typedef typename Traits::UnsignedBits UnsignedBits; + + // Integer type for digit counters (to be packed into words of PackedCounters) + typedef unsigned char DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef unsigned int PackedCounter; + + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER; + + enum + { + RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS, + BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS, + KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // To prevent counter overflow, we must periodically unpack and aggregate the + // digit counters back into registers. Each counter lane is assigned to a + // warp for aggregation. + + LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + + // Unroll tiles in batches without risk of counter overflow + UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, + }; + + + // Input iterator wrapper type (for applying cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS]; + OffsetT block_counters[WARP_THREADS][RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields (aggregate state bundle) + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Thread-local counters for periodically aggregating composite-counter lanes + OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO]; + + // Input and output device pointers + KeysItr d_keys_in; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + + + //--------------------------------------------------------------------- + // Helper structure for templated iteration + //--------------------------------------------------------------------- + + // Iterate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys( + AgentRadixSortUpsweep &cta, + UnsignedBits keys[KEYS_PER_THREAD]) + { + cta.Bucket(keys[COUNT]); + + // Next + 
Iterate::BucketKeys(cta, keys); + } + }; + + // Terminate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {} + }; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decode a key and increment corresponding smem digit counter + */ + __device__ __forceinline__ void Bucket(UnsignedBits key) + { + // Perform transform op + UnsignedBits converted_key = Traits::TwiddleIn(key); + + // Extract current digit bits + UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + + // Get sub-counter offset + UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); + + // Get row offset + UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; + + // Increment counter + temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++; + } + + + /** + * Reset composite counters + */ + __device__ __forceinline__ void ResetDigitCounters() + { + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES; LANE++) + { + temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0; + } + } + + + /** + * Reset the unpacked counters in each thread + */ + __device__ __forceinline__ void ResetUnpackedCounters() + { + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + local_counts[LANE][UNPACKED_COUNTER] = 0; + } + } + } + + + /** + * Extracts and aggregates the digit counters for each counter lane + * owned by this warp + */ + __device__ __forceinline__ void UnpackDigitCounts() + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + const int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + #pragma unroll + for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; + local_counts[LANE][UNPACKED_COUNTER] += counter; + } + } + } + } + } + + + /** + * Processes a single, full tile + */ + __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset) + { + // Tile of keys + UnsignedBits keys[KEYS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); + + // Prevent hoisting + CTA_SYNC(); + + // Bucket tile of keys + Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); + } + + + /** + * Processes a single load (may have some threads masked off) + */ + __device__ __forceinline__ void ProcessPartialTile( + OffsetT block_offset, + const OffsetT &block_end) + { + // Process partial tile if necessary using single loads + block_offset += threadIdx.x; + while (block_offset < block_end) + { + // Load and bucket key + UnsignedBits key = d_keys_in[block_offset]; + Bucket(key); + block_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortUpsweep( + TempStorage &temp_storage, + const KeyT *d_keys_in, 
+ int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + current_bit(current_bit), + num_bits(num_bits) + {} + + + /** + * Compute radix digit histograms from a segment of input tiles. + */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + const OffsetT &block_end) + { + // Reset digit counters in smem and unpacked counters in registers + ResetDigitCounters(); + ResetUnpackedCounters(); + + // Unroll batches of full tiles + while (block_offset + UNROLLED_ELEMENTS <= block_end) + { + for (int i = 0; i < UNROLL_COUNT; ++i) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + CTA_SYNC(); + + // Aggregate back into local_count registers to prevent overflow + UnpackDigitCounts(); + + CTA_SYNC(); + + // Reset composite counters in lanes + ResetDigitCounters(); + } + + // Unroll single full tiles + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Process partial tile if necessary + ProcessPartialTile( + block_offset, + block_end); + + CTA_SYNC(); + + // Aggregate back into local_count registers + UnpackDigitCounts(); + } + + + /** + * Extract counts (saving them to the external array) + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT *counters, + int bin_stride = 1, + int bin_offset = 0) + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + + // Whole blocks + #pragma unroll + for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; + (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS; + BIN_BASE += BLOCK_THREADS) + { + int bin_idx = BIN_BASE + threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + + // Remainder + if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS)) + { + int bin_idx = threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + } + + + /** + * Extract counts + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_count[track] = 0; + + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count[track] += temp_storage.block_counters[i][bin_idx]; + } + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/agent_reduce.cuh b/dnn/src/cuda/cub/agent/agent_reduce.cuh new file mode 100644 index 00000000..000a905c --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_reduce.cuh @@ -0,0 +1,385 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . 
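 * \par
 * AgentReduce is an internal building block: it is instantiated and launched by the
 * DeviceReduce / DeviceSegmentedReduce dispatch layer rather than being called directly.
 * A minimal host-side sketch of the usual entry point is shown below (illustrative only;
 * it assumes device buffers \p d_in of \p num_items ints and a single-int \p d_out that
 * have already been allocated):
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * // First call queries the required temporary storage; second call performs the reduction.
 * void   *d_temp_storage     = NULL;
 * size_t  temp_storage_bytes = 0;
 * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
 * \endcode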
+ */ + +#pragma once + +#include + +#include "../block/block_load.cuh" +#include "../block/block_reduce.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_even_share.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduce + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use + CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements +struct AgentReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. + */ +template < + typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type + typename InputIteratorT, ///< Random-access iterator type for input + typename OutputIteratorT, ///< Random-access iterator type for output + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) +struct AgentReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type InputT; + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + /// Vector type of InputT for data movement + typedef typename CubVector::Type VectorT; + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + /// Constants + enum + { + BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type + ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && + (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && + (IsPointer::VALUE) && Traits::PRIMITIVE, + + }; + + static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; + static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; + + /// Parameterized BlockReduce primitive + typedef BlockReduce BlockReduceT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + typename BlockReduceT::TempStorage reduce; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + InputIteratorT d_in; ///< Input data to reduce + WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + + + //--------------------------------------------------------------------- + // Utility + //--------------------------------------------------------------------- + + + // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type /*can_vectorize*/) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator /*d_in*/, + Int2Type /*can_vectorize*/) + { + return false; + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentReduce( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data to reduce + ReductionOp reduction_op) ///< Binary reduction operator + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_wrapped_in(d_in), + reduction_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Tile consumption + //--------------------------------------------------------------------- + + /** + * Consume a full tile of input (non-vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type 
/*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + OutputT items[ITEMS_PER_THREAD]; + + // Load items in striped fashion + LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? + internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a full tile of input (vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Alias items as an array of VectorT and load it in striped fashion + enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; + + // Fabricate a vectorized input iterator + InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); + CacheModifiedInputIterator d_vec_in( + reinterpret_cast(d_in_unqualified)); + + // Load items as vector items + InputT input_items[ITEMS_PER_THREAD]; + VectorT *vec_items = reinterpret_cast(input_items); + #pragma unroll + for (int i = 0; i < WORDS; ++i) + vec_items[i] = d_vec_in[BLOCK_THREADS * i]; + + // Convert from input type to output type + OutputT items[ITEMS_PER_THREAD]; + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + items[i] = input_items[i]; + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? + internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a partial tile of input + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int valid_items, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Partial tile + int thread_offset = threadIdx.x; + + // Read first item + if ((IS_FIRST_TILE) && (thread_offset < valid_items)) + { + thread_aggregate = d_wrapped_in[block_offset + thread_offset]; + thread_offset += BLOCK_THREADS; + } + + // Continue reading items (block-striped) + while (thread_offset < valid_items) + { + OutputT item = d_wrapped_in[block_offset + thread_offset]; + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------- + // Consume a contiguous segment of tiles + //--------------------------------------------------------------------- + + /** + * \brief Reduce a contiguous segment of input tiles + */ + template + __device__ __forceinline__ OutputT ConsumeRange( + GridEvenShare &even_share, ///< GridEvenShare descriptor + Int2Type can_vectorize) ///< Whether or not we can vectorize loads + { + OutputT thread_aggregate; + + if (even_share.block_offset + TILE_ITEMS > even_share.block_end) + { + // First tile isn't full (not all threads have valid items) + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + return 
BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); + } + + // At least one full block + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + + // Consume subsequent full tiles of input + while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) + { + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + } + + // Consume a partially-full tile + if (even_share.block_offset < even_share.block_end) + { + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + } + + // Compute block-wide reduction (all threads have valid items) + return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); + } + + + /** + * \brief Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeRange( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + GridEvenShare even_share; + even_share.template BlockInit(block_offset, block_end); + + return (IsAligned(d_in + block_offset, Int2Type())) ? + ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + } + + + /** + * Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeTiles( + GridEvenShare &even_share) ///< [in] GridEvenShare descriptor + { + // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block + even_share.template BlockInit(); + + return (IsAligned(d_in, Int2Type())) ? + ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/agent_reduce_by_key.cuh b/dnn/src/cuda/cub/agent/agent_reduce_by_key.cuh new file mode 100644 index 00000000..51964d3e --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_reduce_by_key.cuh @@ -0,0 +1,547 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduceByKey + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentReduceByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentReduceByKey +{ + //--------------------------------------------------------------------- 
+ // Types and constants + //--------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair OffsetValuePairT; + + // Tuple type for pairing keys and values + typedef KeyValuePair KeyValuePairT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Guarded inequality functor + template + struct GuardedInequalityWrapper + { + _EqualityOpT op; ///< Wrapped equality operator + int num_remaining; ///< Items remaining + + /// Constructor + __host__ __device__ __forceinline__ + GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const + { + if (idx < num_remaining) + return !op(a, b); // In bounds + + // Return true if first out-of-bounds item, false otherwise + return (idx == num_remaining); + } + }; + + + // Constants + enum + { + BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + KeysInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedKeysInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + ValuesInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedValuesInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for keys + typedef BlockLoad< + 
KeyOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadKeysT; + + // Parameterized BlockLoad type for values + typedef BlockLoad< + ValueOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadValuesT; + + // Parameterized BlockDiscontinuity type for keys + typedef BlockDiscontinuity< + KeyOutputT, + BLOCK_THREADS> + BlockDiscontinuityKeys; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetValuePairT, + BLOCK_THREADS, + AgentReduceByKeyPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OffsetValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Key and value exchange types + typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1]; + typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading keys + typename BlockLoadKeysT::TempStorage load_keys; + + // Smem needed for loading values + typename BlockLoadValuesT::TempStorage load_values; + + // Smem needed for compacting key value pairs(allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedKeysInputIteratorT d_keys_in; ///< Input keys + UniqueOutputIteratorT d_unique_out; ///< Unique output keys + WrappedValuesInputIteratorT d_values_in; ///< Input values + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out; ///< Output pointer for total number of segments identified + EqualityOpT equality_op; ///< KeyT equality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentReduceByKey( + TempStorage& temp_storage, ///< Reference to temp_storage + KeysInputIteratorT d_keys_in, ///< Input keys + UniqueOutputIteratorT d_unique_out, ///< Unique output keys + ValuesInputIteratorT d_values_in, ///< Input values + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out, ///< Output pointer for total number of segments identified + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_keys_in(d_keys_in), + d_unique_out(d_unique_out), + d_values_in(d_values_in), + d_aggregates_out(d_aggregates_out), + d_num_runs_out(d_num_runs_out), + equality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + 
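    //---------------------------------------------------------------------
    // Usage sketch (illustrative only)
    //---------------------------------------------------------------------

    // AgentReduceByKey is not meant to be used directly; it is instantiated by the
    // DeviceReduce::ReduceByKey dispatch. A minimal host-side sketch, assuming device
    // buffers d_keys_in / d_values_in of num_items ints and pre-allocated outputs
    // d_unique_out, d_aggregates_out and d_num_runs_out:
    //
    //     void   *d_temp_storage     = NULL;
    //     size_t  temp_storage_bytes = 0;
    //     cub::DeviceReduce::ReduceByKey(
    //         d_temp_storage, temp_storage_bytes,
    //         d_keys_in, d_unique_out,
    //         d_values_in, d_aggregates_out,
    //         d_num_runs_out, cub::Sum(), num_items);   // size query
    //     cudaMalloc(&d_temp_storage, temp_storage_bytes);
    //     cub::DeviceReduce::ReduceByKey(
    //         d_temp_storage, temp_storage_bytes,
    //         d_keys_in, d_unique_out,
    //         d_values_in, d_aggregates_out,
    //         d_num_runs_out, cub::Sum(), num_items);   // run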
//--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Directly scatter flagged items to output offsets + */ + __device__ __forceinline__ void ScatterDirect( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD]) + { + // Scatter flagged keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key; + d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value; + } + } + } + + + /** + * 2-phase scatter flagged items to output offsets + * + * The exclusive scan causes each head flag to be paired with the previous + * value aggregate: the scatter offsets must be decremented for value aggregates + */ + __device__ __forceinline__ void ScatterTwoPhase( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + CTA_SYNC(); + + // Compact and scatter pairs + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) + { + KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item]; + d_unique_out[num_tile_segments_prefix + item] = pair.key; + d_aggregates_out[num_tile_segments_prefix + item] = pair.value; + } + } + + + /** + * Scatter flagged items + */ + __device__ __forceinline__ void Scatter( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one + if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS)) + { + ScatterTwoPhase( + scatter_items, + segment_flags, + segment_indices, + num_tile_segments, + num_tile_segments_prefix); + } + else + { + ScatterDirect( + scatter_items, + segment_flags, + segment_indices); + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys + KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up + ValueOutputT values[ITEMS_PER_THREAD]; // Tile values + OffsetT head_flags[ITEMS_PER_THREAD]; // Segment head flags + OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices + OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices + KeyValuePairT 
scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering + + // Load keys + if (IS_LAST_TILE) + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining); + else + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); + + // Load tile predecessor key in first thread + KeyOutputT tile_predecessor; + if (threadIdx.x == 0) + { + tile_predecessor = (tile_idx == 0) ? + keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head) + d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile + } + + CTA_SYNC(); + + // Load values + if (IS_LAST_TILE) + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining); + else + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); + + CTA_SYNC(); + + // Initialize head-flags and shuffle up the previous keys + if (IS_LAST_TILE) + { + // Use custom flag operator to additionally flag the first out-of-bounds item + GuardedInequalityWrapper flag_op(equality_op, num_remaining); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + else + { + InequalityWrapper flag_op(equality_op); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + + // Zip values and head flags + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scan_items[ITEM].value = values[ITEM]; + scan_items[ITEM].key = head_flags[ITEM]; + } + + // Perform exclusive tile scan + OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate + OffsetT num_segments_prefix; // Number of segments prior to this tile + OffsetValuePairT total_aggregate; // The tile prefix folded with block_aggregate + if (tile_idx == 0) + { + // Scan first tile + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); + num_segments_prefix = 0; + total_aggregate = block_aggregate; + + // Update tile status if there are successor tiles + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op); + + block_aggregate = prefix_op.GetBlockAggregate(); + num_segments_prefix = prefix_op.GetExclusivePrefix().key; + total_aggregate = prefix_op.GetInclusivePrefix(); + } + + // Rezip scatter items and segment indices + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scatter_items[ITEM].key = prev_keys[ITEM]; + scatter_items[ITEM].value = scan_items[ITEM].value; + segment_indices[ITEM] = scan_items[ITEM].key; + } + + // At this point, each flagged segment head has: + // - The key for the previous segment + // - The reduced value from the previous segment + // - The segment index for the reduced value + + // Scatter flagged keys and values + OffsetT num_tile_segments = block_aggregate.key; + Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix); + + // Last thread in last tile will output final count (and last pair, if necessary) + if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1)) + { + OffsetT num_segments = num_segments_prefix + num_tile_segments; + + // If the last tile is a whole tile, output the final_value 
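            // (A full last tile has no out-of-bounds sentinel element to be flagged as a
            // segment head, so the aggregate of the final segment was never scattered above;
            // emit its key and running value here and bump the segment count accordingly.)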
+ if (num_remaining == TILE_ITEMS) + { + d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; + d_aggregates_out[num_segments] = total_aggregate.value; + num_segments++; + } + + // Output the total number of items selected + *d_num_runs_out = num_segments; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/agent_rle.cuh b/dnn/src/cuda/cub/agent/agent_rle.cuh new file mode 100644 index 00000000..cb7a4a65 --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_rle.cuh @@ -0,0 +1,837 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode. 
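 * \par
 * AgentRle is an internal building block: it is launched by the DeviceRunLengthEncode
 * dispatch layer rather than being invoked directly. A minimal host-side sketch of the
 * usual entry point is shown below (illustrative only; it assumes device buffers \p d_in
 * of \p num_items items and pre-allocated outputs \p d_offsets_out, \p d_lengths_out and
 * \p d_num_runs_out):
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * // First call queries the required temporary storage; second call identifies the runs.
 * void   *d_temp_storage     = NULL;
 * size_t  temp_storage_bytes = 0;
 * cub::DeviceRunLengthEncode::NonTrivialRuns(
 *     d_temp_storage, temp_storage_bytes,
 *     d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceRunLengthEncode::NonTrivialRuns(
 *     d_temp_storage, temp_storage_bytes,
 *     d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
 * \endcode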
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRle + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentRlePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for data + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values + typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRle +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type T; + + /// The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + /// Tuple type for scanning (pairs run-length and run-index) + typedef KeyValuePair LengthOffsetPair; + + /// Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, + WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// Whether or not to sync after loading data + SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, + ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, + }; + + + /** + * Special operator that signals all out-of-bounds items are not equal to everything else, + * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked + * trivial. + */ + template + struct OobInequalityOp + { + OffsetT num_remaining; + EqualityOpT equality_op; + + __device__ __forceinline__ OobInequalityOp( + OffsetT num_remaining, + EqualityOpT equality_op) + : + num_remaining(num_remaining), + equality_op(equality_op) + {} + + template + __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx) + { + if (!LAST_TILE || (idx < num_remaining)) + return !equality_op(first, second); + else + return true; + } + }; + + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for data + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedVLengthnputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Parameterized BlockLoad type for data + typedef BlockLoad< + T, + AgentRlePolicyT::BLOCK_THREADS, + AgentRlePolicyT::ITEMS_PER_THREAD, + AgentRlePolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockDiscontinuity type for data + typedef BlockDiscontinuity BlockDiscontinuityT; + + // Parameterized WarpScan type + typedef WarpScan WarpScanPairs; + + // Reduce-length-by-run scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + LengthOffsetPair, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Warp exchange types + typedef WarpExchange WarpExchangePairs; + + typedef typename If::Type WarpExchangePairsStorage; + + typedef WarpExchange WarpExchangeOffsets; + typedef WarpExchange WarpExchangeLengths; + + typedef LengthOffsetPair WarpAggregates[WARPS]; + + // Shared memory type for this thread block + struct _TempStorage + { + // Aliasable storage layout + union Aliasable + { + struct + { + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans + Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for input loading + typename BlockLoadT::TempStorage 
load; + + // Aliasable layout needed for two-phase scatter + union ScatterAliasable + { + unsigned long long align; + WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS]; + + } scatter_aliasable; + + } aliasable; + + OffsetT tile_idx; // Shared tile index + LengthOffsetPair tile_inclusive; // Inclusive tile prefix + LengthOffsetPair tile_exclusive; // Exclusive tile prefix + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + + WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets + LengthsOutputIteratorT d_lengths_out; ///< Output run lengths + + EqualityOpT equality_op; ///< T equality operator + ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentRle( + TempStorage &temp_storage, ///< [in] Reference to temp_storage + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths + EqualityOpT equality_op, ///< [in] T equality operator + OffsetT num_items) ///< [in] Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_offsets_out(d_offsets_out), + d_lengths_out(d_lengths_out), + equality_op(equality_op), + scan_op(cub::Sum()), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_remaining, + T (&items)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + bool head_flags[ITEMS_PER_THREAD]; + bool tail_flags[ITEMS_PER_THREAD]; + + OobInequalityOp inequality_op(num_remaining, equality_op); + + if (FIRST_TILE && LAST_TILE) + { + // First-and-last-tile always head-flags the first item and tail-flags the last item + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, items, inequality_op); + } + else if (FIRST_TILE) + { + // First-tile always head-flags the first item + + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, tile_successor_item, items, inequality_op); + } + else if (LAST_TILE) + { + // Last-tile always flags the last item + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + 
tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tile_predecessor_item, tail_flags, items, inequality_op); + } + else + { + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op); + } + + // Zip counts and runs + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]); + lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM])); + } + } + + //--------------------------------------------------------------------- + // Scan utility methods + //--------------------------------------------------------------------- + + /** + * Scan of allocations + */ + __device__ __forceinline__ void WarpScanAllocations( + LengthOffsetPair &tile_aggregate, + LengthOffsetPair &warp_aggregate, + LengthOffsetPair &warp_exclusive_in_tile, + LengthOffsetPair &thread_exclusive_in_warp, + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + // Perform warpscans + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + LengthOffsetPair identity; + identity.key = 0; + identity.value = 0; + + LengthOffsetPair thread_inclusive; + LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op); + WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan( + thread_aggregate, + thread_inclusive, + thread_exclusive_in_warp, + identity, + scan_op); + + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive; + + CTA_SYNC(); + + // Accumulate total selected and the warp-wide prefix + warp_exclusive_in_tile = identity; + warp_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[warp_id]; + tile_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[0]; + + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_exclusive_in_tile = tile_aggregate; + + tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]); + } + } + + + //--------------------------------------------------------------------- + // Utility methods for scattering selections + //--------------------------------------------------------------------- + + /** + * Two-phase scatter, specialized for warp time-slicing + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Locally compact items within the warp (first warp) + if (warp_id == 0) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + + // Locally compact items within the warp (remaining warps) + #pragma unroll + for (int SLICE = 1; SLICE < WARPS; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + } + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Two-phase scatter + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Unzip + OffsetT run_offsets[ITEMS_PER_THREAD]; + LengthT run_lengths[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + run_offsets[ITEM] = lengths_and_offsets[ITEM].key; + run_lengths[ITEM] = lengths_and_offsets[ITEM].value; + } + + WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped( + run_offsets, thread_num_runs_exclusive_in_warp); + + WARP_SYNC(0xffffffff); + + WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped( + run_lengths, thread_num_runs_exclusive_in_warp); + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = run_offsets[ITEM]; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = run_lengths[ITEM]; + } + } + } + } + + + /** + * Direct scatter + */ + template + __device__ __forceinline__ void ScatterDirect( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + thread_num_runs_exclusive_in_warp[ITEM]; + + // Scatter 
offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if not the first (global) length + if (item_offset >= 1) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Scatter + */ + template + __device__ __forceinline__ void Scatter( + OffsetT tile_num_runs_aggregate, + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS)) + { + // Direct scatter if the warp has any items + if (warp_num_runs_aggregate) + { + ScatterDirect( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + } + } + else + { + // Scatter two phase + ScatterTwoPhase( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets, + Int2Type()); + } + } + + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template < + bool LAST_TILE> + __device__ __forceinline__ LengthOffsetPair ConsumeTile( + OffsetT num_items, ///< Total number of global input items + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT &tile_status) ///< Global list of tile status + { + if (tile_idx == 0) + { + // First tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // Update tile status if this is not the last tile + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, tile_aggregate); + + // Update thread_exclusive_in_warp to fold in warp run-length + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += warp_exclusive_in_tile.value; + + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + + // Downsweep scan through lengths_and_num_runs + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset 
+ (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? + lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = 0; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return tile_aggregate; + } + else + { + // Not first tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // First warp computes tile prefix in lane 0 + TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx); + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + if (warp_id == 0) + { + prefix_op(tile_aggregate); + if (threadIdx.x == 0) + temp_storage.tile_exclusive = prefix_op.exclusive_prefix; + } + + CTA_SYNC(); + + LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; + + // Update thread_exclusive_in_warp to fold in warp and tile run-lengths + LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile); + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += thread_exclusive.value; + + // Downsweep scan through lengths_and_num_runs + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
+ lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return prefix_op.inclusive_prefix; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_status, ///< Global list of tile status + NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + LengthOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + + if (threadIdx.x == 0) + { + // Output the total number of items selected + *d_num_runs_out = running_total.key; + + // The inclusive prefix contains accumulated length reduction for the last run + if (running_total.key > 0) + d_lengths_out[running_total.key - 1] = running_total.value; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/agent_scan.cuh b/dnn/src/cuda/cub/agent/agent_scan.cuh new file mode 100644 index 00000000..9368615e --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_scan.cuh @@ -0,0 +1,471 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentScan + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentScanPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . 
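+ *
+ * \par
+ * A minimal usage sketch (illustrative only; the kernel and parameter names below are
+ * placeholders, not part of this header).  Each thread block constructs an AgentScan over
+ * its shared-memory TempStorage and consumes its assigned tiles via ConsumeRange:
+ *
+ * \code
+ * // Hypothetical scan kernel built on AgentScan (names are illustrative)
+ * template <
+ *     typename ScanPolicyT,
+ *     typename InputIteratorT,
+ *     typename OutputIteratorT,
+ *     typename ScanTileStateT,
+ *     typename ScanOpT,
+ *     typename InitValueT,
+ *     typename OffsetT>
+ * __global__ void ExampleScanKernel(
+ *     InputIteratorT  d_in,           // Input sequence
+ *     OutputIteratorT d_out,          // Output sequence
+ *     ScanTileStateT  tile_state,     // Tile status for the decoupled look-back
+ *     int             start_tile,     // First tile index assigned to this grid
+ *     int             num_items,      // Total number of input items
+ *     ScanOpT         scan_op,        // Binary scan operator
+ *     InitValueT      init_value)     // Seed value (cub::NullType for inclusive scan)
+ * {
+ *     // Specialize the agent for the given policy and problem types
+ *     typedef AgentScan<ScanPolicyT, InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT> AgentScanT;
+ *
+ *     // Shared memory for the agent
+ *     __shared__ typename AgentScanT::TempStorage temp_storage;
+ *
+ *     // Consume one tile per thread block (dynamic chained scan)
+ *     AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
+ *         num_items, tile_state, start_tile);
+ * }
+ * \endcode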
+ */ +template < + typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type + typename OutputIteratorT, ///< Random-access output iterator type + typename ScanOpT, ///< Scan functor type + typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) + typename OffsetT> ///< Signed integer type for global offsets +struct AgentScan +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Constants + enum + { + IS_INCLUSIVE = Equals::VALUE, // Inclusive scan if no init_value type is provided + BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockStore type + typedef BlockStore< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::STORE_ALGORITHM> + BlockStoreT; + + // Parameterized BlockScan type + typedef BlockScan< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OutputT, + ScanOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + OutputT, + ScanOpT> + RunningPrefixCallbackOp; + + // Shared memory type for this thread block + union _TempStorage + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + + struct + { + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + }; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input data + OutputIteratorT d_out; ///< Output data + ScanOpT scan_op; ///< Binary scan operator + InitValueT init_value; ///< The init_value 
element for ScanOpT + + + //--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + OutputT init_value, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); + block_aggregate = scan_op(init_value, block_aggregate); + } + + + /** + * Inclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + InitValueT /*init_value*/, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + + /** + * Exclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); + } + + + /** + * Inclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentScan( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanOpT scan_op, ///< Binary scan operator + InitValueT init_value) ///< Initial value to seed the exclusive scan + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + scan_op(scan_op), + init_value(init_value) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // 
Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + + + //--------------------------------------------------------------------- + // Scan an sequence of consecutive tiles (independent of other thread blocks) + //--------------------------------------------------------------------- + + /** + * Process a tile of input + */ + template < + bool IS_FIRST_TILE, + bool IS_LAST_TILE> + __device__ __forceinline__ void ConsumeTile( + OffsetT tile_offset, ///< Tile offset + RunningPrefixCallbackOp& prefix_op, ///< Running prefix operator + int valid_items = TILE_ITEMS) ///< Number of valid items in the tile + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Block scan + if (IS_FIRST_TILE) + { + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + prefix_op.running_total = block_aggregate; + } + else + { + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan a consecutive share of input tiles + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT range_end) ///< [in] Threadblock end offset (exclusive) + { + BlockScanRunningPrefixOp prefix_op(scan_op); + + if (range_offset + TILE_ITEMS <= range_end) + { + // Consume first tile of input (full) + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + + // Consume subsequent full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + else + { + // Consume the first tile of input (partially-full) + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + + + /** + * Scan a consecutive share of input tiles, seeded with the specified prefix value + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT 
range_end, ///< [in] Threadblock end offset (exclusive) + OutputT prefix) ///< [in] The prefix to apply to the scan segment + { + BlockScanRunningPrefixOp prefix_op(prefix, scan_op); + + // Consume full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/agent_segment_fixup.cuh b/dnn/src/cuda/cub/agent/agent_segment_fixup.cuh new file mode 100644 index 00000000..e2de58ed --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_segment_fixup.cuh @@ -0,0 +1,375 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. 
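+ *
+ * \par
+ * A minimal usage sketch (illustrative only; the kernel and parameter names below are
+ * placeholders, not part of this header).  Each thread block constructs an
+ * AgentSegmentFixup over its shared-memory TempStorage and folds one tile of partial
+ * key-value aggregates into the output via ConsumeRange:
+ *
+ * \code
+ * // Hypothetical fixup kernel built on AgentSegmentFixup (names are illustrative)
+ * template <
+ *     typename FixupPolicyT,
+ *     typename PairsInputIteratorT,
+ *     typename AggregatesOutputIteratorT,
+ *     typename ScanTileStateT,
+ *     typename EqualityOpT,
+ *     typename ReductionOpT,
+ *     typename OffsetT>
+ * __global__ void ExampleSegmentFixupKernel(
+ *     PairsInputIteratorT       d_pairs_in,        // Partial key-value aggregates to fold in
+ *     AggregatesOutputIteratorT d_aggregates_out,  // Output value aggregates (indexed by key)
+ *     int                       num_items,         // Total number of input pairs
+ *     int                       num_tiles,         // Total number of input tiles
+ *     ScanTileStateT            tile_state,        // Tile status for the decoupled look-back
+ *     EqualityOpT               equality_op,       // Key equality operator
+ *     ReductionOpT              reduction_op)      // Value reduction operator
+ * {
+ *     // Specialize the agent for the given policy and problem types
+ *     typedef AgentSegmentFixup<FixupPolicyT, PairsInputIteratorT, AggregatesOutputIteratorT,
+ *                               EqualityOpT, ReductionOpT, OffsetT> AgentSegmentFixupT;
+ *
+ *     // Shared memory for the agent
+ *     __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;
+ *
+ *     // Consume one tile per thread block
+ *     AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, equality_op, reduction_op)
+ *         .ConsumeRange(num_items, num_tiles, tile_state);
+ * }
+ * \endcode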
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSegmentFixup + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSegmentFixupPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentSegmentFixup +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of key-value input iterator + typedef typename std::iterator_traits::value_type KeyValuePairT; + + // Value type + typedef typename KeyValuePairT::Value ValueT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not do fixup using RLE + global atomics + USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) && + (Equals::VALUE || + Equals::VALUE || + Equals::VALUE || + Equals::VALUE), + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), + }; + + // Cache-modified Input iterator wrapper type (for applying 
cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + PairsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedPairsInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for pairs + typedef BlockLoad< + KeyValuePairT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSegmentFixupPolicyT::LOAD_ALGORITHM> + BlockLoadPairs; + + // Parameterized BlockScan type + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSegmentFixupPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + KeyValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for loading keys + typename BlockLoadPairs::TempStorage load_pairs; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedPairsInputIteratorT d_pairs_in; ///< Input keys + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values + InequalityWrapper inequality_op; ///< KeyT inequality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSegmentFixup( + TempStorage& temp_storage, ///< Reference to temp_storage + PairsInputIteratorT d_pairs_in, ///< Input keys + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_pairs_in(d_pairs_in), + d_aggregates_out(d_aggregates_out), + d_fixup_in(d_aggregates_out), + inequality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process input tile. 
Specialized for atomic-fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + // RLE + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key; + if (pairs[ITEM].key != pairs[ITEM - 1].key) + atomicAdd(d_scatter, pairs[ITEM - 1].value); + else + pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value); + } + + // Flush last item if valid + ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key; + if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0)) + atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); + } + + + /** + * Process input tile. Specialized for reduce-by-key fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + CTA_SYNC(); + + KeyValuePairT tile_aggregate; + if (tile_idx == 0) + { + // Exclusive scan of values and segment_flags + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate); + + // Update tile status if this is not the last tile + if (threadIdx.x == 0) + { + // Set first segment id to not trigger a flush (invalid from exclusive scan) + scatter_pairs[0].key = pairs[0].key; + + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, tile_aggregate); + + } + } + else + { + // Exclusive scan of values and segment_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op); + tile_aggregate = prefix_op.GetBlockAggregate(); + } + + // Scatter updated values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scatter_pairs[ITEM].key != pairs[ITEM].key) + { + // Update the value at the key location + ValueT value = d_fixup_in[scatter_pairs[ITEM].key]; + value = reduction_op(value, scatter_pairs[ITEM].value); + + d_aggregates_out[scatter_pairs[ITEM].key] = value; + } + } + + // Finalize the last item + if (IS_LAST_TILE) + { + // Last thread will output final count and last item, if necessary + if (threadIdx.x == BLOCK_THREADS - 1) + { + // If the last tile is a whole tile, the inclusive prefix contains accumulated value 
reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + // Update the value at the key location + OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key; + d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]); + } + } + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not the last tile (full) + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/agent_select_if.cuh b/dnn/src/cuda/cub/agent/agent_select_if.cuh new file mode 100644 index 00000000..52ca9fc2 --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_select_if.cuh @@ -0,0 +1,703 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
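+ *
+ * \par
+ * A minimal usage sketch (illustrative only; the kernel and parameter names below are
+ * placeholders, not part of this header).  Each thread block constructs an AgentSelectIf
+ * over its shared-memory TempStorage and compacts one tile via ConsumeRange, which also
+ * records the total number of selected items:
+ *
+ * \code
+ * // Hypothetical selection kernel built on AgentSelectIf (names are illustrative)
+ * template <
+ *     typename SelectIfPolicyT,
+ *     typename InputIteratorT,
+ *     typename FlagsInputIteratorT,
+ *     typename SelectedOutputIteratorT,
+ *     typename NumSelectedIteratorT,
+ *     typename ScanTileStateT,
+ *     typename SelectOpT,
+ *     typename EqualityOpT,
+ *     typename OffsetT>
+ * __global__ void ExampleSelectIfKernel(
+ *     InputIteratorT          d_in,                // Input items
+ *     FlagsInputIteratorT     d_flags_in,          // Selection flags (if applicable)
+ *     SelectedOutputIteratorT d_selected_out,      // Compacted output items
+ *     NumSelectedIteratorT    d_num_selected_out,  // Output: total number of selected items
+ *     ScanTileStateT          tile_state,          // Tile status for the decoupled look-back
+ *     SelectOpT               select_op,           // Selection functor (if applicable)
+ *     EqualityOpT             equality_op,         // Equality operator (if applicable)
+ *     OffsetT                 num_items,           // Total number of input items
+ *     int                     num_tiles)           // Total number of input tiles
+ * {
+ *     // Specialize the agent (KEEP_REJECTS = false, i.e. selection rather than partition)
+ *     typedef AgentSelectIf<SelectIfPolicyT, InputIteratorT, FlagsInputIteratorT,
+ *                           SelectedOutputIteratorT, SelectOpT, EqualityOpT, OffsetT, false> AgentSelectIfT;
+ *
+ *     // Shared memory for the agent
+ *     __shared__ typename AgentSelectIfT::TempStorage temp_storage;
+ *
+ *     // Consume one tile per thread block
+ *     AgentSelectIfT(temp_storage, d_in, d_flags_in, d_selected_out, select_op, equality_op, num_items)
+ *         .ConsumeRange(num_tiles, tile_state, d_num_selected_out);
+ * }
+ * \endcode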
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSelectIf + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSelectIfPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + +/** + * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for selection items + typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items + typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct AgentSelectIf +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? 
+ typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Constants + enum + { + USE_SELECT_OP, + USE_SELECT_FLAGS, + USE_DISCONTINUITY, + + BLOCK_THREADS = AgentSelectIfPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSelectIfPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + SELECT_METHOD = (!Equals::VALUE) ? + USE_SELECT_OP : + (!Equals::VALUE) ? + USE_SELECT_FLAGS : + USE_DISCONTINUITY + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for items + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + FlagsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFlagsInputIteratorT; + + // Parameterized BlockLoad type for input data + typedef BlockLoad< + OutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockLoad type for flags + typedef BlockLoad< + FlagT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadFlags; + + // Parameterized BlockDiscontinuity type for items + typedef BlockDiscontinuity< + OutputT, + BLOCK_THREADS> + BlockDiscontinuityT; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetT, + BLOCK_THREADS, + AgentSelectIfPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OffsetT, + cub::Sum, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Item exchange type + typedef OutputT ItemExchangeT[TILE_ITEMS]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading items + typename BlockLoadT::TempStorage load_items; + + // Smem needed for loading values + typename BlockLoadFlags::TempStorage load_flags; + + // Smem needed for compacting items (allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input items + SelectedOutputIteratorT d_selected_out; ///< Unique output items + WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) + InequalityWrapper inequality_op; ///< T inequality operator + 
SelectOpT select_op; ///< Selection operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSelectIf( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< Output data + SelectOpT select_op, ///< Selection operator + EqualityOpT equality_op, ///< Equality operator + OffsetT num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_flags_in(d_flags_in), + d_selected_out(d_selected_out), + select_op(select_op), + inequality_op(equality_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + /** + * Initialize selections (specialized for selection operator) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT /*tile_offset*/, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Out-of-bounds items are selection_flags + selection_flags[ITEM] = 1; + + if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) + selection_flags[ITEM] = select_op(items[ITEM]); + } + } + + + /** + * Initialize selections (specialized for valid flags) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&/*items*/)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + CTA_SYNC(); + + FlagT flags[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + { + // Out-of-bounds items are selection_flags + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); + } + else + { + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); + } + + // Convert flag type to selection_flags type + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + selection_flags[ITEM] = flags[ITEM]; + } + } + + + /** + * Initialize selections (specialized for discontinuity detection) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + if (IS_FIRST_TILE) + { + CTA_SYNC(); + + // Set head selection_flags. 
First tile sets the first flag for the first item + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); + } + else + { + OutputT tile_predecessor; + if (threadIdx.x == 0) + tile_predecessor = d_in[tile_offset - 1]; + + CTA_SYNC(); + + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor); + } + + // Set selection flags for out-of-bounds items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Set selection_flags for out-of-bounds items + if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) + selection_flags[ITEM] = 1; + } + } + + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Scatter flagged items to output offsets (specialized for direct scattering) + */ + template + __device__ __forceinline__ void ScatterDirect( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + OffsetT num_selections) + { + // Scatter flagged items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selection_flags[ITEM]) + { + if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) + { + d_selected_out[selection_indices[ITEM]] = items[ITEM]; + } + } + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int /*num_tile_items*/, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + // Compact and scatter items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; + if (selection_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) + { + d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + int tile_num_rejections = num_tile_items - num_tile_selections; + + // Scatter items to shared memory 
(rejections first) + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + int local_selection_idx = selection_indices[ITEM] - num_selections_prefix; + int local_rejection_idx = item_idx - local_selection_idx; + int local_scatter_offset = (selection_flags[ITEM]) ? + tile_num_rejections + local_selection_idx : + local_rejection_idx; + + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + + CTA_SYNC(); + + // Gather items from shared memory and scatter to global + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + int rejection_idx = item_idx; + int selection_idx = item_idx - tile_num_rejections; + OffsetT scatter_offset = (item_idx < tile_num_rejections) ? + num_items - num_rejected_prefix - rejection_idx - 1 : + num_selections_prefix + selection_idx; + + OutputT item = temp_storage.raw_exchange.Alias()[item_idx]; + + if (!IS_LAST_TILE || (item_idx < num_tile_items)) + { + d_selected_out[scatter_offset] = item; + } + } + } + + + /** + * Scatter flagged items + */ + template + __device__ __forceinline__ void Scatter( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + OffsetT num_selections) ///< Total number of selections including this tile + { + // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one + if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) + { + ScatterTwoPhase( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_rejected_prefix, + Int2Type()); + } + else + { + ScatterDirect( + items, + selection_flags, + selection_indices, + num_selections); + } + } + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process first tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeFirstTile( + int num_tile_items, ///< Number of input items comprising this tile + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of selection_flags + OffsetT num_tile_selections; + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections); + + if (threadIdx.x == 0) + { + // Update tile status if this is not the last tile + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, num_tile_selections); + } + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + num_tile_selections -= (TILE_ITEMS - num_tile_items); + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + 0, + 0, + num_tile_selections); + + return num_tile_selections; + } + + + /** + * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeSubsequentTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of values and selection_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx); + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op); + + OffsetT num_tile_selections = prefix_op.GetBlockAggregate(); + OffsetT num_selections = prefix_op.GetInclusivePrefix(); + OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); + OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix; + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + { + int num_discount = TILE_ITEMS - num_tile_items; + num_selections -= num_discount; + num_tile_selections -= num_discount; + } + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_rejected_prefix, + num_selections); + + return num_selections; + } + + + /** + * Process a tile of input + */ + template + __device__ __forceinline__ OffsetT ConsumeTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) 
///< Global tile state descriptor + { + OffsetT num_selections; + if (tile_idx == 0) + { + num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); + } + else + { + num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); + } + + return num_selections; + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selection_flags + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state, ///< Global tile state descriptor + NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state); + } + else + { + // The last tile (possibly partially-full) + OffsetT num_remaining = num_items - tile_offset; + OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + + if (threadIdx.x == 0) + { + // Output the total number of items selection_flags + *d_num_selected_out = num_selections; + } + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/agent_spmv_orig.cuh b/dnn/src/cuda/cub/agent/agent_spmv_orig.cuh new file mode 100644 index 00000000..54e2a139 --- /dev/null +++ b/dnn/src/cuda/cub/agent/agent_spmv_orig.cuh @@ -0,0 +1,670 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_reduce.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../thread/thread_search.cuh" +#include "../thread/thread_operators.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/counting_input_iterator.cuh" +#include "../iterator/tex_ref_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSpmv + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search + CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets + CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices + CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values + CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values + bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSpmvPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory) + }; + + static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices + static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values + static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +struct SpmvParams +{ + ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
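// [Editor's sketch] AgentSpmvPolicy above is compile-time configuration only.
// A hedged example instantiation with hypothetical values (the actual tuned
// policies are selected per PTX architecture in CUB's SpMV dispatch layer):
typedef AgentSpmvPolicy<
        128,                    // _BLOCK_THREADS
        7,                      // _ITEMS_PER_THREAD
        LOAD_LDG,               // _ROW_OFFSETS_SEARCH_LOAD_MODIFIER
        LOAD_DEFAULT,           // _ROW_OFFSETS_LOAD_MODIFIER
        LOAD_DEFAULT,           // _COLUMN_INDICES_LOAD_MODIFIER
        LOAD_DEFAULT,           // _VALUES_LOAD_MODIFIER
        LOAD_LDG,               // _VECTOR_VALUES_LOAD_MODIFIER
        false,                  // _DIRECT_LOAD_NONZEROS (pre-stage through shared memory)
        BLOCK_SCAN_WARP_SCANS>  // _SCAN_ALGORITHM
    ExampleSpmvPolicy;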
+ OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows; ///< Number of rows of matrix A. + int num_cols; ///< Number of columns of matrix A. + int num_nonzeros; ///< Number of nonzero elements of matrix A. + ValueT alpha; ///< Alpha multiplicand + ValueT beta; ///< Beta addend-multiplicand + + TexRefInputIterator t_vector_x; +}; + + +/** + * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + */ +template < + typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 + bool HAS_BETA, ///< Whether the input parameter \p beta is 0 + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentSpmv +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + /// 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + /// Input iterator wrapper types (for applying cache modifiers) + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER, + OffsetT, + OffsetT> + ColumnIndicesIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + ValueIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // BlockReduce specialization + typedef BlockReduce< + ValueT, + BLOCK_THREADS, + BLOCK_REDUCE_WARP_REDUCTIONS> + BlockReduceT; + + // BlockScan specialization + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // BlockScan specialization + typedef BlockScan< + ValueT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockPrefixSumT; + + // BlockExchange specialization + typedef BlockExchange< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeT; + + /// Merge item type (either a non-zero value or a row-end offset) + union MergeItem + { + // Value type to pair with index type OffsetT (NullType if loading values directly during merge) + typedef typename If::Type MergeValueT; + + OffsetT 
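// [Editor's sketch] SpmvParams above is the parameter bundle that CUB's
// device-level front end fills in.  A hedged usage sketch of that front end,
// cub::DeviceSpmv::CsrMV (computes y = A*x for a CSR matrix).  Note the agent
// works with *row-end* offsets, i.e. the standard CSR row-offset array
// (length num_rows + 1) shifted by one element:
#include <cub/device/device_spmv.cuh>
#include <cuda_runtime.h>

void CsrSpmvExample(float* d_values, int* d_row_offsets, int* d_column_indices,
                    float* d_vector_x, float* d_vector_y,
                    int num_rows, int num_cols, int num_nonzeros)
{
    void*  d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;

    // Size-query pass, then the actual SpMV pass.
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                           d_values, d_row_offsets, d_column_indices,
                           d_vector_x, d_vector_y,
                           num_rows, num_cols, num_nonzeros);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                           d_values, d_row_offsets, d_column_indices,
                           d_vector_x, d_vector_y,
                           num_rows, num_cols, num_nonzeros);
    cudaFree(d_temp_storage);
}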
row_end_offset; + MergeValueT nonzero; + }; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CoordinateT tile_coords[2]; + + union Aliasable + { + // Smem needed for tile of merge items + MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; + + // Smem needed for block exchange + typename BlockExchangeT::TempStorage exchange; + + // Smem needed for block-wide reduction + typename BlockReduceT::TempStorage reduce; + + // Smem needed for tile scanning + typename BlockScanT::TempStorage scan; + + // Smem needed for tile prefix sum + typename BlockPrefixSumT::TempStorage prefix_sum; + + } aliasable; + }; + + /// Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + + _TempStorage& temp_storage; /// Reference to temp_storage + + SpmvParams& spmv_params; + + ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentSpmv( + TempStorage& temp_storage, ///< Reference to temp_storage + SpmvParams& spmv_params) ///< SpMV input parameter bundle + : + temp_storage(temp_storage.Alias()), + spmv_params(spmv_params), + wd_values(spmv_params.d_values), + wd_row_end_offsets(spmv_params.d_row_end_offsets), + wd_column_indices(spmv_params.d_column_indices), + wd_vector_x(spmv_params.d_vector_x), + wd_vector_y(spmv_params.d_vector_y) + {} + + + + + /** + * Consume a merge tile, specialized for direct-load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + + // Gather the row end-offsets for the merge tile into shared memory + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal + s_tile_row_end_offsets, // List A + 
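// [Editor's sketch] MergePathSearch (used just above and continued below) splits
// the merge of list A (row-end offsets) and list B (the counting sequence of
// nonzero indices) at a given diagonal.  A host re-statement of the same binary
// search, assuming list B is a counting sequence starting at zero:
#include <algorithm>
#include <vector>

struct HostCoord { int x; int y; };   // x: rows consumed, y: nonzeros consumed

HostCoord MergePathSearchHost(int diagonal,
                              const std::vector<int>& row_end_offsets, // list A
                              int num_rows,
                              int num_nonzeros)
{
    int x_min = std::max(diagonal - num_nonzeros, 0);
    int x_max = std::min(diagonal, num_rows);
    while (x_min < x_max)
    {
        int x_pivot = (x_min + x_max) >> 1;
        // For a counting list B, B[diagonal - x_pivot - 1] == diagonal - x_pivot - 1.
        if (row_end_offsets[x_pivot] <= diagonal - x_pivot - 1)
            x_min = x_pivot + 1;   // split moves up list A / down list B
        else
            x_max = x_pivot;       // split moves up list B / down list A
    }
    return HostCoord{ std::min(x_min, num_rows), diagonal - x_min };
}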
tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + + ValueT running_total = 0.0; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); + OffsetT column_idx = wd_column_indices[nonzero_idx]; + ValueT value = wd_values[nonzero_idx]; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + ValueT nonzero = value * vector_value; + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + running_total += nonzero; + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = tile_num_rows; + ++thread_current_coord.y; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = thread_current_coord.x; + running_total = 0.0; + ++thread_current_coord.x; + } + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (tile_num_rows > 0) + { + if (threadIdx.x == 0) + scan_item.key = -1; + + // Direct scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scan_segment[ITEM].key < tile_num_rows) + { + if (scan_item.key == scan_segment[ITEM].key) + scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; + + if (HAS_ALPHA) + { + scan_segment[ITEM].value *= spmv_params.alpha; + } + + if (HAS_BETA) + { + // Update the output vector element + ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; + scan_segment[ITEM].value += addend; + } + + // Set the output vector element + spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; + } + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + + /** + * Consume a merge tile, specialized for indirect load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + +#if (CUB_PTX_ARCH >= 520) + + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + + ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; + ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; + ValueT* s = s_tile_nonzeros + nonzero_idx; + + if 
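// [Editor's sketch] What the merge-path traversal above computes, restated as a
// plain sequential reference: for every row, accumulate value * x[col] over that
// row's nonzeros, then apply the optional alpha scale and beta*y addend before
// storing.  Uses the same row-*end*-offset convention as the agent.
#include <cstddef>
#include <vector>

void CsrSpmvReference(const std::vector<float>& values,
                      const std::vector<int>&   row_end_offsets, // length num_rows
                      const std::vector<int>&   column_indices,
                      const std::vector<float>& x,
                      std::vector<float>&       y,
                      float alpha, float beta)
{
    int row_start = 0;
    for (std::size_t row = 0; row < row_end_offsets.size(); ++row)
    {
        float dot = 0.0f;
        for (int nz = row_start; nz < row_end_offsets[row]; ++nz)
            dot += values[nz] * x[column_indices[nz]];

        y[row]    = alpha * dot + beta * y[row];
        row_start = row_end_offsets[row];
    }
}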
(nonzero_idx < tile_num_nonzeros) + { + + OffsetT column_idx = *ci; + ValueT value = *a; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; + vector_value = wd_vector_x[column_idx]; + + ValueT nonzero = value * vector_value; + + *s = nonzero; + } + } + + +#else + + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + if (tile_num_nonzeros > 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); + + OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; + ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + ValueT nonzero = value * vector_value; + + s_tile_nonzeros[nonzero_idx] = nonzero; + } + } + +#endif + + // Gather the row end-offsets for the merge tile into shared memory + #pragma unroll 1 + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal + s_tile_row_end_offsets, // List A + tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + ValueT running_total = 0.0; + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + scan_segment[ITEM].value = nonzero; + running_total += nonzero; + ++thread_current_coord.y; + nonzero = s_tile_nonzeros[thread_current_coord.y]; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = 0.0; + running_total = 0.0; + ++thread_current_coord.x; + row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + } + + scan_segment[ITEM].key = thread_current_coord.x; + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (threadIdx.x == 0) + { + scan_item.key = thread_start_coord.x; + scan_item.value = 0.0; + } + + if (tile_num_rows > 0) + { + + CTA_SYNC(); + + // Scan downsweep and scatter + ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; + + if (scan_item.key != scan_segment[0].key) + { + s_partials[scan_item.key] = scan_item.value; + } + else + { + scan_segment[0].value += scan_item.value; + } + + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if 
(scan_segment[ITEM - 1].key != scan_segment[ITEM].key) + { + s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; + } + else + { + scan_segment[ITEM].value += scan_segment[ITEM - 1].value; + } + } + + CTA_SYNC(); + + #pragma unroll 1 + for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) + { + spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + /** + * Consume input tile + */ + __device__ __forceinline__ void ConsumeTile( + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_merge_tiles) ///< [in] Number of merge tiles + { + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + + if (tile_idx >= num_merge_tiles) + return; + + // Read our starting coordinates + if (threadIdx.x < 2) + { + if (d_tile_coordinates == NULL) + { + // Search our starting coordinates + OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; + CoordinateT tile_coord; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coord); + + temp_storage.tile_coords[threadIdx.x] = tile_coord; + } + else + { + temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; + } + } + + CTA_SYNC(); + + CoordinateT tile_start_coord = temp_storage.tile_coords[0]; + CoordinateT tile_end_coord = temp_storage.tile_coords[1]; + + // Consume multi-segment tile + KeyValuePairT tile_carry = ConsumeTile( + tile_idx, + tile_start_coord, + tile_end_coord, + Int2Type()); + + // Output the tile's carry-out + if (threadIdx.x == 0) + { + if (HAS_ALPHA) + tile_carry.value *= spmv_params.alpha; + + tile_carry.key += tile_start_coord.x; + d_tile_carry_pairs[tile_idx] = tile_carry; + } + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/agent/single_pass_scan_operators.cuh b/dnn/src/cuda/cub/agent/single_pass_scan_operators.cuh new file mode 100644 index 00000000..7cee1b79 --- /dev/null +++ b/dnn/src/cuda/cub/agent/single_pass_scan_operators.cuh @@ -0,0 +1,815 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
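// [Editor's sketch] ConsumeTile above ends by emitting one (row, partial-sum)
// carry-out pair per tile; rows that straddle a tile boundary are only partially
// accumulated into d_vector_y, and a separate fix-up pass in CUB's SpMV dispatch
// layer adds the carried partials back in.  A sequential model of that idea:
#include <cstddef>
#include <utility>
#include <vector>

void ApplyTileCarries(const std::vector<std::pair<int, float> >& tile_carry_pairs,
                      std::vector<float>&                        y)
{
    // Addition commutes, so adding every carry to its row is equivalent to the
    // reduce-by-key fix-up performed on the device.
    for (std::size_t i = 0; i < tile_carry_pairs.size(); ++i)
        y[tile_carry_pairs[i].first] += tile_carry_pairs[i].second;
}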
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Callback operator types for supplying BlockScan prefixes + */ + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../warp/warp_reduce.cuh" +#include "../util_arch.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Prefix functor type for maintaining a running prefix while scanning a + * region independent of other thread blocks + ******************************************************************************/ + +/** + * Stateful callback operator type for supplying BlockScan prefixes. + * Maintains a running prefix that can be applied to consecutive + * BlockScan operations. + */ +template < + typename T, ///< BlockScan value type + typename ScanOpT> ///< Wrapped scan operator type +struct BlockScanRunningPrefixOp +{ + ScanOpT op; ///< Wrapped scan operator + T running_total; ///< Running block-wide prefix + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) + : + op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp( + T starting_prefix, + ScanOpT op) + : + op(op), + running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. + */ + __device__ __forceinline__ T operator()( + const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } +}; + + +/****************************************************************************** + * Generic tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Enumerations of tile status + */ +enum ScanTileStatus +{ + SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) + SCAN_TILE_INVALID = 99, // Not yet processed + SCAN_TILE_PARTIAL, // Tile aggregate is available + SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available +}; + + +/** + * Tile status interface. + */ +template < + typename T, + bool SINGLE_WORD = Traits::PRIMITIVE> +struct ScanTileState; + + +/** + * Tile status interface specialized for scan status and value types + * that can be combined into one machine word that can be + * read/written coherently in a single access. 
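// [Editor's sketch] A hedged usage sketch of BlockScanRunningPrefixOp (defined
// above): one thread block scans several consecutive tiles while carrying a
// running prefix between them.  Launch with exactly 128 threads per block.
#include <cub/cub.cuh>

__global__ void MultiTileExclusiveSum(const int* d_in, int* d_out, int num_items)
{
    const int BLOCK_THREADS = 128;
    typedef cub::BlockScan<int, BLOCK_THREADS> BlockScanT;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    // Running prefix starts at zero and is updated after every tile scan.
    cub::BlockScanRunningPrefixOp<int, cub::Sum> prefix_op(0, cub::Sum());

    for (int tile_base = 0; tile_base < num_items; tile_base += BLOCK_THREADS)
    {
        int idx  = tile_base + threadIdx.x;
        int item = (idx < num_items) ? d_in[idx] : 0;
        int result;
        BlockScanT(temp_storage).ExclusiveSum(item, result, prefix_op);
        if (idx < num_items) d_out[idx] = result;
        __syncthreads();   // temp_storage is reused by the next tile
    }
}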
+ */ +template +struct ScanTileState +{ + // Status word type + typedef typename If<(sizeof(T) == 8), + long long, + typename If<(sizeof(T) == 4), + int, + typename If<(sizeof(T) == 2), + short, + char>::Type>::Type>::Type StatusWord; + + + // Unit word type + typedef typename If<(sizeof(T) == 8), + longlong2, + typename If<(sizeof(T) == 4), + int2, + typename If<(sizeof(T) == 2), + int, + uchar2>::Type>::Type>::Type TxnWord; + + + // Device word type + struct TileDescriptor + { + StatusWord status; + T value; + }; + + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + + // Device storage + TxnWord *d_tile_descriptors; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = 
reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value = tile_descriptor.value; + } + +}; + + + +/** + * Tile status interface specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template +struct ScanTileState +{ + // Status word type + typedef char StatusWord; + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Device storage + StatusWord *d_tile_status; + T *d_tile_partial; + T *d_tile_inclusive; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_status(NULL), + d_tile_partial(NULL), + d_tile_inclusive(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + cudaError_t error = cudaSuccess; + do + { + #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + void* allocations[3]; + size_t allocation_sizes[3]; + + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Compute allocation pointers into the single storage blob + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Alias the offsets + d_tile_status = reinterpret_cast(allocations[0]); + d_tile_partial = reinterpret_cast(allocations[1]); + d_tile_inclusive = reinterpret_cast(allocations[2]); + } + while (0); + + return error; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + // Specify storage allocation requirements + size_t allocation_sizes[3]; + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Set the necessary size of the blob + void* allocations[3]; + return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + // Update tile 
inclusive value + ThreadStore(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + // Update tile partial value + ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + do { + status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); + + __threadfence(); // prevent hoisting loads from loop or loads below above this one + + } while (status == SCAN_TILE_INVALID); + + if (status == StatusWord(SCAN_TILE_PARTIAL)) + value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); + else + value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); + } +}; + + +/****************************************************************************** + * ReduceByKey tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Tile status interface for reduction by key. + * + */ +template < + typename ValueT, + typename KeyT, + bool SINGLE_WORD = (Traits::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> +struct ReduceByKeyScanTileState; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState : + ScanTileState > +{ + typedef ScanTileState > SuperClass; + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() : SuperClass() {} +}; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * can be combined into one machine word that can be read/written coherently in a single access. 
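// [Editor's sketch] SetPartial/SetInclusive above write the value, issue
// __threadfence(), then write the status word; WaitForValid spins on the status
// before reading the value.  That is a publish/poll handshake, modeled here on
// the host with C++ atomics (illustration only -- CUDA fence semantics differ
// in detail).  Status constants follow the ScanTileStatus enum above.
#include <atomic>

struct HostTileState
{
    static const int INVALID = 99, PARTIAL = 100, INCLUSIVE = 101;

    std::atomic<int> status{INVALID};
    float            partial   = 0.0f;
    float            inclusive = 0.0f;

    void SetPartial(float v)
    {
        partial = v;                                          // publish the value first
        status.store(PARTIAL, std::memory_order_release);     // then the status (release)
    }
    void SetInclusive(float v)
    {
        inclusive = v;
        status.store(INCLUSIVE, std::memory_order_release);
    }
    float WaitForValid(int& out_status)
    {
        int s;
        while ((s = status.load(std::memory_order_acquire)) == INVALID) { /* spin */ }
        out_status = s;
        return (s == PARTIAL) ? partial : inclusive;
    }
};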
+ */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState +{ + typedef KeyValuePairKeyValuePairT; + + // Constants + enum + { + PAIR_SIZE = sizeof(ValueT) + sizeof(KeyT), + TXN_WORD_SIZE = 1 << Log2::VALUE, + STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, + + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Status word type + typedef typename If<(STATUS_WORD_SIZE == 8), + long long, + typename If<(STATUS_WORD_SIZE == 4), + int, + typename If<(STATUS_WORD_SIZE == 2), + short, + char>::Type>::Type>::Type StatusWord; + + // Status word type + typedef typename If<(TXN_WORD_SIZE == 16), + longlong2, + typename If<(TXN_WORD_SIZE == 8), + long long, + int>::Type>::Type TxnWord; + + // Device word type (for when sizeof(ValueT) == sizeof(KeyT)) + struct TileDescriptorBigStatus + { + KeyT key; + ValueT value; + StatusWord status; + }; + + // Device word type (for when sizeof(ValueT) != sizeof(KeyT)) + struct TileDescriptorLittleStatus + { + ValueT value; + StatusWord status; + KeyT key; + }; + + // Device word type + typedef typename If< + (sizeof(ValueT) == sizeof(KeyT)), + TileDescriptorBigStatus, + TileDescriptorLittleStatus>::Type + TileDescriptor; + + + // Device storage + TxnWord *d_tile_descriptors; + + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive.value; + tile_descriptor.key = tile_inclusive.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT 
tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial.value; + tile_descriptor.key = tile_partial.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + KeyValuePairT &value) + { +// TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// TileDescriptor tile_descriptor = reinterpret_cast(alias); +// +// while (tile_descriptor.status == SCAN_TILE_INVALID) +// { +// __threadfence_block(); // prevent hoisting loads from loop +// +// alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// tile_descriptor = reinterpret_cast(alias); +// } +// +// status = tile_descriptor.status; +// value.value = tile_descriptor.value; +// value.key = tile_descriptor.key; + + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value.value = tile_descriptor.value; + value.key = tile_descriptor.key; + } + +}; + + +/****************************************************************************** + * Prefix call-back operator for coupling local block scan within a + * block-cooperative scan + ******************************************************************************/ + +/** + * Stateful block-scan prefix functor. Provides the the running prefix for + * the current tile by using the call-back warp to wait on on + * aggregates/prefixes from predecessor tiles to become available. 
+ */ +template < + typename T, + typename ScanOpT, + typename ScanTileStateT, + int PTX_ARCH = CUB_PTX_ARCH> +struct TilePrefixCallbackOp +{ + // Parameterized warp reduce + typedef WarpReduce WarpReduceT; + + // Temporary storage type + struct _TempStorage + { + typename WarpReduceT::TempStorage warp_reduce; + T exclusive_prefix; + T inclusive_prefix; + T block_aggregate; + }; + + // Alias wrapper allowing temporary storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + // Type of status word + typedef typename ScanTileStateT::StatusWord StatusWord; + + // Fields + _TempStorage& temp_storage; ///< Reference to a warp-reduction instance + ScanTileStateT& tile_status; ///< Interface to tile status + ScanOpT scan_op; ///< Binary scan operator + int tile_idx; ///< The current tile index + T exclusive_prefix; ///< Exclusive prefix for the tile + T inclusive_prefix; ///< Inclusive prefix for the tile + + // Constructor + __device__ __forceinline__ + TilePrefixCallbackOp( + ScanTileStateT &tile_status, + TempStorage &temp_storage, + ScanOpT scan_op, + int tile_idx) + : + temp_storage(temp_storage.Alias()), + tile_status(tile_status), + scan_op(scan_op), + tile_idx(tile_idx) {} + + + // Block until all predecessors within the warp-wide window have non-invalid status + __device__ __forceinline__ + void ProcessWindow( + int predecessor_idx, ///< Preceding tile index to inspect + StatusWord &predecessor_status, ///< [out] Preceding tile status + T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles + { + T value; + tile_status.WaitForValid(predecessor_idx, predecessor_status, value); + + // Perform a segmented reduction to get the prefix for the current window. + // Use the swizzled scan operator because we are now scanning *down* towards thread0. 
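// [Editor's sketch] A sequential host model of the decoupled look-back that this
// functor performs in operator() (continued just below): publish this tile's
// partial aggregate, then walk backwards over predecessor tiles, folding in
// their aggregates until one that already holds an *inclusive* prefix is found.
// The device code inspects a warp-wide window of 32 predecessors per step and
// waits for them to become valid; this model walks one predecessor at a time and
// assumes every predecessor has already published.
#include <vector>

enum HostStatus { HOST_PARTIAL, HOST_INCLUSIVE };

// For a PARTIAL tile, value is that tile's own aggregate; for an INCLUSIVE tile,
// value is the inclusive prefix of everything up to and including that tile.
struct HostTile { HostStatus status; float value; };

float LookBackExclusivePrefix(const std::vector<HostTile>& tiles, int tile_idx)
{
    float exclusive_prefix = 0.0f;
    for (int pred = tile_idx - 1; pred >= 0; --pred)
    {
        exclusive_prefix += tiles[pred].value;        // fold in the window aggregate
        if (tiles[pred].status == HOST_INCLUSIVE)     // known inclusive prefix: stop sliding
            break;
    }
    return exclusive_prefix;
}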
+ + int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); + window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce( + value, + tail_flag, + SwizzleScanOp(scan_op)); + } + + + // BlockScan prefix callback functor (called by the first warp) + __device__ __forceinline__ + T operator()(T block_aggregate) + { + + // Update our status with our tile-aggregate + if (threadIdx.x == 0) + { + temp_storage.block_aggregate = block_aggregate; + tile_status.SetPartial(tile_idx, block_aggregate); + } + + int predecessor_idx = tile_idx - threadIdx.x - 1; + StatusWord predecessor_status; + T window_aggregate; + + // Wait for the warp-wide window of predecessor tiles to become valid + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + + // The exclusive tile prefix starts out as the current window aggregate + exclusive_prefix = window_aggregate; + + // Keep sliding the window back until we come across a tile whose inclusive prefix is known + while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) + { + predecessor_idx -= CUB_PTX_WARP_THREADS; + + // Update exclusive tile prefix with the window prefix + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); + } + + // Compute the inclusive tile prefix and update the status for this tile + if (threadIdx.x == 0) + { + inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + tile_status.SetInclusive(tile_idx, inclusive_prefix); + + temp_storage.exclusive_prefix = exclusive_prefix; + temp_storage.inclusive_prefix = inclusive_prefix; + } + + // Return exclusive_prefix + return exclusive_prefix; + } + + // Get the exclusive prefix stored in temporary storage + __device__ __forceinline__ + T GetExclusivePrefix() + { + return temp_storage.exclusive_prefix; + } + + // Get the inclusive prefix stored in temporary storage + __device__ __forceinline__ + T GetInclusivePrefix() + { + return temp_storage.inclusive_prefix; + } + + // Get the block aggregate stored in temporary storage + __device__ __forceinline__ + T GetBlockAggregate() + { + return temp_storage.block_aggregate; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/block/block_adjacent_difference.cuh b/dnn/src/cuda/cub/block/block_adjacent_difference.cuh new file mode 100644 index 00000000..acef9f05 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_adjacent_difference.cuh @@ -0,0 +1,596 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockAdjacentDifference +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(b, a, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(b, a); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * 
ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockAdjacentDifference() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockAdjacentDifference( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
+ { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/block/block_discontinuity.cuh b/dnn/src/cuda/cub/block/block_discontinuity.cuh new file mode 100644 index 00000000..503e3e0b --- /dev/null +++ b/dnn/src/cuda/cub/block/block_discontinuity.cuh @@ -0,0 +1,1148 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be flagged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items + * that differ from their predecessors (or successors). For example, head flags are convenient + * for demarcating disjoint data segments as part of a segmented scan or reduction. + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockDiscontinuity} + * \par + * The code snippet below illustrates the head flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. 
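+ *
+ * \par
+ * As a rough, self-contained sketch only (the kernel and buffer names below are
+ * illustrative, and a single 128-thread block covering one 512-item tile is assumed),
+ * the snippet above can be completed into a full kernel as follows:
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void SegmentHeadsKernel(const int *d_in, int *d_flags)
+ * {
+ *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+ *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+ *
+ *     // Allocate shared memory for BlockDiscontinuity
+ *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ *
+ *     // Load 4 consecutive items per thread (blocked arrangement)
+ *     int thread_data[4];
+ *     cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);
+ *
+ *     // Flag items that differ from their predecessor; item 0 of thread 0 is always flagged
+ *     int head_flags[4];
+ *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ *
+ *     // Write the computed flags back to global memory
+ *     cub::StoreDirectBlocked(threadIdx.x, d_flags, head_flags);
+ * }
+ * \endcode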
+ * + * \par Performance Considerations + * - Incurs zero bank conflicts for most types + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockDiscontinuity +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(a, b, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(a, b); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] 
Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockDiscontinuity() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockDiscontinuity( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
+ { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). 
+ * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads( + * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, + * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be + * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + //@} end member group + /******************************************************************//** + * \name Tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. 
+ * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. + * The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * tail_flags, thread_data, cub::Inequality(), tile_successor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + //@} end member group + /******************************************************************//** + * \name Head & tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). 
+ * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_predecessor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
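+ *
+ * \par
+ * A compilable call site for this overload might look like the sketch below
+ * (the kernel name and buffers are illustrative; a single 128-thread block with
+ * 4 items per thread is assumed):
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void FlagSegmentsKernel(const int *d_in, int *d_head_flags,
+ *                                    int *d_tail_flags, int tile_successor_item)
+ * {
+ *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+ *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ *
+ *     // Blocked arrangement: each thread owns 4 consecutive items
+ *     int thread_data[4];
+ *     cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);
+ *
+ *     // Only the value supplied by the last thread is used as the tile successor
+ *     int head_flags[4];
+ *     int tail_flags[4];
+ *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+ *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
+ *
+ *     cub::StoreDirectBlocked(threadIdx.x, d_head_flags, head_flags);
+ *     cub::StoreDirectBlocked(threadIdx.x, d_tail_flags, tail_flags);
+ * }
+ * \endcode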
+ */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... 
+ * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. 
+ * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). 
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/block/block_exchange.cuh b/dnn/src/cuda/cub/block/block_exchange.cuh new file mode 100644 index 00000000..3ae99343 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_exchange.cuh @@ -0,0 +1,1248 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. + * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - It is commonplace for blocks of threads to rearrange data items between + * threads. For example, the device-accessible memory subsystem prefers access patterns + * where data items are "striped" across threads (where consecutive threads access consecutive items), + * yet most block-wide operations prefer a "blocked" partitioning of items across threads + * (where consecutive items belong to a single thread). + * - BlockExchange supports the following types of data exchanges: + * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements + * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements + * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3) + * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockExchange} + * \par + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of data striped across threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + * \par Performance Considerations + * - Proper device-specific padding ensures zero bank conflicts for most types. 
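+ *
+ * \par
+ * The reverse conversion follows the same pattern. A rough sketch (illustrative
+ * kernel and buffer names; one 128-thread block owning 4 items per thread) that
+ * writes a blocked arrangement back out in striped order:
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void BlockedToStripedKernel(int *d_data)
+ * {
+ *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ *     __shared__ typename BlockExchange::TempStorage temp_storage;
+ *
+ *     // Obtain a blocked arrangement (consecutive items per thread)
+ *     int thread_data[4];
+ *     cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);
+ *
+ *     // Rearrange so that consecutive threads hold consecutive items
+ *     BlockExchange(temp_storage).BlockedToStriped(thread_data);
+ *
+ *     // Striped stores yield coalesced writes to global memory
+ *     cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
+ * }
+ * \endcode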
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + bool WARP_TIME_SLICING = false, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, + + TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, + TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct __align__(16) _TempStorage + { + InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{BlockExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + unsigned int lane_id; + unsigned int warp_id; + unsigned int warp_offset; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. 
Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + if (warp_id == 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + #pragma unroll + for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + // No timeslicing + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + // Warp time-slicing + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Write a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + CTA_SYNC(); + + const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. 
+ */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockExchange() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
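+ *
+ * \par Snippet
+ * A minimal sketch of providing the temporary storage explicitly, assuming a
+ * 1D block of 128 threads owning 4 integer items each:
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ *
+ *     // Allocate shared memory for BlockExchange and construct a named instance with it
+ *     __shared__ typename BlockExchange::TempStorage temp_storage;
+ *     BlockExchange block_exchange(temp_storage);
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively exchange data into a striped arrangement across threads
+ *     block_exchange.BlockedToStriped(thread_data, thread_data);
+ *
+ * \endcode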
+ */ + __device__ __forceinline__ BlockExchange( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + lane_id(LaneId()), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + //@} end member group + /******************************************************************//** + * \name Structured exchanges + *********************************************************************/ + //@{ + + /** + * \brief Transposes data items from striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a striped arrangement across block threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(input_items, output_items, Int2Type()); + } + + + /** + * \brief Transposes data items from blocked arrangement to striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively exchange data into a striped arrangement across threads + * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data); + * + * // Store data striped across block threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in + * preparation for storing to device-accessible memory. + * + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from warp-striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a warp-striped arrangement across warp threads + * int thread_data[4]; + * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of warp-striped input \p thread_data across the block of threads is + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * after loading from device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from blocked arrangement to warp-striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively exchange data into a warp-striped arrangement across threads + * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data); + * + * // Store data striped across warp threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * in preparation for storing to device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(input_items, output_items, Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Scatter exchanges + *********************************************************************/ + //@{ + + + /** + * \brief Exchanges data items annotated by rank into blocked arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. 
+ OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (ranks[ITEM] >= 0) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + * \tparam ValidFlag [inferred] FlagT type denoting which items are valid + */ + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (is_valid[ITEM]) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + //@} end member group + + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + __device__ __forceinline__ void StripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(items, items); + } + + __device__ __forceinline__ void WarpStripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToWarpStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(items, items); + } + + template + __device__ __forceinline__ void ScatterToBlocked( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. 
+ OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStriped( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStripedGuarded(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + ScatterToStriped(items, items, ranks, is_valid); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +template < + typename T, + int ITEMS_PER_THREAD, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + // Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct _TempStorage + { + T buff[WARP_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{WarpExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + int lane_id; + +public: + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpExchange( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. 
+ * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); + temp_storage.buff[ranks[ITEM]] = items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage.buff[item_offset]; + } + } + +}; + + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/block_histogram.cuh b/dnn/src/cuda/cub/block/block_histogram.cuh new file mode 100644 index 00000000..b7cb9700 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_histogram.cuh @@ -0,0 +1,415 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "specializations/block_histogram_sort.cuh" +#include "specializations/block_histogram_atomic.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. + */ +enum BlockHistogramAlgorithm +{ + + /** + * \par Overview + * Sorting followed by differentiation. Execution is comprised of two phases: + * -# Sort the data using efficient radix sort + * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + */ + BLOCK_HISTO_SORT, + + + /** + * \par Overview + * Use atomic addition to update byte counts directly + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + */ + BLOCK_HISTO_ATOMIC, +}; + + + +/****************************************************************************** + * Block histogram + ******************************************************************************/ + + +/** + * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png) + * \ingroup BlockModule + * + * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam BINS The number bins within the histogram + * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * - BlockHistogram can be optionally specialized to use different algorithms: + * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockHistogram} + * \par + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char data[4]; + * ... + * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(data, smem_histogram); + * + * \endcode + * + * \par Performance and Usage Considerations + * - The histogram output can be constructed in shared or device-accessible memory + * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + int BINS, + BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockHistogram +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used + * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used + * regardless. + */ + static const BlockHistogramAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ? + BLOCK_HISTO_SORT : + ALGORITHM; + + /// Internal specialization. + typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT), + BlockHistogramSort, + BlockHistogramAtomic >::Type InternalBlockHistogram; + + /// Shared memory storage layout type for BlockHistogram + typedef typename InternalBlockHistogram::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /// \smemstorage{BlockHistogram} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockHistogram() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
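+ *
+ * \par Snippet
+ * A minimal sketch of providing the temporary storage explicitly, assuming a
+ * 1D block of 128 threads, 4 character samples per thread, and 256 bins:
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ *
+ *     // Allocate shared memory for BlockHistogram and construct a named instance with it
+ *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+ *     BlockHistogram block_histogram(temp_storage);
+ *
+ *     // Allocate shared memory for block-wide histogram bin counts
+ *     __shared__ unsigned int smem_histogram[256];
+ *
+ *     // Obtain input samples per thread (assumed to be loaded elsewhere)
+ *     unsigned char thread_samples[4];
+ *     ...
+ *
+ *     // Compute the block-wide histogram
+ *     block_histogram.Histogram(thread_samples, smem_histogram);
+ *
+ * \endcode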
+ */ + __device__ __forceinline__ BlockHistogram( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Histogram operations + *********************************************************************/ + //@{ + + + /** + * \brief Initialize the shared histogram counters to zero. + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template + __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) + { + // Initialize histogram bin counts to zeros + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + histogram[histo_offset + linear_tid] = 0; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + histogram[histo_offset + linear_tid] = 0; + } + } + + + /** + * \brief Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... 
+ * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Histogram( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Initialize histogram bin counts to zeros + InitHistogram(histogram); + + CTA_SYNC(); + + // Composite the histogram + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + + + + /** + * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/block_load.cuh b/dnn/src/cuda/cub/block/block_load.cuh new file mode 100644 index 00000000..217f5212 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_load.cuh @@ -0,0 +1,1241 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for reading linear tiles of data into the CUDA thread block. + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM]; + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
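+ *
+ * \par Snippet
+ * A minimal sketch of a guarded blocked load, assuming 128 threads owning 4
+ * integer items each and \p num_valid remaining items in \p d_data:
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_valid, ...)
+ * {
+ *     // Load a partial tile in blocked order; out-of-range items are left unmodified
+ *     int thread_data[4];
+ *     cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data, num_valid);
+ *
+ * \endcode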
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) + { + items[ITEM] = thread_itr[ITEM]; + } + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Internal implementation for load vectorization + */ +template < + CacheLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void InternalLoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + // Biggest memory access word that T is a whole multiple of + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), + + VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? + 4 : + (TOTAL_WORDS % 2 == 0) ? + 2 : + 1, + + VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Vector items + Vector vec_items[VECTORS_PER_THREAD]; + + // Aliased input ptr + Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) + { + vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); + } +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. 
+ * + * \blocked + * + * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned + * + * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void LoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); +} + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
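+ *
+ * \par Snippet
+ * A minimal sketch of a guarded striped load, assuming 128 threads owning 4
+ * integer items each and \p num_valid remaining items in \p d_data:
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_valid, ...)
+ * {
+ *     // Load a partial tile in striped order; out-of-range items are left unmodified
+ *     int thread_data[4];
+ *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data, num_valid);
+ *
+ * \endcode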
+ */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + int BLOCK_THREADS, + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
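+ *
+ * \par Snippet
+ * A minimal sketch of a warp-striped load, assuming a 1D block whose size is a
+ * multiple of the warp size and 4 items per thread; the kernel and its
+ * parameters are illustrative.
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data)
+ * {
+ *     // Within each warp's segment, a thread's consecutive items are
+ *     // separated by the warp width
+ *     int thread_data[4];
+ *     cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data);
+ * }
+ * \endcode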
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
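+ *
+ * \par Snippet
+ * A minimal sketch, assuming 4 items per thread and \p -1 as the out-of-bounds
+ * fill value; the kernel and its parameters are illustrative.
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, int valid_items)
+ * {
+ *     // Items at or beyond valid_items are left at the -1 default
+ *     int thread_data[4];
+ *     cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data, valid_items, -1);
+ * }
+ * \endcode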
+ */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group + +/** @} */ // end group UtilIo + + + +//----------------------------------------------------------------------------- +// Generic BlockLoad abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ +enum BlockLoadAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * directly from memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_LOAD_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * from memory using CUDA's built-in vectorized loads as a coalescing optimization. + * For example, ld.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector load width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p InputIteratorTis not a simple pointer type + * - The block input offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_LOAD_VECTORIZE, + + /** + * \par Overview + * + * A [striped arrangement](index.html#sec5sec3) of data is read + * efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. 
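+     *   - Typically requested through the \p ALGORITHM template parameter of
+     *     cub::BlockLoad, e.g. (illustrative specialization):
+     *     \code
+     *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE> BlockLoadT;
+     *     \endcode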
+ */ + BLOCK_LOAD_TRANSPOSE, + + + /** + * \par Overview + * + * A [warp-striped arrangement](index.html#sec5sec3) of data is + * read efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly larger latencies than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + * - Provisions more shared storage, but incurs smaller latencies than the + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE, + + + /** + * \par Overview + * + * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and then is locally transposed into a + * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory + * requirement, only one warp's worth of shared memory is provisioned and is + * subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, +}; + + +/** + * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockLoad class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockLoad can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory using CUDA's built-in vectorized loads as a + * coalescing optimization. 
[More...](\ref cub::BlockLoadAlgorithm)
+ *   -# cub::BLOCK_LOAD_TRANSPOSE.  A [striped arrangement](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# cub::BLOCK_LOAD_WARP_TRANSPOSE.  A [warp-striped arrangement](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED.  A [warp-striped arrangement](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [blocked arrangement](index.html#sec5sec3) one warp at a time.  [More...](\ref cub::BlockLoadAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockLoad}
+ * \par
+ * The code snippet below illustrates the loading of a linear
+ * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+ * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+ * meaning memory references are efficiently coalesced using a warp-striped access
+ * pattern (after which items are locally reordered among threads).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+ *
+ *     // Allocate shared memory for BlockLoad
+ *     __shared__ typename BlockLoad::TempStorage temp_storage;
+ *
+ *     // Load a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     BlockLoad(temp_storage).Load(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, ....
+ * The set of \p thread_data across the block of threads will be
+ * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
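+ *
+ * \par
+ * The same shared allocation can be re-used by a subsequent collective once all
+ * threads have finished loading; a sketch, assuming a matching cub::BlockStore
+ * specialization (names are illustrative):
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_in, int *d_out, ...)
+ * {
+ *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>   BlockLoadT;
+ *     typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
+ *
+ *     // One shared allocation serves both collectives, one after the other
+ *     __shared__ union {
+ *         typename BlockLoadT::TempStorage  load;
+ *         typename BlockStoreT::TempStorage store;
+ *     } temp_storage;
+ *
+ *     int thread_data[4];
+ *     BlockLoadT(temp_storage.load).Load(d_in, thread_data);
+ *     __syncthreads();    // barrier before the union is re-used for storing
+ *     BlockStoreT(temp_storage.store).Store(d_out, thread_data);
+ * }
+ * \endcode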
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockLoad +{ +private: + + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Load helper + template + struct LoadInternal; + + + /** + * BLOCK_LOAD_DIRECT specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_VECTORIZE specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + 
InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT> + __device__ __forceinline__ void Load( + CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); + } + + /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Load( + _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + {}; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range, 
with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + {}; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + {}; + + /// Alias wrapper allowing storage to be 
unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef LoadInternal InternalLoad; + + + /// Shared memory storage layout type + typedef typename InternalLoad::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + /// \smemstorage{BlockLoad} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockLoad() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
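+     *
+     * \par Snippet
+     * A minimal sketch, assuming an illustrative BlockLoad specialization; the
+     * caller declares the shared allocation and passes it in.
+     * \code
+     * typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
+     * __shared__ typename BlockLoadT::TempStorage temp_storage;
+     *
+     * BlockLoadT loader(temp_storage);   // the collective is bound to the caller's allocation
+     * \endcode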
+ */ + __device__ __forceinline__ BlockLoad( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Load a linear segment of items from memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads + * being unmasked to load portions of valid data (and other items remaining unassigned). 
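+     *
+     * \par
+     * A typical guarded call for the last, partially full tile, assuming
+     * \p block_offset is the tile's starting index and \p num_items is the total
+     * input size (both illustrative):
+     * \code
+     * int valid_items = num_items - block_offset;   // may be < 512 for the last tile
+     * BlockLoad(temp_storage).Load(d_data + block_offset, thread_data, valid_items);
+     * \endcode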
+ * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., + * \p valid_items is \p 5, and the out-of-bounds default is \p -1. + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads + * being unmasked to load portions of valid data (and other items are assigned \p -1) + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + } + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/block_radix_rank.cuh b/dnn/src/cuda/cub/block/block_radix_rank.cuh new file mode 100644 index 00000000..c26451c6 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_radix_rank.cuh @@ -0,0 +1,696 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block + */ + +#pragma once + +#include + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_scan.cuh" +#include "../block/block_scan.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. + * \ingroup BlockModule + * + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam RADIX_BITS The number of radix bits per digit place + * \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * Blah... + * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par Examples + * \par + * - Example 1: Simple radix rank of 32-bit integer keys + * \code + * #include + * + * template + * __global__ void ExampleKernel(...) + * { + * + * \endcode + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRank +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + // Integer type for digit counters (to be packed into words of type PackedCounters) + typedef unsigned short DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte), + unsigned long long, + unsigned int>::Type PackedCounter; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // The number of packed counters per thread (plus one for padding) + PADDED_COUNTER_LANES = COUNTER_LANES + 1, + RAKING_SEGMENT = PADDED_COUNTER_LANES, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), + }; + +private: + + + /// BlockScan type + typedef BlockScan< + PackedCounter, + BLOCK_DIM_X, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScan; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) _TempStorage + { + union Aliasable + { + DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; + + } aliasable; + + // Storage for scanning local ranks + typename BlockScan::TempStorage block_scan; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /// Copy of raking segment, promoted to registers + PackedCounter cached_segment[RAKING_SEGMENT]; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal storage allocator + */ + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Performs upsweep raking reduction, returning the aggregate + */ + __device__ __forceinline__ PackedCounter Upsweep() + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + PackedCounter *raking_ptr; + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data into registers + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + cached_segment[i] = smem_raking_ptr[i]; + } + raking_ptr = cached_segment; + } + else + { + 
raking_ptr = smem_raking_ptr; + } + + return internal::ThreadReduce(raking_ptr, Sum()); + } + + + /// Performs exclusive downsweep raking scan + __device__ __forceinline__ void ExclusiveDownsweep( + PackedCounter raking_partial) + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + + PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? + cached_segment : + smem_raking_ptr; + + // Exclusive raking downsweep scan + internal::ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data back to smem + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + smem_raking_ptr[i] = cached_segment[i]; + } + } + } + + + /** + * Reset shared memory digit counters + */ + __device__ __forceinline__ void ResetCounters() + { + // Reset shared memory digit counters + #pragma unroll + for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++) + { + *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0; + } + } + + + /** + * Block-scan prefix callback + */ + struct PrefixCallBack + { + __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate) + { + PackedCounter block_prefix = 0; + + // Propagate totals in packed fields + #pragma unroll + for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) + { + block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED); + } + + return block_prefix; + } + }; + + + /** + * Scan shared memory digit counters. + */ + __device__ __forceinline__ void ScanCounters() + { + // Upsweep scan + PackedCounter raking_partial = Upsweep(); + + // Compute exclusive sum + PackedCounter exclusive_partial; + PrefixCallBack prefix_call_back; + BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back); + + // Downsweep scan with exclusive partial + ExclusiveDownsweep(exclusive_partial); + } + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. 
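+     *
+     * \par Snippet
+     * A minimal ranking sketch, assuming 128 threads, a 4-bit digit place and
+     * 4 keys per thread; names are illustrative.
+     * \code
+     * typedef cub::BlockRadixRank<128, 4, false> BlockRadixRankT;
+     * __shared__ typename BlockRadixRankT::TempStorage temp_storage;
+     *
+     * unsigned int keys[4];
+     * int          ranks[4];
+     * // ... obtain keys in a blocked arrangement ...
+     *
+     * // Rank the 4-bit digit starting at bit 0 of each key
+     * BlockRadixRankT(temp_storage).RankKeys(keys, ranks, 0, 4);
+     * \endcode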
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit + DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem + + // Reset shared memory digit counters + ResetCounters(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Get digit + unsigned int digit = BFE(keys[ITEM], current_bit, num_bits); + + // Get sub-counter + unsigned int sub_counter = digit >> LOG_COUNTER_LANES; + + // Get counter lane + unsigned int counter_lane = digit & (COUNTER_LANES - 1); + + if (IS_DESCENDING) + { + sub_counter = PACKING_RATIO - 1 - sub_counter; + counter_lane = COUNTER_LANES - 1 - counter_lane; + } + + // Pointer to smem digit counter + digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter]; + + // Load thread-exclusive prefix + thread_prefixes[ITEM] = *digit_counters[ITEM]; + + // Store inclusive prefix + *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; + } + + CTA_SYNC(); + + // Scan shared memory counters + ScanCounters(); + + CTA_SYNC(); + + // Extract the local ranks of each key + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Add in thread block exclusive prefix + ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM]; + } + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + // Rank keys + RankKeys(keys, ranks, current_bit, num_bits); + + // Get the inclusive and exclusive digit totals corresponding to the calling thread. + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the + // first counter column, resulting in unavoidable bank conflicts.) 
+ unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1)); + unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES); + + exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter]; + } + } + } +}; + + + + + +/** + * Radix-rank using match.any + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRankMatch +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + typedef int32_t RankT; + typedef int32_t DigitCounterT; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + PADDED_WARPS = ((WARPS & 0x1) == 0) ? + WARPS + 1 : + WARPS, + + COUNTERS = PADDED_WARPS * RADIX_DIGITS, + RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS, + PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ? + RAKING_SEGMENT + 1 : + RAKING_SEGMENT, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), + }; + +private: + + /// BlockScan type + typedef BlockScan< + DigitCounterT, + BLOCK_THREADS, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScanT; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) _TempStorage + { + typename BlockScanT::TempStorage block_scan; + + union __align__(16) Aliasable + { + volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; + DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; + + } aliasable; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRankMatch( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. 
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + // Initialize shared digit counters + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; + + CTA_SYNC(); + + // Each warp will strip-mine its section of input, one strip at a time + + volatile DigitCounterT *digit_counters[KEYS_PER_THREAD]; + uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; + uint32_t lane_mask_lt = LaneMaskLt(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // My digit + uint32_t digit = BFE(keys[ITEM], current_bit, num_bits); + + if (IS_DESCENDING) + digit = RADIX_DIGITS - digit - 1; + + // Mask of peers who have same digit as me + uint32_t peer_mask = MatchAny(digit); + + // Pointer to smem digit counter for this key + digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id]; + + // Number of occurrences in previous strips + DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of peers having same digit as me + int32_t digit_count = __popc(peer_mask); + + // Number of lower-ranked peers having same digit seen so far + int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt); + + if (peer_digit_prefix == 0) + { + // First thread for each digit updates the shared warp counter + *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count); + } + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of prior keys having same digit + ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); + } + + CTA_SYNC(); + + // Scan warp counters + + DigitCounterT scan_counters[PADDED_RAKING_SEGMENT]; + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM]; + + BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters); + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; + + CTA_SYNC(); + + // Seed ranks with counter values from previous warps + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + ranks[ITEM] += *digit_counters[ITEM]; + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + RankKeys(keys, ranks, current_bit, num_bits); + + // Get exclusive count for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0]; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/block/block_radix_sort.cuh b/dnn/src/cuda/cub/block/block_radix_sort.cuh new file mode 100644 index 00000000..ac0c9f85 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_radix_sort.cuh @@ -0,0 +1,863 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. + */ + + +#pragma once + +#include "block_exchange.cuh" +#include "block_radix_rank.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. 
![](sorting_logo.png) + * \ingroup BlockModule + * + * \tparam KeyT KeyT type + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam ValueT [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) + * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * - BlockRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Within each key, the implementation treats fixed-length + * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting + * method can only be applied to unsigned integral types, BlockRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * - \rowmajor + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockRadixSort} + * \par + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * ... + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
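The snippet in the comment above shows the intended usage, but its include path and template arguments appear with their angle-bracket contents elided. A hedged, self-contained version of the same 128-thread, 4-keys-per-thread sort, with the arguments restored and illustrative load/store loops added (the `d_keys` parameter and the include path are assumptions):

```cuda
// Hedged sketch: block-wide ascending radix sort of 512 int keys (128 threads x 4 keys each).
#include <cub/cub.cuh>   // or the vendored dnn/src/cuda/cub headers added by this commit

__global__ void ExampleKernel(int *d_keys)
{
    // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;

    // Allocate shared memory for BlockRadixSort
    __shared__ typename BlockRadixSort::TempStorage temp_storage;

    // Obtain a segment of 4 consecutive keys per thread (blocked arrangement)
    int thread_keys[4];
    for (int i = 0; i < 4; ++i)
        thread_keys[i] = d_keys[threadIdx.x * 4 + i];

    // Collectively sort the keys across the block
    BlockRadixSort(temp_storage).Sort(thread_keys);

    // Write back; keys remain in a blocked arrangement after Sort()
    for (int i = 0; i < 4; ++i)
        d_keys[threadIdx.x * 4 + i] = thread_keys[i];
}
```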
+ * + */ +template < + typename KeyT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + typename ValueT = NullType, + int RADIX_BITS = 4, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixSort +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + // Whether or not there are values to be trucked along with keys + KEYS_ONLY = Equals::VALUE, + }; + + // KeyT traits and unsigned bits type + typedef Traits KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + + /// Ascending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + false, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + AscendingBlockRadixRank; + + /// Descending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + true, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + DescendingBlockRadixRank; + + /// BlockExchange utility type for keys + typedef BlockExchange BlockExchangeKeys; + + /// BlockExchange utility type for values + typedef BlockExchange BlockExchangeValues; + + /// Shared memory storage layout type + union _TempStorage + { + typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; + typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + /// Rank keys (specialized for ascending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// Rank keys (specialized for descending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT 
(&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); + } + + /// ExchangeValues (specialized for key-value sort, to-striped arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT (&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + } + + /// ExchangeValues (specialized for keys-only sort) + template + __device__ __forceinline__ void ExchangeValues( + ValueT (&/*values*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + {} + + /// Sort blocked arrangement + template + __device__ __forceinline__ void SortBlocked( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit if done + if (begin_bit >= end_bit) break; + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +public: + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Sort blocked -> striped arrangement + template + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked 
keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Check if this is the last pass + if (begin_bit >= end_bit) + { + // Last pass exchanges keys through shared memory in striped arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); + + // Last pass exchanges through shared memory in striped arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit + break; + } + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /// \smemstorage{BlockRadixSort} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangements) + *********************************************************************/ + //@{ + + /** + * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
+ */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + /** + * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. 
+ * + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangement -> striped arrangement) + *********************************************************************/ + //@{ + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. 
+ * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. 
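The notes above explain how to reorder more than one tile of associated data: sort the keys against an index array, then use the reordered indices as a gather vector. A hedged sketch of that idiom with SortBlockedToStriped (the `d_payload` array, the sizes, and the shared staging buffer are illustrative, not part of the library):

```cuda
// Hedged sketch: carry original offsets as the value tile, then gather an extra payload.
#include <cub/cub.cuh>

__global__ void SortWithPayloadKernel(int *d_keys, float *d_payload)
{
    // int keys, int values (the values will be the keys' original offsets)
    typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;

    __shared__ typename BlockRadixSort::TempStorage temp_storage;
    __shared__ float payload_smem[128 * 4];       // staging area for the extra tile

    int thread_keys[4];
    int thread_idx[4];
    for (int i = 0; i < 4; ++i)
    {
        int offset           = threadIdx.x * 4 + i;   // blocked arrangement
        thread_keys[i]       = d_keys[offset];
        thread_idx[i]        = offset;                // value = where this key came from
        payload_smem[offset] = d_payload[offset];
    }
    __syncthreads();

    // Sort keys, trucking the original offsets along; results end up striped
    BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_idx);

    // Use the reordered offsets as a gather vector for the payload tile
    for (int i = 0; i < 4; ++i)
    {
        int out        = threadIdx.x + i * 128;       // striped arrangement
        d_keys[out]    = thread_keys[i];
        d_payload[out] = payload_smem[thread_idx[i]];
    }
}
```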
+ * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. + * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + +}; + +/** + * \example example_block_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/block_raking_layout.cuh b/dnn/src/cuda/cub/block/block_raking_layout.cuh new file mode 100644 index 00000000..35006168 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_raking_layout.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. + */ + + +#pragma once + +#include "../util_macro.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) + * \ingroup BlockModule + * + * \par Overview + * This type facilitates a shared memory usage pattern where a block of CUDA + * threads places elements into shared memory and then reduces the active + * parallelism to one "raking" warp of threads for serially aggregating consecutive + * sequences of shared items. Padding is inserted to eliminate bank conflicts + * (for most data types). + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_THREADS The thread block size in threads. 
+ * \tparam PTX_ARCH [optional] \ptxversion + */ +template < + typename T, + int BLOCK_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +struct BlockRakingLayout +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// The total number of elements that need to be cooperatively reduced + SHARED_ELEMENTS = BLOCK_THREADS, + + /// Maximum number of warp-synchronous raking threads + MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), + + /// Number of raking elements per warp-synchronous raking thread (rounded up) + SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, + + /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) + RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, + + /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) + HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), + + /// Degree of bank conflicts (e.g., 4-way) + CONFLICT_DEGREE = (HAS_CONFLICTS) ? + (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : + 1, + + /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load + USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), + + /// Total number of elements in the raking grid + GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), + + /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) + UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), + }; + + + /** + * \brief Shared memory storage type + */ + struct __align__(16) _TempStorage + { + T buff[BlockRakingLayout::GRID_ELEMENTS]; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /** + * \brief Returns the location for the calling thread to place data into the grid + */ + static __device__ __forceinline__ T* PlacementPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + // Offset for partial + unsigned int offset = linear_tid; + + // Add in one padding element for every segment + if (USE_SEGMENT_PADDING > 0) + { + offset += offset / SEGMENT_LENGTH; + } + + // Incorporating a block of padding partials every shared memory segment + return temp_storage.Alias().buff + offset; + } + + + /** + * \brief Returns the location for the calling thread to begin sequential raking + */ + static __device__ __forceinline__ T* RakingPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/block_reduce.cuh b/dnn/src/cuda/cub/block/block_reduce.cuh new file mode 100644 index 00000000..261f2ea6 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_reduce.cuh @@ -0,0 +1,607 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
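BlockRakingLayout only describes the padded grid; the reduction and scan specializations combine it with a serial per-segment pass and a warp-synchronous step. A hedged, stand-alone sketch of that raking pattern for a 128-thread block sum (with 128 threads, SEGMENT_LENGTH is 4 and RAKING_THREADS equals the warp size, which the shuffle step below assumes):

```cuda
// Hedged sketch (not from the diff): block-wide sum using BlockRakingLayout directly.
// Phase 1: each thread parks one partial in the padded raking grid.
// Phase 2: one warp rakes SEGMENT_LENGTH-long runs serially.
// Phase 3: a warp-shuffle reduction combines the 32 raking partials.
#include <cub/cub.cuh>

__global__ void RakingSumKernel(const int *d_in, int *d_block_sums)
{
    typedef cub::BlockRakingLayout<int, 128> RakingLayout;  // SEGMENT_LENGTH=4, RAKING_THREADS=32
    __shared__ typename RakingLayout::TempStorage raking_grid;

    *RakingLayout::PlacementPtr(raking_grid, threadIdx.x) =
        d_in[blockIdx.x * 128 + threadIdx.x];
    __syncthreads();

    if (threadIdx.x < RakingLayout::RAKING_THREADS)
    {
        // Serially reduce this raking thread's segment of the grid
        int *segment = RakingLayout::RakingPtr(raking_grid, threadIdx.x);
        int partial  = segment[0];
        #pragma unroll
        for (int i = 1; i < RakingLayout::SEGMENT_LENGTH; ++i)
            partial += segment[i];

        // Warp-synchronous reduction across the 32 raking threads
        for (int offset = RakingLayout::RAKING_THREADS / 2; offset > 0; offset >>= 1)
            partial += __shfl_down_sync(0xFFFFFFFF, partial, offset);

        if (threadIdx.x == 0)
            d_block_sums[blockIdx.x] = partial;
    }
}
```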
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_reduce_raking.cuh" +#include "specializations/block_reduce_raking_commutative_only.cuh" +#include "specializations/block_reduce_warp_reductions.cuh" +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * BlockReduceAlgorithm enumerates alternative algorithms for parallel + * reduction across a CUDA thread block. + */ +enum BlockReduceAlgorithm +{ + + /** + * \par Overview + * An efficient "raking" reduction algorithm that only supports commutative + * reduction operators (true for most operations, e.g., addition). + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Threads in warps other than the first warp place + * their partial reductions into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within the first + * warp continue to accumulate by raking across segments of shared partial reductions + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE + * and is preferable when the reduction operator is commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, + + + /** + * \par Overview + * An efficient "raking" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. \blocked. + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a + * single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs more communication than BLOCK_REDUCE_RAKING + * and is only preferable when the reduction operator is non-commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING, + + + /** + * \par Overview + * A quick "tiled warp-reductions" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. + * + * \par + * Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style + * reduction within each warp. + * -# A propagation phase where the warp reduction outputs in each warp are + * updated with the aggregate from each preceding warp. + * + * \par + * \image html block_scan_warpscans.png + *
\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING + * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall + * throughput across the GPU. However turn-around latency may be lower and + * thus useful when the GPU is under-occupied. + */ + BLOCK_REDUCE_WARP_REDUCTIONS, +}; + + +/****************************************************************************** + * Block reduce + ******************************************************************************/ + +/** + * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being reduced + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - \rowmajor + * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: + * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Very efficient (only one synchronization barrier). + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - Every thread has a valid input (i.e., full vs. partial-tiles) + * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockReduce} + * \par + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + typedef BlockReduceWarpReductions WarpReductions; + typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; + typedef BlockReduceRaking Raking; + + /// Internal specialization type + typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), + WarpReductions, + typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), + RakingCommutativeOnly, + Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking + + /// Shared memory storage layout type for BlockReduce + typedef typename InternalBlockReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockReduce() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, reduction_op); + return Reduce(partial, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * if (threadIdx.x < num_valid) thread_data = ... 
+ * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction functor + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + else + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + } + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); + } + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ */ + template + __device__ __forceinline__ T Sum( + T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, cub::Sum()); + return Sum(partial); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item (up to num_items) + * int thread_data; + * if (threadIdx.x < num_valid) + * thread_data = ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + else + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + } + + + //@} end member group +}; + +/** + * \example example_block_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/block_scan.cuh b/dnn/src/cuda/cub/block/block_scan.cuh new file mode 100644 index 00000000..27ea7ed4 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_scan.cuh @@ -0,0 +1,2126 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
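Putting the BlockReduce pieces above together: a hedged example of the partial-tile Sum overload, with the algorithm selected explicitly through the template parameter described in the class documentation (the kernel name, pointers, and launch geometry are assumptions):

```cuda
// Hedged sketch: per-block sum of a possibly partial tile; result is defined for thread0 only.
#include <cub/cub.cuh>

__global__ void BlockSumKernel(const int *d_in, int *d_block_sums, int num_items)
{
    // Specialize BlockReduce for a 1D block of 128 threads on type int,
    // using the commutative-only raking variant described above
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduce;

    __shared__ typename BlockReduce::TempStorage temp_storage;

    int block_offset = blockIdx.x * 128;
    int num_valid    = min(num_items - block_offset, 128);   // threads with real input

    int thread_data = 0;
    if (threadIdx.x < num_valid)
        thread_data = d_in[block_offset + threadIdx.x];

    // Block-wide sum over the first num_valid threads
    int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);

    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = aggregate;
}
```

Swapping in a functor such as cub::Max() via the Reduce overloads documented above follows the same pattern.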
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_scan_raking.cuh" +#include "specializations/block_scan_warp_scans.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. + */ +enum BlockScanAlgorithm +{ + + /** + * \par Overview + * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. + * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_raking.png + *
\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
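+ *
+ * \par
+ * A minimal sketch of selecting this variant explicitly (the <int, 128>
+ * specialization below is an illustrative assumption):
+ * \code
+ * // Request the raking algorithm via the third template parameter
+ * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ * \endcode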
+ * + * \par Performance Considerations + * - Although this variant may suffer longer turnaround latencies when the + * GPU is under-occupied, it can often provide higher overall throughput + * across the GPU when suitably occupied. + */ + BLOCK_SCAN_RAKING, + + + /** + * \par Overview + * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at + * the expense of higher register pressure. Raking threads preserve their + * "upsweep" segment of values in registers while performing warp-synchronous + * scan, allowing the "downsweep" not to re-read them from shared memory. + */ + BLOCK_SCAN_RAKING_MEMOIZE, + + + /** + * \par Overview + * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. + * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_warpscans.png + *
\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
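+ *
+ * \par
+ * A minimal sketch of selecting this variant explicitly (the <int, 128>
+ * specialization below is an illustrative assumption; note that BlockScan
+ * internally falls back to BLOCK_SCAN_RAKING when the block size is not a
+ * multiple of the architectural warp size):
+ * \code
+ * // Request the warp-scans algorithm via the third template parameter
+ * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ * \endcode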
+ * + * \par Performance Considerations + * - Although this variant may suffer lower overall throughput across the + * GPU because due to a heavy reliance on inefficient warpscans, it can + * often provide lower turnaround latencies when the GPU is under-occupied. + */ + BLOCK_SCAN_WARP_SCANS, +}; + + +/****************************************************************************** + * Block scan + ******************************************************************************/ + +/** + * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being scanned + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * - \rowmajor + * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: + * -# cub::BLOCK_SCAN_RAKING. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Invokes a minimal number of minimal block-wide synchronization barriers (only + * one or two depending on algorithm selection) + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Prefix sum variants (vs. generic scan) + * - \blocksize + * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockScan} + * \par + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. + * The corresponding output \p thread_data in those threads will be + * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy + * cannot be used with thread block sizes not a multiple of the + * architectural warp size. + */ + static const BlockScanAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? + BLOCK_SCAN_RAKING : + ALGORITHM; + + typedef BlockScanWarpScans WarpScans; + typedef BlockScanRaking Raking; + + /// Define the delegate type for the desired algorithm + typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), + WarpScans, + Raking>::Type InternalBlockScan; + + /// Shared memory storage layout type for BlockScan + typedef typename InternalBlockScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Public types + ******************************************************************************/ +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
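+ *
+ * \par
+ * A minimal sketch contrasting this constructor with the storage-supplied
+ * overload documented next (the <int, 128> specialization is an illustrative
+ * assumption):
+ * \code
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Private static shared-memory allocation (this constructor)
+ * BlockScan scan_private;
+ *
+ * // Caller-provided temporary storage (constructor documented below)
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ * BlockScan scan_supplied(temp_storage);
+ * \endcode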
+ */ + __device__ __forceinline__ BlockScan() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. 
+ * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, ..., 127. 
+ * The output for the second segment will be 128, 129, ..., 255. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
+ * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. + * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + + //@} end member group // Exclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
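+ *
+ * \par
+ * A minimal sketch of such a call-back functor (illustrative only; it mirrors
+ * the running-maximum behavior of the full snippet further below, and the
+ * MaxPrefixOp name is not part of the library):
+ * \code
+ * struct MaxPrefixOp
+ * {
+ *     int running_max;   // running prefix carried across tiles (functors may be stateful)
+ *
+ *     __device__ int operator()(int block_aggregate)
+ *     {
+ *         int old_prefix = running_max;
+ *         running_max = (block_aggregate > running_max) ? block_aggregate : running_max;
+ *         return old_prefix;   // only lane0's return value is applied as the block-wide prefix
+ *     }
+ * };
+ * \endcode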
+ * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. + * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group // Inclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage.scan).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. + * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + //@} end member group +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, single datum per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); + } + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + //@} end member group +#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Inclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. 
The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + InclusiveScan(input, output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InclusiveScan(input, output, cub::Sum(), block_aggregate); + } + + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 1, 2, ..., 128. + * The output for the second segment will be 129, 130, ..., 256. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0]); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be + * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad BlockLoad;
+ * typedef cub::BlockStore BlockStore;
+ * typedef cub::BlockScan BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixCallbackOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * BlockScan(temp_storage.scan).InclusiveSum(
+ * thread_data, thread_data, prefix_op);
+ * CTA_SYNC();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+ * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
+ * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveSum(input[0], output[0], block_prefix_callback_op);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+ // Exclusive thread block-scan
+ ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
+
+ // Inclusive scan in registers with prefix as seed
+ internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
+ }
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Inclusive prefix scan operations
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \rowmajor
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include // or equivalently
+ *
+ * __global__ void ExampleKernel(...)
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
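The ScanOp parameter documented above only needs to be an associative binary functor exposing T operator()(const T &a, const T &b); cub::Max() in the snippets is just one instance. A minimal sketch of a user-defined functor that could be passed in its place (the AbsMax name is illustrative and not part of CUB):

// User-defined binary scan functor: returns the larger absolute value.
// Any associative functor with this operator() shape can serve as ScanOp.
struct AbsMax
{
    __device__ __forceinline__ int operator()(const int &a, const int &b) const
    {
        int abs_a = (a < 0) ? -a : a;
        int abs_b = (b < 0) ? -b : b;
        return (abs_b > abs_a) ? abs_b : abs_a;
    }
};

// Used exactly like cub::Max() in the snippets above, e.g. inside a kernel:
//     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, AbsMax());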
+ * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. + * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan (with no initial value) + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. 
+ * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage.scan).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. + * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + } + + //@} end member group + + +}; + +/** + * \example example_block_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/block_shuffle.cuh b/dnn/src/cuda/cub/block/block_shuffle.cuh new file mode 100644 index 00000000..a0cc71d2 --- /dev/null +++ b/dnn/src/cuda/cub/block/block_shuffle.cuh @@ -0,0 +1,305 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. 
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * It is commonplace for blocks of threads to rearrange data items between + * threads. The BlockShuffle abstraction allows threads to efficiently shift items + * either (a) up to their successor or (b) down to their predecessor. + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockShuffle +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + enum + { + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T prev[BLOCK_THREADS]; + T next[BLOCK_THREADS]; + }; + + +public: + + /// \smemstorage{BlockShuffle} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockShuffle() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockShuffle( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Shuffle movement + *********************************************************************/ + //@{ + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. 
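A minimal usage sketch of the Offset shuffle described above, assuming a 1D block of 128 threads on type int; the kernel name and the d_data pointer are illustrative and not part of the header:

#include <cub/cub.cuh>

__global__ void ShiftDownKernel(int *d_data)
{
    // Specialize BlockShuffle for a 1D block of 128 threads on type int
    typedef cub::BlockShuffle<int, 128> BlockShuffle;

    // Allocate shared memory for BlockShuffle
    __shared__ typename BlockShuffle::TempStorage temp_storage;

    // Each thread contributes one item
    int thread_data = d_data[threadIdx.x];

    // Each thread receives the item of its predecessor (distance = -1);
    // thread 0's output is left unchanged, per the range guarantee documented above
    BlockShuffle(temp_storage).Offset(thread_data, thread_data, -1);

    d_data[threadIdx.x] = thread_data;
}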
+ * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Offset( + T input, ///< [in] The input item from the calling thread (threadi) + T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 + int distance = 1) ///< [in] Offset distance (may be negative) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS)) + output = temp_storage[linear_tid + distance].prev; + } + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Rotate( + T input, ///< [in] The calling thread's input item + T& output, ///< [out] The \p input item from thread thread(i+distance>)% (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 + unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + unsigned int offset = threadIdx.x + distance; + if (offset >= BLOCK_THREADS) + offset -= BLOCK_THREADS; + + output = temp_storage[offset].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by threadBLOCK_THREADS-1. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from threadBLOCK_THREADS-1, provided to all threads + { + Up(input, prev); + block_suffix = temp_storage[BLOCK_THREADS - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. 
+ { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread0. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. + T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads + { + Up(input, prev); + block_prefix = temp_storage[BLOCK_THREADS - 1].prev; + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/block_store.cuh b/dnn/src/cuda/cub/block/block_store.cuh new file mode 100644 index 00000000..648bf9ff --- /dev/null +++ b/dnn/src/cuda/cub/block/block_store.cuh @@ -0,0 +1,1000 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Operations for writing linear segments of data from the CUDA thread block + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[ITEM] = items[ITEM]; + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) + { + thread_itr[ITEM] = items[ITEM]; + } + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, + * which is the default starting offset returned by \p cudaMalloc() + * + * \par + * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) 
+ *
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ *
+ */
+ template <
+ typename T,
+ int ITEMS_PER_THREAD>
+ __device__ __forceinline__ void StoreDirectBlockedVectorized(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+ T *block_ptr, ///< [in] Input pointer for storing from
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ {
+ enum
+ {
+ // Maximum CUDA vector size is 4 elements
+ MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+
+ // Vector size must be a power of two and an even divisor of the items per thread
+ VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
+ MAX_VEC_SIZE :
+ 1,
+
+ VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
+ };
+
+ // Vector type
+ typedef typename CubVector<T, VEC_SIZE>::Type Vector;
+
+ // Alias global pointer
+ Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));
+
+ // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
+ Vector raw_vector[VECTORS_PER_THREAD];
+ T *raw_items = reinterpret_cast<T*>(raw_vector);
+
+ // Copy
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ raw_items[ITEM] = items[ITEM];
+ }
+
+ // Direct-store using vector types
+ StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
+ }
+
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Striped arrangement I/O (direct)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS The thread block size in threads
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator.
+ */
+ template <
+ int BLOCK_THREADS,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename OutputIteratorT>
+ __device__ __forceinline__ void StoreDirectStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+ OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ {
+ OutputIteratorT thread_itr = block_itr + linear_tid;
+
+ // Store directly in striped order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
+ }
+ }
+
+
+ /**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS The thread block size in threads
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator.
+ */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } + } +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } + } +} + + +//@} end member group + + +/** @} */ // end group UtilIo + + +//----------------------------------------------------------------------------- +// Generic BlockStore abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + */ +enum BlockStoreAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_STORE_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written directly + * to memory using CUDA's built-in vectorized stores as a coalescing optimization. + * For example, st.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector store width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p OutputIteratorT is not a simple pointer type + * - The block output offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_STORE_VECTORIZE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. 
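These strategies are selected through the ALGORITHM template parameter of the BlockStore class documented further below; a minimal sketch of switching between them, assuming 128 threads with 4 ints per thread (the typedef names, d_out, and thread_data are illustrative):

// Blocked arrangement written directly to memory (BLOCK_STORE_DIRECT above)
typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_DIRECT>    BlockStoreDirect;

// Blocked arrangement locally transposed to a striped arrangement before storing
typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE> BlockStoreTranspose;

// Either specialization exposes the same collective interface inside a kernel:
//     __shared__ typename BlockStoreTranspose::TempStorage temp_storage;
//     int thread_data[4];
//     ...
//     BlockStoreTranspose(temp_storage).Store(d_out, thread_data);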
+ */ + BLOCK_STORE_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. + */ + BLOCK_STORE_WARP_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * To reduce the shared memory requirement, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. + */ + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + +}; + + +/** + * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam T The type of data to be written. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockStore class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockStore can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is written directly to memory using CUDA's built-in vectorized stores as a + * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_TRANSPOSE. 
A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockStore} + * \par + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockStore +{ +private: + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Store helper + template + struct StoreInternal; + + + /** + * BLOCK_STORE_DIRECT specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid 
items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_VECTORIZE specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Store( + T *block_ptr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS 
= CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T 
(&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef StoreInternal InternalStore; + + + /// Shared memory storage layout type + typedef typename InternalStore::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + + /// \smemstorage{BlockStore} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockStore() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockStore( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Store items into a linear segment of memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items); + } + + /** + * \brief Store items into a linear segment of memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. + * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with + * only the first two threads being unmasked to store portions of valid data. + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/specializations/block_histogram_atomic.cuh b/dnn/src/cuda/cub/block/specializations/block_histogram_atomic.cuh new file mode 100644 index 00000000..29db0df7 --- /dev/null +++ b/dnn/src/cuda/cub/block/specializations/block_histogram_atomic.cuh @@ -0,0 +1,82 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template +struct BlockHistogramAtomic +{ + /// Shared memory storage layout type + struct TempStorage {}; + + + /// Constructor + __device__ __forceinline__ BlockHistogramAtomic( + TempStorage &temp_storage) + {} + + + /// Composite data onto an existing histogram + template < + typename T, + typename CounterT, + int ITEMS_PER_THREAD> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Update histogram + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { + atomicAdd(histogram + items[i], 1); + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/specializations/block_histogram_sort.cuh b/dnn/src/cuda/cub/block/specializations/block_histogram_sort.cuh new file mode 100644 index 00000000..9ef417ad --- /dev/null +++ b/dnn/src/cuda/cub/block/specializations/block_histogram_sort.cuh @@ -0,0 +1,226 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template < + typename T, ///< Sample type + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int ITEMS_PER_THREAD, ///< The number of samples per thread + int BINS, ///< The number of bins into which histogram samples may fall + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockHistogramSort +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort< + T, + BLOCK_DIM_X, + ITEMS_PER_THREAD, + NullType, + 4, + (PTX_ARCH >= 350) ? 
true : false, + BLOCK_SCAN_WARP_SCANS, + cudaSharedMemBankSizeFourByte, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity< + T, + BLOCK_DIM_X, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockDiscontinuityT; + + /// Shared memory + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BINS]; + unsigned int run_end[BINS]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockHistogramSort( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + // Composite data onto an existing histogram + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort).Sort(items); + + CTA_SYNC(); + + // Initialize the shared memory's run_begin and run_end for each bin + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + + CTA_SYNC(); + + int flags[ITEMS_PER_THREAD]; // unused + + // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; + + CTA_SYNC(); + + // Composite into histogram + histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + + // Finish up with guarded composition if 
necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/specializations/block_reduce_raking.cuh b/dnn/src/cuda/cub/block/specializations/block_reduce_raking.cuh new file mode 100644 index 00000000..aff97fc9 --- /dev/null +++ b/dnn/src/cuda/cub/block/specializations/block_reduce_raking.cuh @@ -0,0 +1,226 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../block/block_raking_layout.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * + * Supports non-commutative binary reduction operators. Unlike commutative + * reduction operators (e.g., addition), the application of a non-commutative + * reduction operator (e.g, string concatenation) across a sequence of inputs must + * honor the relative ordering of items and partial reductions when applying the + * reduction operator. 
+ * + * Compared to the implementation of BlockReduceRaking (which does not support + * non-commutative operators), this implementation requires a few extra + * rounds of inter-thread communication. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRaking +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), + + /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two + WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, + + /// Whether or not accesses into smem are unguarded + RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, + + }; + + + /// Shared memory storage layout type + union _TempStorage + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp reduction_op, ///< [in] Binary scan operator + T *raking_segment, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + // Update partial if addend is in range + if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) + { + T addend = raking_segment[ITERATION]; + partial = reduction_op(partial, addend); + } + return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T * /*raking_segment*/, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + return partial; + } + + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
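// -----------------------------------------------------------------------------
// [Editor's note] Hedged usage sketch; not part of the original CUB sources.
// The internal BlockReduceRaking specialization above is normally reached
// through the public cub::BlockReduce collective. The commented kernel below
// illustrates that public API under the assumption that <cub/cub.cuh> is
// available; the kernel and buffer names are illustrative only.
//
//     #include <cub/cub.cuh>
//
//     __global__ void ExampleSumKernel(const int *d_in, int *d_block_sums)
//     {
//         // Specialize BlockReduce for a 1D block of 128 threads, one int each
//         typedef cub::BlockReduce<int, 128> BlockReduce;
//
//         // Shared memory required by the collective
//         __shared__ typename BlockReduce::TempStorage temp_storage;
//
//         int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];
//
//         // Block-wide sum; the aggregate is only valid in thread 0
//         int block_sum = BlockReduce(temp_storage).Sum(thread_data);
//
//         if (threadIdx.x == 0)
//             d_block_sums[blockIdx.x] = block_sum;
//     }
// -----------------------------------------------------------------------------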
+ template < + bool IS_FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + num_valid, + reduction_op); + } + else + { + // Place partial into shared memory grid. + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + int valid_raking_threads = (IS_FULL_TILE) ? + RAKING_THREADS : + (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH; + + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + valid_raking_threads, + reduction_op); + + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + + return Reduce(partial, num_valid, reduction_op); + } + + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/dnn/src/cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh new file mode 100644 index 00000000..454fdafa --- /dev/null +++ b/dnn/src/cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -0,0 +1,199 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. + */ + +#pragma once + +#include "block_reduce_raking.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRakingCommutativeOnly +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values + typedef BlockReduceRaking FallBack; + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Whether or not to use fall-back + USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), + + /// Number of raking threads + RAKING_THREADS = WARP_THREADS, + + /// Number of threads actually sharing items with the raking threads + SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, + }; + + /// WarpReduce utility type + typedef WarpReduce WarpReduce; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Shared memory storage layout type + union _TempStorage + { + struct + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRakingCommutativeOnly( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Sum(partial); + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, reduction_op, partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/specializations/block_reduce_warp_reductions.cuh b/dnn/src/cuda/cub/block/specializations/block_reduce_warp_reductions.cuh new file mode 100644 index 00000000..10ba303b --- /dev/null +++ b/dnn/src/cuda/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -0,0 +1,218 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../warp/warp_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
+ */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceWarpReductions +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// The logical warp size for warp reductions + LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + + /// Whether or not the logical warp size evenly divides the thread block size + EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) + }; + + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire thread block + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + int warp_id; + int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockReduceWarpReductions( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) + { + T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; + warp_aggregate = reduction_op(warp_aggregate, addend); + } + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + return warp_aggregate; + } + + + /// Returns block-wide aggregate in thread0. 
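// -----------------------------------------------------------------------------
// [Editor's note] Hedged illustration; not part of the original CUB sources.
// The strategy used above -- each warp reduces its own inputs, lane 0 of each
// warp publishes a per-warp aggregate, and a single thread combines those
// aggregates (cf. ApplyWarpAggregates) -- can be sketched with plain warp
// shuffles as follows. It assumes a block size that is a multiple of 32 and a
// full warp mask; all names are illustrative only.
//
//     __global__ void WarpReduceSketch(const int *d_in, int *d_block_sums)
//     {
//         __shared__ int warp_aggregates[32];       // at most 32 warps per block
//
//         int val     = d_in[blockIdx.x * blockDim.x + threadIdx.x];
//         int lane    = threadIdx.x % 32;
//         int warp_id = threadIdx.x / 32;
//
//         // Intra-warp tree reduction using shuffles
//         for (int offset = 16; offset > 0; offset /= 2)
//             val += __shfl_down_sync(0xffffffffu, val, offset);
//
//         // Lane 0 of each warp shares its warp-wide aggregate
//         if (lane == 0) warp_aggregates[warp_id] = val;
//         __syncthreads();
//
//         // Thread 0 serially folds the warp aggregates into the block total
//         if (threadIdx.x == 0)
//         {
//             int block_sum = 0;
//             for (int w = 0; w < (int) (blockDim.x / 32); ++w)
//                 block_sum += warp_aggregates[w];
//             d_block_sums[blockIdx.x] = block_sum;
//         }
//     }
// -----------------------------------------------------------------------------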
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + // Share lane aggregates + if (lane_id == 0) + { + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + } + + CTA_SYNC(); + + // Update total aggregate in warp 0, lane 0 + if (linear_tid == 0) + { + warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); + } + + return warp_aggregate; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + int warp_offset = (warp_id * LOGICAL_WARP_SIZE); + int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? + LOGICAL_WARP_SIZE : + num_valid - warp_offset; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( + input, + warp_num_valid, + cub::Sum()); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + int warp_offset = warp_id * LOGICAL_WARP_SIZE; + int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? + LOGICAL_WARP_SIZE : + num_valid - warp_offset; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( + input, + warp_num_valid, + reduction_op); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/specializations/block_scan_raking.cuh b/dnn/src/cuda/cub/block/specializations/block_scan_raking.cuh new file mode 100644 index 00000000..a855cda0 --- /dev/null +++ b/dnn/src/cuda/cub/block/specializations/block_scan_raking.cuh @@ -0,0 +1,666 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + +/** + * \file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../block/block_raking_layout.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../thread/thread_scan.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. 
+ */ +template < + typename T, ///< Data type being scanned + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanRaking +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + }; + + /// WarpScan utility type + typedef WarpScan WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + T block_aggregate; ///< Block aggregate + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + T cached_segment[SEGMENT_LENGTH]; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /// Templated reduction + template + __device__ __forceinline__ T GuardedReduce( + T* raking_ptr, ///< [in] Input array + ScanOp scan_op, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) + { + T addend = raking_ptr[ITERATION]; + raking_partial = scan_op(raking_partial, addend); + } + + return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); + } + + + /// Templated reduction (base case) + template + __device__ __forceinline__ T GuardedReduce( + T* /*raking_ptr*/, ///< [in] Input array + ScanOp /*scan_op*/, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + return raking_partial; + } + + + /// Templated copy + template + __device__ __forceinline__ void CopySegment( + T* out, ///< [out] Out array + T* in, ///< [in] Input array + Int2Type /*iteration*/) + { + out[ITERATION] = in[ITERATION]; + CopySegment(out, in, Int2Type()); + } + + + /// Templated copy (base case) + __device__ __forceinline__ void CopySegment( + T* /*out*/, ///< [out] Out array + T* /*in*/, ///< [in] Input array + Int2Type /*iteration*/) + {} 
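// -----------------------------------------------------------------------------
// [Editor's note] Hedged illustration; not part of the original CUB sources.
// GuardedReduce and CopySegment above rely on Int2Type<> tag dispatch to unroll
// a fixed-length loop at compile time: each step is a separate overload, and
// the overload taking the terminal tag ends the recursion. A self-contained
// host-side sketch of the same idiom (IntTag/SumUnrolled are stand-in names):
//
//     template <int N> struct IntTag {};          // stand-in for cub::Int2Type
//
//     enum { LENGTH = 4 };                        // fixed segment length
//
//     // Base case: all LENGTH items consumed, return the accumulated partial
//     inline int SumUnrolled(const int (&)[LENGTH], int partial, IntTag<LENGTH>)
//     {
//         return partial;
//     }
//
//     // Recursive case: fold element I into the partial and advance the tag
//     template <int I>
//     inline int SumUnrolled(const int (&a)[LENGTH], int partial, IntTag<I>)
//     {
//         return SumUnrolled(a, partial + a[I], IntTag<I + 1>());
//     }
//
//     // Usage:
//     //     int data[LENGTH] = {1, 2, 3, 4};
//     //     int total = SumUnrolled(data, 0, IntTag<0>());   // total == 10
//
// Because every step is a distinct instantiation, the compiler emits
// straight-line code equivalent to a manually unrolled loop.
// -----------------------------------------------------------------------------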
+ + + /// Performs upsweep raking reduction, returning the aggregate + template + __device__ __forceinline__ T Upsweep( + ScanOp scan_op) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data into registers + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + + T raking_partial = cached_segment[0]; + + return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>()); + } + + + /// Performs exclusive downsweep raking scan + template + __device__ __forceinline__ void ExclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + /// Performs inclusive downsweep raking scan + template + __device__ __forceinline__ void InclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + exclusive_output = *placement_ptr; + } + } + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial= Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + + // Broadcast aggregate to other threads + if (linear_tid == 0) + temp_storage.block_aggregate = block_aggregate; + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + output = scan_op(block_prefix, output); + if (linear_tid == 0) + output = block_prefix; + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with exclusive warpscan partial + output = scan_op(block_prefix, output); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/specializations/block_scan_warp_scans.cuh b/dnn/src/cuda/cub/block/specializations/block_scan_warp_scans.cuh new file mode 100644 index 00000000..85e4d613 --- /dev/null +++ b/dnn/src/cuda/cub/block/specializations/block_scan_warp_scans.cuh @@ -0,0 +1,392 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScan; + + /// Shared memory storage layout type + + struct __align__(32) _TempStorage + { + T warp_aggregates[WARPS]; + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction + ScanOp /*scan_op*/, ///< [in] Binary scan operator + T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
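+        // Illustrative worked example (not part of the original source), assuming
+        // cub::Sum over int, two 32-thread warps, all inputs equal to 1, and
+        // initial_value == 0: each warp's inclusive scan yields 1..32 within the
+        // warp; ComputeWarpPrefix then seeds warp0 with prefix 0 and warp1 with
+        // prefix 32 while setting block_aggregate to 64; applying the warp prefix
+        // below produces the block-wide exclusive outputs 0..63.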
+ T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/specializations/block_scan_warp_scans2.cuh b/dnn/src/cuda/cub/block/specializations/block_scan_warp_scans2.cuh new file mode 100644 index 00000000..4de7c69b --- /dev/null +++ b/dnn/src/cuda/cub/block/specializations/block_scan_warp_scans2.cuh @@ -0,0 +1,436 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScanT; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpAggregateScanT::TempStorage inner_scan[WARPS]; ///< Buffer for warp-synchronous scans + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T warp_aggregates[WARPS]; + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ 
__forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. 
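+        // A sketch (illustrative, adapted from the usage pattern documented for the
+        // public cub::BlockScan interface; names are hypothetical) of the kind of
+        // functor expected for block_prefix_callback_op: it is entered by the first
+        // warp, and the value returned by lane0 seeds the block-wide scan.
+        //
+        //     struct RunningPrefixOp
+        //     {
+        //         int running_total;   // prefix accumulated over previous tiles
+        //
+        //         __device__ int operator()(int block_aggregate)
+        //         {
+        //             int old_prefix = running_total;
+        //             running_total += block_aggregate;   // carry prefix to the next tile
+        //             return old_prefix;                  // seed for the current tile
+        //         }
+        //     };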
+ T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
+ { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/block/specializations/block_scan_warp_scans3.cuh b/dnn/src/cuda/cub/block/specializations/block_scan_warp_scans3.cuh new file mode 100644 index 00000000..147ca4c5 --- /dev/null +++ b/dnn/src/cuda/cub/block/specializations/block_scan_warp_scans3.cuh @@ -0,0 +1,418 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
+ */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, + + /// Number of outer scan warps + OUTER_WARPS = INNER_WARP_THREADS + }; + + /// Outer WarpScan utility type + typedef WarpScan OuterWarpScanT; + + /// Inner WarpScan utility type + typedef WarpScan InnerWarpScanT; + + typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]; + + + /// Shared memory storage layout type + struct _TempStorage + { + union Aliasable + { + Uninitialized outer_warp_scan; ///< Buffer for warp-synchronous outer scans + typename InnerWarpScanT::TempStorage inner_warp_scan; ///< Buffer for warp-synchronous inner scan + + } aliasable; + + T warp_aggregates[OUTER_WARPS]; + + T block_aggregate; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS), + lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
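+        // Structure note (illustrative, not part of the original source): this
+        // specialization splits the block into OUTER_WARPS logical warps of
+        // OUTER_WARP_THREADS threads, scans each with OuterWarpScanT, and then
+        // scans the outer-warp totals with a single InnerWarpScanT.  For example,
+        // with BLOCK_THREADS == 128 and 32-thread hardware warps there are 32
+        // outer warps of 4 threads each, and their 32 aggregates are scanned by
+        // one 32-thread inner warp whose exclusive outputs become the per-warp
+        // prefixes applied below.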
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + { + temp_storage.warp_aggregates[warp_id] = inclusive_output; + } + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
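+        // Callback protocol: after the inner warp scans the segment totals, the first warp
+        // invokes block_prefix_callback_op(block_aggregate), and the value returned by
+        // lane 0 seeds the whole block's scan (it is broadcast to the other lanes).  A
+        // typical functor keeps a running total across tiles, e.g. (sketch only; the name
+        // RunningPrefix and the use of int are illustrative):
+        //
+        //   struct RunningPrefix
+        //   {
+        //       int running_total;
+        //       __device__ RunningPrefix(int initial) : running_total(initial) {}
+        //
+        //       // Called by the first warp; returns the prefix for this tile and
+        //       // advances the running total by the tile's aggregate
+        //       __device__ int operator()(int block_aggregate)
+        //       {
+        //           int old_prefix = running_total;
+        //           running_total += block_aggregate;
+        //           return old_prefix;
+        //       }
+        //   };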
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial (or assign it if partial is invalid) + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
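+        // Same two-level structure as the exclusive variants above, but each lane's own
+        // inclusive partial is already valid, so the segment prefix is simply combined in
+        // (no special-casing of lane 0), and warp 0 needs no prefix at all.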
+ OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/cub.cuh b/dnn/src/cuda/cub/cub.cuh new file mode 100644 index 00000000..3ece0f65 --- /dev/null +++ b/dnn/src/cuda/cub/cub.cuh @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * CUB umbrella include file + */ + +#pragma once + + +// Block +#include "block/block_histogram.cuh" +#include "block/block_discontinuity.cuh" +#include "block/block_exchange.cuh" +#include "block/block_load.cuh" +#include "block/block_radix_rank.cuh" +#include "block/block_radix_sort.cuh" +#include "block/block_reduce.cuh" +#include "block/block_scan.cuh" +#include "block/block_store.cuh" +//#include "block/block_shift.cuh" + +// Device +#include "device/device_histogram.cuh" +#include "device/device_partition.cuh" +#include "device/device_radix_sort.cuh" +#include "device/device_reduce.cuh" +#include "device/device_run_length_encode.cuh" +#include "device/device_scan.cuh" +#include "device/device_segmented_radix_sort.cuh" +#include "device/device_segmented_reduce.cuh" +#include "device/device_select.cuh" +#include "device/device_spmv.cuh" + +// Grid +//#include "grid/grid_barrier.cuh" +#include "grid/grid_even_share.cuh" +#include "grid/grid_mapping.cuh" +#include "grid/grid_queue.cuh" + +// Thread +#include "thread/thread_load.cuh" +#include "thread/thread_operators.cuh" +#include "thread/thread_reduce.cuh" +#include "thread/thread_scan.cuh" +#include "thread/thread_store.cuh" + +// Warp +#include "warp/warp_reduce.cuh" +#include "warp/warp_scan.cuh" + +// Iterator +#include "iterator/arg_index_input_iterator.cuh" +#include "iterator/cache_modified_input_iterator.cuh" +#include "iterator/cache_modified_output_iterator.cuh" +#include "iterator/constant_input_iterator.cuh" +#include "iterator/counting_input_iterator.cuh" +#include "iterator/tex_obj_input_iterator.cuh" +#include "iterator/tex_ref_input_iterator.cuh" +#include "iterator/transform_input_iterator.cuh" + +// Util +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_device.cuh" +#include "util_macro.cuh" +#include "util_ptx.cuh" +#include "util_type.cuh" + diff --git a/dnn/src/cuda/cub/device/device_histogram.cuh 
b/dnn/src/cuda/cub/device/device_histogram.cuh new file mode 100644 index 00000000..a2556a6b --- /dev/null +++ b/dnn/src/cuda/cub/device/device_histogram.cuh @@ -0,0 +1,866 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_histogram.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * + * \par Usage Considerations + * \cdp_class{DeviceHistogram} + * + */ +struct DeviceHistogram +{ + /******************************************************************//** + * \name Evenly-segmented bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. 
+ * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
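+    // Equal-width binning in a nutshell: every bin spans
+    //     (upper_level - lower_level) / (num_levels - 1)
+    // sample values, so a sample s with lower_level <= s < upper_level falls into bin
+    //     (int) ((s - lower_level) * (num_levels - 1) / (upper_level - lower_level)),
+    // while samples outside [lower_level, upper_level) are not counted.  (Illustrative
+    // formula only; the actual binning is performed by the dispatch layer.)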
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * size_t row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + * // (0, 6, 7, 5), (3, 0, 2, 6)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
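+    // Interleaved layout: the sample for channel c of pixel j lives at
+    // d_samples[j * NUM_CHANNELS + c], and only channels 0..NUM_ACTIVE_CHANNELS-1 are
+    // histogrammed.  This overload treats the sequence as a single row: it forwards to
+    // the row-based overload below with num_rows = 1 and
+    // row_stride_bytes = sizeof(SampleT) * NUM_CHANNELS * num_pixels.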
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramEven( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), + * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
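+    // Dispatch detail: row_stride_bytes is converted to a stride in samples
+    // (row_stride_bytes / sizeof(SampleT)) before dispatch, and when OffsetT is wider
+    // than int but the region of interest still fits in a signed 32-bit range, offsets
+    // are narrowed (the "Down-convert OffsetT data type" branch below), presumably to
+    // keep index arithmetic in 32 bits.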
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + Int2Type is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) + { + // Down-convert OffsetT data type + + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + + //@} end member group + /******************************************************************//** + * \name Custom bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of an six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. 
+ CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * int row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ , , , , , , , ] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. 
\iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. 
+ * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), + * // (0, 6, 7, 5),(3, 0, 2, 6)] + * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // d_histogram <-- [ [1, 3, 0, 1], + * // [3, 0, 0, 2], + * // [0, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramRange( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + d_levels, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), + * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] + * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [2, 3, 0, 1], + * // [3, 0, 0, 2], + * // [1, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
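+    // Custom bins: for channel c, bin i covers the half-open range
+    // [ d_levels[c][i], d_levels[c][i+1] ), so the num_levels[c] boundaries may be
+    // unevenly spaced; samples falling outside
+    // [ d_levels[c][0], d_levels[c][num_levels[c]-1] ) are not counted.  The dispatch
+    // below mirrors the even-binned overload above: strides are converted from bytes to
+    // samples and offsets are down-converted to int when the region of interest permits.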
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + Int2Type is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) + { + // Down-convert OffsetT data type + return DipatchHistogram::DispatchRange( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram::DispatchRange( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + + + //@} end member group +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/device_partition.cuh b/dnn/src/cuda/cub/device/device_partition.cuh new file mode 100644 index 00000000..50535400 --- /dev/null +++ b/dnn/src/cuda/cub/device/device_partition.cuh @@ -0,0 +1,273 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. 
![](partition_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from + * a specified input sequence. + * + * \par Usage Considerations + * \cdp_class{DevicePartition} + * + * \par Performance + * \linear_performance{partition} + * + * \par + * The following chart illustrates DevicePartition::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected for the first partition. + * \plots_below + * + * \image html partition_if_int32_50_percent.png + * + */ +struct DevicePartition +{ + /** + * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated partition-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected for the first partition with 50% probability. + * + * \image html partition_if_int32_50_percent.png + * \image html partition_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability for the first partition: + * + * \image html partition_if_int32_5_percent.png + * \image html partition_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
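+ * // Note: the calls below are presumably intended to read
+ * // cub::DevicePartition::If. The expected output reflects a partition:
+ * // items satisfying select_op come first in their original order, and the
+ * // rejected items are compacted at the rear of d_out in reverse order.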
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_partition_flagged.cu + * \example example_device_partition_if.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/device_radix_sort.cuh b/dnn/src/cuda/cub/device/device_radix_sort.cuh new file mode 100644 index 00000000..1c0bdbea --- /dev/null +++ b/dnn/src/cuda/cub/device/device_radix_sort.cuh @@ -0,0 +1,797 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) + * \ingroup SingleModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * + * \par Usage Considerations + * \cdp_class{DeviceRadixSort} + * + * \par Performance + * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys + * performance across different CUDA architectures for uniform-random \p uint32 keys. 
+ * \plots_below + * + * \image html lsb_radix_sort_int32_keys.png + * + */ +struct DeviceRadixSort +{ + + /******************************************************************//** + * \name KeyT-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. 
Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... 
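+ * // After the sort completes, d_keys_out is ordered descending and
+ * // d_values_out[i] holds the value originally paired with d_keys_out[i].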
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
+ * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts keys into ascending order. 
(~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). 
+ * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts keys into descending order. (~2N auxiliary storage required). 
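+ *
+ * \par
+ * For reference, a minimal sketch of this pointer-based overload (no
+ * DoubleBuffer is needed) that restricts comparison to an optional bit
+ * subrange, assuming the umbrella header <cub/cub.cuh>:
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // Sort 32-bit keys into descending order, comparing only their low 16 bits
+ * // (the keys are assumed to agree on the upper bits, so fewer radix passes
+ * // are needed).
+ * void sort_low_bits_descending(const unsigned int *d_keys_in,
+ *                               unsigned int *d_keys_out,
+ *                               int num_items)
+ * {
+ *     void   *d_temp_storage = NULL;
+ *     size_t  temp_storage_bytes = 0;
+ *
+ *     // Query the required temporary storage, then allocate and sort
+ *     cub::DeviceRadixSort::SortKeysDescending(
+ *         d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ *         num_items, 0, 16);
+ *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *     cub::DeviceRadixSort::SortKeysDescending(
+ *         d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ *         num_items, 0, 16);
+ *     cudaFree(d_temp_storage);
+ * }
+ * \endcode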
+ * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]s + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. 
+ * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
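+    // Note: in contrast to the pointer-based overload above, the call below
+    // passes 'true' to DispatchRadixSort::Dispatch, indicating that the
+    // caller's double-buffer storage may be overwritten; this is what permits
+    // the reduced (~N) auxiliary storage footprint.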
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +/** + * \example example_device_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/device_reduce.cuh b/dnn/src/cuda/cub/device/device_reduce.cuh new file mode 100644 index 00000000..13c7a72d --- /dev/null +++ b/dnn/src/cuda/cub/device/device_reduce.cuh @@ -0,0 +1,734 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
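+ *
+ * \par
+ * Every entry point in this class follows the same two-phase usage pattern:
+ * a first call made with a NULL \p d_temp_storage writes the required
+ * temporary-storage size to \p temp_storage_bytes and does no work, and a
+ * second call made with an allocation of that size performs the reduction.
+ * A minimal sketch (assuming the umbrella header <cub/cub.cuh> and
+ * pre-allocated device buffers) might look like:
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * void sum_ints(const int *d_in, int *d_out, int num_items)
+ * {
+ *     void   *d_temp_storage = NULL;
+ *     size_t  temp_storage_bytes = 0;
+ *
+ *     // Phase 1: query the amount of temporary device storage required
+ *     cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ *     // Phase 2: allocate the temporary storage and run the reduction
+ *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *     cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *     cudaFree(d_temp_storage);
+ * }
+ * \endcode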
+ * + * \par Usage Considerations + * \cdp_class{DeviceReduce} + * + * \par Performance + * \linear_performance{reduction, reduce-by-key, and run-length encode} + * + * \par + * The following chart illustrates DeviceReduce::Sum + * performance across different CUDA architectures for \p int32 keys. + * + * \image html reduce_int32.png + * + * \par + * The following chart illustrates DeviceReduce::ReduceByKey (summation) + * performance across different CUDA architectures for \p fp32 + * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceReduce +{ + /** + * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * __device__ __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * CustomMin min_op; + * int init; // e.g., INT_MAX + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ReductionOpT, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + T init, ///< [in] Initial value of the reduction + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + init, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide sum using the addition (\p +) operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction. + * - Does not support \p + operators that are non-commutative.. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sum-reduction performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. + * + * \image html reduce_int32.png + * \image html reduce_int64.png + * + * \par Snippet + * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide minimum using the less-than ('<') operator. + * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction. + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item. + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // d_out <-- [{5, 0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // d_out <-- [9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // d_out <-- [{6, 9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. + * + * \par + * This operation computes segmented reductions within \p d_values_in using + * the specified binary \p reduction_op functor. The segments are identified by + * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of + * consecutive, identical keys. For the ith run encountered, + * the first key of the run and the corresponding value aggregate of that run are + * written to d_unique_out[i] and d_aggregates_out[i], + * respectively. The total number of runs encountered is written to \p d_num_runs_out. 
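+ *
+ * \par
+ * In addition to the min-reduction snippet shown further below, the following
+ * sketch computes per-run sums by passing cub::Sum() as the reduction functor.
+ * It is illustrative only: the device arrays (d_keys_in, d_values_in,
+ * d_unique_out, d_aggregates_out, d_num_runs_out) are assumed to be allocated
+ * and initialized elsewhere, and error checking is omitted.
+ * \code
+ * #include <cub/device/device_reduce.cuh>   // or equivalently <cub/cub.cuh>
+ *
+ * // Device-resident data (assumed allocated and initialized elsewhere)
+ * int num_items;          // e.g., 8
+ * int *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+ * int *d_values_in;       // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
+ * int *d_unique_out;      // one key per run
+ * int *d_aggregates_out;  // one sum per run
+ * int *d_num_runs_out;    // single counter
+ *
+ * // First call: query the required temporary storage size (no work is done)
+ * void   *d_temp_storage     = NULL;
+ * size_t  temp_storage_bytes = 0;
+ * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
+ *     d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
+ *     cub::Sum(), num_items);
+ *
+ * // Allocate temporary storage, then run the reduction
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
+ *     d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
+ *     cub::Sum(), num_items);
+ *
+ * // d_unique_out     <-- [0, 2, 9, 5, 8]
+ * // d_aggregates_out <-- [0, 8, 6, 10, 4]
+ * // d_num_runs_out   <-- [5]
+ * \endcode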
+ * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following chart illustrates reduction-by-key (sum) performance across + * different CUDA architectures for \p fp32 and \p fp64 values, respectively. Segments + * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * \image html reduce_by_key_fp64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html reduce_by_key_fp32_len_5.png + * \image html reduce_by_key_fp64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the segmented reduction of \p int values grouped + * by runs of associated \p int keys. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] + * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_num_runs_out; // e.g., [-] + * CustomMin reduction_op; + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduce-by-key + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_aggregates_out <-- [0, 1, 6, 2, 4] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator + * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator + * \tparam AggregatesOutputIterator [inferred] Random-access output iterator type for writing output value aggregates \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + typename KeysInputIteratorT, + typename UniqueOutputIteratorT, + typename ValuesInputIteratorT, + typename AggregatesOutputIteratorT, + typename NumRunsOutputIteratorT, + typename ReductionOpT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t ReduceByKey( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // FlagT iterator type (not used) + + // Selection op (not used) + + // Default == operator + typedef Equality EqualityOp; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + EqualityOp(), + reduction_op, + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/device_run_length_encode.cuh b/dnn/src/cuda/cub/device/device_run_length_encode.cuh new file mode 100644 index 00000000..7a2e82d9 --- /dev/null +++ b/dnn/src/cuda/cub/device/device_run_length_encode.cuh @@ -0,0 +1,278 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_rle.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. 
![](run_length_encode_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A run-length encoding + * computes a simple compressed representation of a sequence of input elements such that each + * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a + * count of the elements in that run. + * + * \par Usage Considerations + * \cdp_class{DeviceRunLengthEncode} + * + * \par Performance + * \linear_performance{run-length encode} + * + * \par + * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across + * different CUDA architectures for \p int32 items. + * Segments have lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceRunLengthEncode +{ + + /** + * \brief Computes a run-length encoding of the sequence \p d_in. + * + * \par + * - For the ith run encountered, the first key of the run and its length are written to + * d_unique_out[i] and d_counts_out[i], + * respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated encode performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * \image html rle_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html rle_int32_len_5.png + * \image html rle_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the run-length encoding of a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_unique_out; // e.g., [ , , , , , , , ] + * int *d_counts_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_counts_out <-- [1, 2, 1, 3, 1] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output items \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing output counts \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename UniqueOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Encode( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + typedef cub::Sum ReductionOp; // Value reduction operator + + // The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + // Generator type for providing 1s values for run-length reduction + typedef ConstantInputIterator LengthsInputIteratorT; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + LengthsInputIteratorT((LengthT) 1), + d_counts_out, + d_num_runs_out, + EqualityOp(), + ReductionOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in. + * + * \par + * - For the ith non-trivial run, the run's starting offset + * and its length are written to d_offsets_out[i] and + * d_lengths_out[i], respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * + * \par Snippet + * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_offsets_out; // e.g., [ , , , , , , , ] + * int *d_lengths_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // d_offsets_out <-- [1, 4] + * // d_lengths_out <-- [2, 3] + * // d_num_runs_out <-- [2] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OffsetsOutputIteratorT [inferred] Random-access output iterator type for writing run-offset values \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing run-length values \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename OffsetsOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t NonTrivialRuns( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run) + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef Equality EqualityOp; // Default == operator + + return DeviceRleDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/device_scan.cuh b/dnn/src/cuda/cub/device/device_scan.cuh new file mode 100644 index 00000000..e86fefe3 --- /dev/null +++ b/dnn/src/cuda/cub/device/device_scan.cuh @@ -0,0 +1,443 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_scan.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png) + * \ingroup SingleModule + * + * \par Overview + * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output sequence where each element is computed to be the reduction + * of the elements occurring earlier in the input sequence. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * + * \par + * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm + * for performing global prefix scan with only a single pass through the + * input data, as described in our 2016 technical report [1]. The central + * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies + * of global prefix propagation with local computation. As such, our algorithm requires only + * ~2n data movement (n inputs are read, n outputs are written), and typically + * proceeds at "memcpy" speeds. + * + * \par + * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) + * + * \par Usage Considerations + * \cdp_class{DeviceScan} + * + * \par Performance + * \linear_performance{prefix scan} + * + * \par + * The following chart illustrates DeviceScan::ExclusiveSum + * performance across different CUDA architectures for \p int32 keys. + * \plots_below + * + * \image html scan_int32.png + * + */ +struct DeviceScan +{ + /******************************************************************//** + * \name Exclusive scans + *********************************************************************/ + //@{ + + /** + * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative sum operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated exclusive sum performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. 
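+ *
+ * \par
+ * For reference, a complete host-side sketch of the two-phase calling convention
+ * (query the temporary-storage size with d_temp_storage == NULL, allocate, then
+ * run) is given below. It is illustrative only: the wrapper function name is
+ * hypothetical, allocations are unchecked, and real code should test every CUDA
+ * return value.
+ * \code
+ * #include <cub/device/device_scan.cuh>   // or equivalently <cub/cub.cuh>
+ *
+ * // Exclusive prefix sum over h_in, returning the result in h_out (host memory)
+ * void ExclusiveSumExample(const int *h_in, int *h_out, int num_items)
+ * {
+ *     int *d_in = NULL, *d_out = NULL;
+ *     cudaMalloc(&d_in,  num_items * sizeof(int));
+ *     cudaMalloc(&d_out, num_items * sizeof(int));
+ *     cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);
+ *
+ *     // Phase 1: size query (d_temp_storage == NULL, no work is done)
+ *     void   *d_temp_storage     = NULL;
+ *     size_t  temp_storage_bytes = 0;
+ *     cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ *     // Phase 2: allocate temporary storage and run the scan
+ *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *     cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ *     cudaMemcpy(h_out, d_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
+ *     cudaFree(d_temp_storage);  cudaFree(d_out);  cudaFree(d_in);
+ * }
+ *
+ * // e.g., h_in = [8, 6, 7, 5, 3, 0, 9]  -->  h_out = [0, 8, 14, 21, 26, 29, 29]
+ * \endcode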
+ * + * \image html scan_int32.png + * \image html scan_int64.png + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out s<-- [0, 8, 14, 21, 26, 29, 29] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveSum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Initial value + OutputT init_value = 0; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + init_value, + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value value is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. 
+ * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op + * ... + * + * // Determine temporary device storage requirements for exclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam Identity [inferred] Type of the \p identity value used Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename InitValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide inclusive prefix sum. + * + * \par + * - Supports non-commutative sum operators. 
+ * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements for inclusive prefix sum + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [8, 14, 21, 26, 29, 29, 38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveSum( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. 
+ * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... + * + * // Determine temporary device storage requirements for inclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // Allocate temporary storage for inclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix min-scan + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // d_out <-- [8, 6, 6, 5, 3, 0, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + + //@} end member group + +}; + +/** + * \example example_device_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/device_segmented_radix_sort.cuh b/dnn/src/cuda/cub/device/device_segmented_radix_sort.cuh new file mode 100644 index 00000000..0d360762 --- /dev/null +++ b/dnn/src/cuda/cub/device/device_segmented_radix_sort.cuh @@ -0,0 +1,876 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. 
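+ *
+ * \par
+ * As a brief illustration of the segmented interface, the sketch below sorts
+ * three segments of \p int keys with associated \p int values, aliasing a single
+ * offsets array of length num_segments+1 for both the begin- and end-offset
+ * parameters. It is illustrative only: the device arrays are assumed to be
+ * allocated and initialized elsewhere, and error checking is omitted.
+ * \code
+ * #include <cub/device/device_segmented_radix_sort.cuh>   // or equivalently <cub/cub.cuh>
+ *
+ * // Device-resident data (assumed allocated and initialized elsewhere)
+ * int num_items;      // e.g., 7
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]  (segment i spans [d_offsets[i], d_offsets[i+1]))
+ * int *d_keys_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_keys_out;
+ * int *d_values_in;   // e.g., [0, 1, 2, 3, 4, 5, 6]
+ * int *d_values_out;
+ *
+ * // Query temporary storage requirements, allocate, then sort
+ * void   *d_temp_storage     = NULL;
+ * size_t  temp_storage_bytes = 0;
+ * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+ *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+ *     num_items, num_segments, d_offsets, d_offsets + 1);
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+ *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+ *     num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_keys_out   <-- [6, 7, 8, 0, 3, 5, 9]
+ * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
+ *
+ * // Optionally, a bit subrange may be supplied (e.g., begin_bit = 0, end_bit = 16)
+ * // when all differentiating key bits are known to lie in that range, which can
+ * // reduce the number of radix passes.
+ * \endcode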
+ * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedRadixSort} + * + */ +struct DeviceSegmentedRadixSort +{ + + /******************************************************************//** + * \name Key-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
+ * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
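The SortPairsDescending overload above uses CUB's two-phase protocol: a first call with d_temp_storage == NULL only writes the required byte count into temp_storage_bytes, and a second call with the allocated buffer performs the sort. Below is a self-contained host-side sketch of that protocol using the same data as the snippet above; it assumes the umbrella header <cub/cub.cuh> is on the include path, and names and cleanup are illustrative only.

#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    // Three segments over seven items, one segment empty (same data as the snippet).
    const int num_items    = 7;
    const int num_segments = 3;
    int h_offsets[]   = {0, 3, 3, 7};
    int h_keys_in[]   = {8, 6, 7, 5, 3, 0, 9};
    int h_values_in[] = {0, 1, 2, 3, 4, 5, 6};

    int *d_offsets, *d_keys_in, *d_keys_out, *d_values_in, *d_values_out;
    cudaMalloc(&d_offsets,    sizeof(h_offsets));
    cudaMalloc(&d_keys_in,    sizeof(h_keys_in));
    cudaMalloc(&d_keys_out,   sizeof(h_keys_in));
    cudaMalloc(&d_values_in,  sizeof(h_values_in));
    cudaMalloc(&d_values_out, sizeof(h_values_in));
    cudaMemcpy(d_offsets,   h_offsets,   sizeof(h_offsets),   cudaMemcpyHostToDevice);
    cudaMemcpy(d_keys_in,   h_keys_in,   sizeof(h_keys_in),   cudaMemcpyHostToDevice);
    cudaMemcpy(d_values_in, h_values_in, sizeof(h_values_in), cudaMemcpyHostToDevice);

    // Phase 1: NULL temp storage -> only the required size is written, no work is done.
    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedRadixSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);

    // Phase 2: allocate the temporary storage and run the sort proper.
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedRadixSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);

    // Expected: d_keys_out   == [8, 7, 6,  9, 5, 3, 0]
    //           d_values_out == [0, 2, 1,  6, 3, 4, 5]
    int h_keys_out[7];
    cudaMemcpy(h_keys_out, d_keys_out, sizeof(h_keys_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_items; ++i) printf("%d ", h_keys_out[i]);
    printf("\n");                       // device frees omitted for brevity
    return 0;
}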
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... 
+ * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
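This DoubleBuffer overload trades the separate output arrays for a pair of ping-pong buffers; after the call, Current() on each DoubleBuffer names whichever buffer of the pair now holds the sorted data, and both buffers of each pair may have been overwritten. A fragment-level sketch with the template arguments written out, reusing the buffer names declared in the snippet above:

// Each DoubleBuffer wraps two device allocations of num_items elements.
cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);

void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceSegmentedRadixSort::SortPairsDescending(
    d_temp_storage, temp_storage_bytes, d_keys, d_values,
    num_items, num_segments, d_offsets, d_offsets + 1);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceSegmentedRadixSort::SortPairsDescending(
    d_temp_storage, temp_storage_bytes, d_keys, d_values,
    num_items, num_segments, d_offsets, d_offsets + 1);

// Current() now points at the buffer of each pair holding the sorted output.
int *sorted_keys   = d_keys.Current();    // [8, 7, 6, 9, 5, 3, 0]
int *sorted_values = d_values.Current();  // [0, 2, 1, 6, 3, 4, 5]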
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts segments of keys into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
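Keys-only sorting drops the value arguments but keeps the same calling convention, and every overload accepts the optional begin_bit/end_bit pair that restricts comparison to a subrange of key bits. The sketch below reuses the pointers from the SortKeys snippet above; limiting the range to the low 16 bits is a hypothetical assumption that the keys differ only in those bits:

// Two-phase pattern: identical arguments for the sizing call and the sort.
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceSegmentedRadixSort::SortKeys(
    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
    num_items, num_segments, d_offsets, d_offsets + 1,
    /*begin_bit=*/0, /*end_bit=*/16);
cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Only bits [0, 16) of each key take part in the radix passes; higher bits
// are ignored, so this is valid only when keys differ solely in those bits.
cub::DeviceSegmentedRadixSort::SortKeys(
    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
    num_items, num_segments, d_offsets, d_offsets + 1,
    /*begin_bit=*/0, /*end_bit=*/16);
// d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] for the snippet's data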
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts segments of keys into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
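Every entry point also takes an optional CUDA stream (default stream 0) and a debug_synchronous flag that synchronizes after each kernel launch and prints launch configurations to the console. A sketch of passing both explicitly for the DoubleBuffer keys-only descending sort documented here (stream handling is illustrative; d_keys is the cub::DoubleBuffer<int> from the snippet above):

cudaStream_t stream;
cudaStreamCreate(&stream);

cudaError_t err = cub::DeviceSegmentedRadixSort::SortKeysDescending(
    d_temp_storage, temp_storage_bytes, d_keys,
    num_items, num_segments, d_offsets, d_offsets + 1,
    0, sizeof(int) * 8,                  // full key-bit range (the defaults)
    stream, /*debug_synchronous=*/true);
if (err != cudaSuccess)
    printf("segmented sort failed: %s\n", cudaGetErrorString(err));

cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);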
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/device_segmented_reduce.cuh b/dnn/src/cuda/cub/device/device_segmented_reduce.cuh new file mode 100644 index 00000000..6c3b54a0 --- /dev/null +++ b/dnn/src/cuda/cub/device/device_segmented_reduce.cuh @@ -0,0 +1,619 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
+ * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedReduce} + * + */ +struct DeviceSegmentedReduce +{ + /** + * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * CustomMin min_op; + * int initial_value; // e.g., INT_MAX + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT, + typename ReductionOp, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
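DeviceSegmentedReduce::Reduce pairs an arbitrary commutative binary functor with a per-segment initial value, so the min-reduction in the snippet above reduces to the fragment below; CustomMin, the data, and INT_MAX as the initial value are taken from that snippet, and the empty segment yields the initial value. Sum (documented further down) is the same call shape with cub::Sum() and 0 fixed, giving [21, 0, 17] for this data.

// Commutative binary functor from the snippet; the reduction identity is
// supplied separately as the per-segment initial value (INT_MAX for a min).
struct CustomMin
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
    T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
};

void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceSegmentedReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out,
    num_segments, d_offsets, d_offsets + 1, CustomMin(), INT_MAX);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceSegmentedReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out,
    num_segments, d_offsets, d_offsets + 1, CustomMin(), INT_MAX);
// d_out <-- [6, INT_MAX, 0]; the empty segment comes back as the initial value.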
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOp reduction_op, ///< [in] Binary reduction functor + T initial_value, ///< [in] Initial value of the reduction for each segment + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + reduction_op, + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented sum using the addition ('+') operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction for each segment. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p + operators that are non-commutative.. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the sum reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [21, 0, 17] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented minimum using the less-than ('<') operator. + * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction for each segment. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). 
+ * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. 
+ * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [8, INT_MIN, 9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
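For ArgMin (and ArgMax below) the output element type is cub::KeyValuePair: its .key member holds the item's offset within its segment and its .value member holds the extremum, with a {1, INT_MAX} pair produced for the empty int segment, as in the ArgMin snippet above. A fragment-level sketch of copying those results back and reading them (num_segments and d_out are as declared in that snippet):

// Output element type for int inputs with signed int offsets.
typedef cub::KeyValuePair<int, int> ArgResult;

ArgResult h_out[3];                       // one result per segment
cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);

// Per the snippet: [{1,6}, {1,INT_MAX}, {2,0}]
for (int i = 0; i < num_segments; ++i)
    printf("segment %d: min %d at in-segment offset %d\n",
           i, h_out[i].value, h_out[i].key);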
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... 
else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/device_select.cuh b/dnn/src/cuda/cub/device/device_select.cuh new file mode 100644 index 00000000..52a3e126 --- /dev/null +++ b/dnn/src/cuda/cub/device/device_select.cuh @@ -0,0 +1,369 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to selectively copy + * items from a specified input sequence to a compact output sequence. 
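 *
 * \par
 * (Sketch added for reference: the \code snippets in this header lost their
 * angle-bracketed #include targets and template arguments; the self-contained
 * example below assumes cub/cub.cuh and omits error checking. Every
 * DeviceSelect entry point follows the same two passes: a size query with a
 * NULL \p d_temp_storage, then the actual run.)
 * \code
 * #include <cub/cub.cuh>       // cub::DeviceSelect
 * #include <cuda_runtime.h>
 *
 * void flagged_sketch()
 * {
 *     int  num_items  = 8;
 *     int  h_in[8]    = {1, 2, 3, 4, 5, 6, 7, 8};
 *     char h_flags[8] = {1, 0, 0, 1, 0, 1, 1, 0};
 *
 *     int  *d_in, *d_out, *d_num_selected_out;
 *     char *d_flags;
 *     cudaMalloc((void**)&d_in,    sizeof(h_in));
 *     cudaMalloc((void**)&d_flags, sizeof(h_flags));
 *     cudaMalloc((void**)&d_out,   sizeof(h_in));
 *     cudaMalloc((void**)&d_num_selected_out, sizeof(int));
 *     cudaMemcpy(d_in,    h_in,    sizeof(h_in),    cudaMemcpyHostToDevice);
 *     cudaMemcpy(d_flags, h_flags, sizeof(h_flags), cudaMemcpyHostToDevice);
 *
 *     // Pass 1: query the required temporary storage size
 *     void   *d_temp_storage     = NULL;
 *     size_t  temp_storage_bytes = 0;
 *     cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
 *         d_in, d_flags, d_out, d_num_selected_out, num_items);
 *
 *     // Pass 2: allocate the temporary storage and run the selection
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *     cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
 *         d_in, d_flags, d_out, d_num_selected_out, num_items);
 *
 *     // d_out now holds [1, 4, 6, 7]; d_num_selected_out holds [4]
 * }
 * \endcode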
+ * + * \par Usage Considerations + * \cdp_class{DeviceSelect} + * + * \par Performance + * \linear_performance{select-flagged, select-if, and select-unique} + * + * \par + * The following chart illustrates DeviceSelect::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected. + * + * \image html select_if_int32_50_percent.png + * + * \par + * The following chart illustrates DeviceSelect::Unique + * performance across different CUDA architectures for \p int32 items + * where segments have lengths uniformly sampled from [1,1000]. + * + * \image html select_unique_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceSelect +{ + /** + * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected with 50% probability. + * + * \image html select_if_int32_50_percent.png + * \image html select_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability: + * + * \image html select_if_int32_5_percent.png + * \image html select_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-unique performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. 
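 *
 * \par
 * (Sketch added as a compilable counterpart to the DeviceSelect::If snippet
 * above, whose #include target was lost; it also shows reading the selected
 * count back to the host. Assumes cub/cub.cuh.)
 * \code
 * #include <cub/cub.cuh>       // cub::DeviceSelect, CUB_RUNTIME_FUNCTION
 * #include <cuda_runtime.h>
 *
 * // Functor selecting values strictly less than a threshold
 * struct LessThan
 * {
 *     int compare;
 *     CUB_RUNTIME_FUNCTION __forceinline__ LessThan(int compare) : compare(compare) {}
 *     CUB_RUNTIME_FUNCTION __forceinline__ bool operator()(const int &a) const { return a < compare; }
 * };
 *
 * // d_in, d_out and d_num_selected_out are device allocations of num_items,
 * // num_items and 1 ints respectively
 * int select_if_sketch(int *d_in, int *d_out, int *d_num_selected_out, int num_items)
 * {
 *     LessThan select_op(7);                       // keep items < 7
 *
 *     void   *d_temp_storage     = NULL;
 *     size_t  temp_storage_bytes = 0;
 *     cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
 *         d_in, d_out, d_num_selected_out, num_items, select_op);   // size query
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *     cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
 *         d_in, d_out, d_num_selected_out, num_items, select_op);   // selection
 *
 *     // Retrieve the number of selected items
 *     int h_num_selected = 0;
 *     cudaMemcpy(&h_num_selected, d_num_selected_out, sizeof(int), cudaMemcpyDeviceToHost);
 *     cudaFree(d_temp_storage);
 *     return h_num_selected;   // e.g., 5 for d_in = [0, 2, 3, 9, 5, 2, 81, 8]
 * }
 * \endcode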
+ * + * \image html select_unique_int32_len_500.png + * \image html select_unique_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html select_unique_int32_len_5.png + * \image html select_unique_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [0, 2, 9, 5, 8] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Unique( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_select_flagged.cu + * \example example_device_select_if.cu + * \example example_device_select_unique.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/device_spmv.cuh b/dnn/src/cuda/cub/device/device_spmv.cuh new file mode 100644 index 00000000..63b6a7e8 --- /dev/null +++ b/dnn/src/cuda/cub/device/device_spmv.cuh @@ -0,0 +1,174 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_spmv_orig.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). 
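 *
 * \par
 * (Sketch added for reference, assuming cub/cub.cuh. Note that the CsrMV
 * snippet below passes \p alpha and \p beta, but the overload defined in this
 * header takes neither: it fixes alpha = 1 and beta = 0, i.e. it computes
 * y = A*x.)
 * \code
 * #include <cub/cub.cuh>       // cub::DeviceSpmv
 * #include <cuda_runtime.h>
 *
 * // d_values / d_row_offsets / d_column_indices describe A in CSR form;
 * // d_vector_x and d_vector_y are dense device vectors.
 * void csrmv_sketch(float *d_values, int *d_row_offsets, int *d_column_indices,
 *                   float *d_vector_x, float *d_vector_y,
 *                   int num_rows, int num_cols, int num_nonzeros)
 * {
 *     void   *d_temp_storage     = NULL;
 *     size_t  temp_storage_bytes = 0;
 *
 *     // Pass 1: size query
 *     cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
 *         d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
 *         num_rows, num_cols, num_nonzeros);
 *
 *     // Pass 2: allocate temporary storage and run y = A*x
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *     cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
 *         d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
 *         num_rows, num_cols, num_nonzeros);
 *
 *     cudaFree(d_temp_storage);
 * }
 * \endcode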
+ * \ingroup SingleModule + * + * \par Overview + * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) + * performs the matrix-vector operation + * y = alpha*A*x + beta*y, + * where: + * - A is an mxn sparse matrix whose non-zero structure is specified in + * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) + * (i.e., three arrays: values, row_offsets, and column_indices) + * - x and y are dense vectors + * - alpha and beta are scalar multiplicands + * + * \par Usage Considerations + * \cdp_class{DeviceSpmv} + * + */ +struct DeviceSpmv +{ + /******************************************************************//** + * \name CSR matrix operations + *********************************************************************/ + //@{ + + /** + * \brief This function performs the matrix-vector operation y = A*x. + * + * \par Snippet + * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A + * representing a 3x3 lattice (24 non-zeros). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, + * // and output vector y + * int num_rows = 9; + * int num_cols = 9; + * int num_nonzeros = 24; + * + * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1] + * + * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, + * // 4, 6, 1, 3, 5, 7, 2, 4, + * // 8, 3, 7, 4, 6, 8, 5, 7] + * + * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] + * + * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] + * float* d_vector_y; // e.g., [ , , , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run SpMV + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] + * + * \endcode + * + * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) + */ + template < + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t CsrMV( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) + int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
+ ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows, ///< [in] number of rows of matrix A. + int num_cols, ///< [in] number of columns of matrix A. + int num_nonzeros, ///< [in] number of nonzero elements of matrix A. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + SpmvParams spmv_params; + spmv_params.d_values = d_values; + spmv_params.d_row_end_offsets = d_row_offsets + 1; + spmv_params.d_column_indices = d_column_indices; + spmv_params.d_vector_x = d_vector_x; + spmv_params.d_vector_y = d_vector_y; + spmv_params.num_rows = num_rows; + spmv_params.num_cols = num_cols; + spmv_params.num_nonzeros = num_nonzeros; + spmv_params.alpha = 1.0; + spmv_params.beta = 0.0; + + return DispatchSpmv::Dispatch( + d_temp_storage, + temp_storage_bytes, + spmv_params, + stream, + debug_synchronous); + } + + //@} end member group +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/dispatch/dispatch_histogram.cuh b/dnn/src/cuda/cub/device/dispatch/dispatch_histogram.cuh new file mode 100644 index 00000000..ab08e8ed --- /dev/null +++ b/dnn/src/cuda/cub/device/dispatch/dispatch_histogram.cuh @@ -0,0 +1,1096 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../../agent/agent_histogram.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Histogram kernel entry points + *****************************************************************************/ + +/** + * Histogram initialization kernel entry point + */ +template < + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename OffsetT> ///< Signed integer type for global offsets +__global__ void DeviceHistogramInitKernel( + ArrayWrapper num_output_bins_wrapper, ///< Number of output histogram bins per channel + ArrayWrapper d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]] + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + if ((threadIdx.x == 0) && (blockIdx.x == 0)) + tile_queue.ResetDrain(); + + int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; + + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + if (output_bin < num_output_bins_wrapper.array[CHANNEL]) + d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0; + } +} + + +/** + * Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block. + */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256) + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< The input iterator type. \iterator. 
+ typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) +__global__ void DeviceHistogramSweepKernel( + SampleIteratorT d_samples, ///< Input data to reduce + ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram + ArrayWrapper num_privatized_bins_wrapper, ///< The number bins per privatized histogram + ArrayWrapper d_output_histograms_wrapper, ///< Reference to final output histograms + ArrayWrapper d_privatized_histograms_wrapper, ///< Reference to privatized histograms + ArrayWrapper output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + ArrayWrapper privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for compositing input tiles + typedef AgentHistogram< + AgentHistogramPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT> + AgentHistogramT; + + // Shared memory for AgentHistogram + __shared__ typename AgentHistogramT::TempStorage temp_storage; + + AgentHistogramT agent( + temp_storage, + d_samples, + num_output_bins_wrapper.array, + num_privatized_bins_wrapper.array, + d_output_histograms_wrapper.array, + d_privatized_histograms_wrapper.array, + output_decode_op_wrapper.array, + privatized_decode_op_wrapper.array); + + // Initialize counters + agent.InitBinCounters(); + + // Consume input tiles + agent.ConsumeTiles( + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Store output to global (if necessary) + agent.StoreOutput(); + +} + + + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram + */ +template < + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename LevelT, ///< Type for specifying bin level boundaries + typename OffsetT> ///< Signed integer type for global offsets +struct DipatchHistogram +{ + 
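    /*
     * Sketch added for reference: this dispatcher backs the public
     * cub::DeviceHistogram front end (device_histogram.cuh, not shown in this
     * diff hunk). It converts samples to bin-ids with the transform functors
     * defined below, selects a tuning policy for the target PTX architecture,
     * and launches the init and sweep kernels above. Assuming <cub/cub.cuh>,
     * a single-channel 256-bin byte histogram is typically driven like this:
     *
     *     unsigned char *d_samples;            // num_samples device bytes
     *     int *d_histogram;                    // 256 device counters
     *     int num_samples = ...;               // e.g., width * height
     *     int num_levels  = 257;               // num_bins + 1 boundaries
     *     void *d_temp_storage = NULL; size_t temp_storage_bytes = 0;
     *     cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
     *         d_samples, d_histogram, num_levels, 0.0f, 256.0f, num_samples);
     *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *     cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
     *         d_samples, d_histogram, num_levels, 0.0f, 256.0f, num_samples);
     */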
//--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + enum + { + // Maximum number of bins per channel for which we will use a privatized smem strategy + MAX_PRIVATIZED_SMEM_BINS = 256 + }; + + + //--------------------------------------------------------------------- + // Transform functors for converting samples to bin-ids + //--------------------------------------------------------------------- + + // Searches for bin given a list of bin-boundary levels + template + struct SearchTransform + { + LevelIteratorT d_levels; // Pointer to levels array + int num_output_levels; // Number of levels in array + + // Initializer + __host__ __device__ __forceinline__ void Init( + LevelIteratorT d_levels, // Pointer to levels array + int num_output_levels) // Number of levels in array + { + this->d_levels = d_levels; + this->num_output_levels = num_output_levels; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + /// Level iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + LevelIteratorT>::Type // Directly use the supplied input iterator type + WrappedLevelIteratorT; + + WrappedLevelIteratorT wrapped_levels(d_levels); + + int num_bins = num_output_levels - 1; + if (valid) + { + bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1; + if (bin >= num_bins) + bin = -1; + } + } + }; + + + // Scales samples to evenly-spaced bins + struct ScaleTransform + { + int num_bins; // Number of levels in array + LevelT max; // Max sample level (exclusive) + LevelT min; // Min sample level (inclusive) + LevelT scale; // Bin scaling factor + + // Initializer + template + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + _LevelT max, // Max sample level (exclusive) + _LevelT min, // Min sample level (inclusive) + _LevelT scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = scale; + } + + // Initializer (float specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + float max, // Max sample level (exclusive) + float min, // Min sample level (inclusive) + float scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = float(1.0) / scale; + } + + // Initializer (double specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + double max, // Max sample level (exclusive) + double min, // Min sample level (inclusive) + double scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = double(1.0) / scale; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) / scale); + } + + // Method for converting samples to bin-ids (float specialization) + 
template + __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + + // Method for converting samples to bin-ids (double specialization) + template + __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + }; + + + // Pass-through bin transform operator + struct PassThruTransform + { + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + if (valid) + bin = (int) sample; + } + }; + + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + template + struct TScale + { + enum + { + V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), + VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) + }; + }; + + + /// SM11 + struct Policy110 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + (NUM_CHANNELS == 1) ? 256 : 128, + (NUM_CHANNELS == 1) ? 8 : 3, + (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 
8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM35 + struct Policy350 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 128, + TScale<8>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLEND, + true> + HistogramSweepPolicy; + }; + + /// SM50 + struct Policy500 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 384, + TScale<16>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int ptx_version, + KernelConfig &histogram_sweep_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + return histogram_sweep_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 500) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 350) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 300) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 200) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 110) + { + return histogram_sweep_config.template Init(); + } + else + { + // No global atomic support + return cudaErrorNotSupported; + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int pixels_per_thread; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init() + { + block_threads = BlockPolicy::BLOCK_THREADS; + pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD; + + return cudaSuccess; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Privatization-based dispatch routine + */ + template < + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel + typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel + CUB_RUNTIME_FUNCTION 
__forceinline__ + static cudaError_t PrivatizedDispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int max_num_output_bins, ///< [in] Maximum number of output bins in any channel + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel + DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel + KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + #ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + + #else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get SM occupancy for histogram_sweep_kernel + int histogram_sweep_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + histogram_sweep_sm_occupancy, + histogram_sweep_kernel, + histogram_sweep_config.block_threads))) break; + + // Get device occupancy for histogram_sweep_kernel + int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count; + + if (num_row_pixels * NUM_CHANNELS == row_stride_samples) + { + // Treat as a single linear array of samples + num_row_pixels *= num_rows; + num_rows = 1; + row_stride_samples = num_row_pixels * NUM_CHANNELS; + } + + // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy + int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread; + int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile; + int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); + int blocks_per_col = (blocks_per_row > 0) ? + int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : + 0; + int num_thread_blocks = blocks_per_row * blocks_per_col; + + dim3 sweep_grid_dims; + sweep_grid_dims.x = (unsigned int) blocks_per_row; + sweep_grid_dims.y = (unsigned int) blocks_per_col; + sweep_grid_dims.z = 1; + + // Temporary storage allocation requirements + const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1; + void* allocations[NUM_ALLOCATIONS]; + size_t allocation_sizes[NUM_ALLOCATIONS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT); + + allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue::AllocationSize(); + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the grid queue descriptor + GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); + + // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_output_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL]; + + // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_privatized_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL]; + + // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper privatized_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL]; + + // Setup 
array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper output_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL]; + + // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_privatized_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1; + + // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_output_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1; + + int histogram_init_block_threads = 256; + int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; + + // Log DeviceHistogramInitKernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", + histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); + + // Invoke histogram_init_kernel + histogram_init_kernel<<>>( + num_output_bins_wrapper, + d_output_histograms_wrapper, + tile_queue); + + // Return if empty problem + if ((blocks_per_row == 0) || (blocks_per_col == 0)) + break; + + // Log histogram_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n", + sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z, + histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy); + + // Invoke histogram_sweep_kernel + histogram_sweep_kernel<<>>( + d_samples, + num_output_bins_wrapper, + num_privatized_bins_wrapper, + d_output_histograms_wrapper, + d_privatized_histograms_wrapper, + output_decode_op_wrapper, + privatized_decode_op_wrapper, + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + + #endif // CUB_RUNTIME_ENABLED + } + + + + /** + * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. 
+ int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the search transform op for converting samples to privatized bins + typedef SearchTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + // Dispatch + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Too many bins to keep in shared memory. 
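                // (Note added for reference.) MAX_PRIVATIZED_SMEM_BINS is 256, so when the
                // widest channel needs more bins than that, PRIVATIZED_SMEM_BINS is set to 0
                // and each thread block keeps its privatized histogram in the global temporary
                // allocation instead of shared memory; otherwise the 256-bin shared-memory
                // path in the else-branch below is taken. For example, num_output_levels =
                // 1025 means 1024 bins per channel, which exceeds 256 and lands here.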
+ const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the search transform op for converting privatized bins to output bins + typedef SearchTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; // Maximum number of levels in any channel + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
+ OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the scale transform op for converting samples to privatized bins + typedef ScaleTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + + privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + } + while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the scale transform op for converting privatized bins to output bins + typedef ScaleTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + 
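+
+            // Usage sketch (illustrative only; it is not part of the dispatch logic above, and
+            // names such as d_samples/d_histogram are placeholders).  The NULL-query/allocate/
+            // re-invoke idiom implemented by these Dispatch* routines is normally exercised
+            // through the public front-end, e.g. cub::DeviceHistogram::HistogramEven:
+            //
+            //     float* d_samples;       // 10000 device-resident samples in [0.0, 1.0)
+            //     int*   d_histogram;     // 6 bins
+            //     int    num_levels  = 7;
+            //     float  lower_level = 0.0f, upper_level = 1.0f;
+            //     int    num_samples = 10000;
+            //
+            //     void*  d_temp_storage     = NULL;
+            //     size_t temp_storage_bytes = 0;
+            //     cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,    // size query only
+            //         d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+            //     cudaMalloc(&d_temp_storage, temp_storage_bytes);
+            //     cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,    // compute histogram
+            //         d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+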
} + while (0); + + return error; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/dispatch/dispatch_radix_sort.cuh b/dnn/src/cuda/cub/device/dispatch/dispatch_radix_sort.cuh new file mode 100644 index 00000000..d1a992d4 --- /dev/null +++ b/dnn/src/cuda/cub/device/dispatch/dispatch_radix_sort.cuh @@ -0,0 +1,1619 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_radix_sort_upsweep.cuh" +#include "../../agent/agent_radix_sort_downsweep.cuh" +#include "../../agent/agent_scan.cuh" +#include "../../block/block_radix_sort.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? 
+    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortUpsweepKernel(
+    const KeyT              *d_keys,        ///< [in] Input keys buffer
+    OffsetT                 *d_spine,       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT                 /*num_items*/,  ///< [in] Total number of input data items
+    int                     current_bit,    ///< [in] Bit position of current radix digit
+    int                     num_bits,       ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT>  even_share)     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    enum {
+        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
+                        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
+    };
+
+    // Parameterize AgentRadixSortUpsweep type for the current configuration
+    typedef AgentRadixSortUpsweep<
+            typename If<(ALT_DIGIT_BITS),
+                typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
+                typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,
+            KeyT,
+            OffsetT>
+        AgentRadixSortUpsweepT;
+
+    // Shared memory storage
+    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;
+
+    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+
+    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);
+
+    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);
+
+    CTA_SYNC();
+
+    // Write out digit counts (striped)
+    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
+}
+
+
+/**
+ * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms
+ */
+template <
+    typename                ChainedPolicyT,     ///< Chained tuning policy
+    typename                OffsetT>            ///< Signed integer type for global offsets
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortScanBinsKernel(
+    OffsetT                 *d_spine,           ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    int                     num_counts)         ///< [in] Total number of bin-counts
+{
+    // Parameterize the AgentScan type for the current configuration
+    typedef AgentScan<
+            typename ChainedPolicyT::ActivePolicy::ScanPolicy,
+            OffsetT*,
+            OffsetT*,
+            cub::Sum,
+            OffsetT,
+            OffsetT>
+        AgentScanT;
+
+    // Shared memory storage
+    __shared__ typename AgentScanT::TempStorage temp_storage;
+
+    // Block scan instance
+    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0));
+
+    // Process full input tiles
+    int block_offset = 0;
+    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());
+    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)
+    {
+        block_scan.template ConsumeTile(block_offset, prefix_op);
+        block_offset += AgentScanT::TILE_ITEMS;
+    }
+}
+
+
+/**
+ * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place.
+ */
+template <
+    typename                ChainedPolicyT,     ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,     ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,      ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,               ///< Key type
+    typename                ValueT,             ///< Value type
+    typename                OffsetT>            ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortDownsweepKernel(
+    const KeyT              *d_keys_in,     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,    ///< [in] Output keys buffer
+    const ValueT            *d_values_in,   ///< [in] Input values buffer
+    ValueT                  *d_values_out,  ///< [in] Output values buffer
+    OffsetT                 *d_spine,       ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT                 num_items,      ///< [in] Total number of input data items
+    int                     current_bit,    ///< [in] Bit position of current radix digit
+    int                     num_bits,       ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT>  even_share)     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    enum {
+        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
+                        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
+    };
+
+    // Parameterize AgentRadixSortDownsweep type for the current configuration
+    typedef AgentRadixSortDownsweep<
+            typename If<(ALT_DIGIT_BITS),
+                typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
+                typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type,
+            IS_DESCENDING,
+            KeyT,
+            ValueT,
+            OffsetT>
+        AgentRadixSortDownsweepT;
+
+    // Shared memory storage
+    __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage;
+
+    // Initialize even-share descriptor for this thread block
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+
+    // Process input tiles
+    AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end);
+}
+
+
+/**
+ * Single pass kernel entry point (single-block). Fully sorts a tile of input.
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceRadixSortSingleTileKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int end_bit) ///< [in] The past-the-end (most-significant) bit index needed for key comparison +{ + // Constants + enum + { + BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, + KEYS_ONLY = Equals::VALUE, + }; + + // BlockRadixSort type + typedef BlockRadixSort< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ValueT, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS, + (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM> + BlockRadixSortT; + + // BlockLoad type (keys) + typedef BlockLoad< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues; + + // Unsigned word for key bits + typedef typename Traits::UnsignedBits UnsignedBitsT; + + // Shared memory storage + __shared__ union TempStorage + { + typename BlockRadixSortT::TempStorage sort; + typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + + } temp_storage; + + // Keys and values for the block + KeyT keys[ITEMS_PER_THREAD]; + ValueT values[ITEMS_PER_THREAD]; + + // Get default (min/max) value for out-of-bounds keys + UnsignedBitsT default_key_bits = (IS_DESCENDING) ? 
Traits::LOWEST_KEY : Traits::MAX_KEY; + KeyT default_key = reinterpret_cast(default_key_bits); + + // Load keys + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); + + CTA_SYNC(); + + // Load values + if (!KEYS_ONLY) + { + // Register pressure work-around: moving num_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + num_items = ShuffleIndex(num_items, 0, 0xffffffff); + + BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); + + CTA_SYNC(); + } + + // Sort tile + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( + keys, + values, + current_bit, + end_bit, + Int2Type(), + Int2Type()); + + // Store keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; + if (item_offset < num_items) + { + d_keys_out[item_offset] = keys[ITEM]; + if (!KEYS_ONLY) + d_values_out[item_offset] = values[ITEM]; + } + } +} + + +/** + * Segmented radix sorting pass (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedRadixSortKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + int current_bit, ///< [in] Bit position of current radix digit + int pass_bits) ///< [in] Number of bits of current radix digit +{ + // + // Constants + // + + typedef typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy, + typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT; + + enum + { + BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, + RADIX_BITS = SegmentedPolicyT::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Upsweep type + typedef AgentRadixSortUpsweep< + AgentRadixSortUpsweepPolicy, + KeyT, + OffsetT> + BlockUpsweepT; + + // Digit-scan type + typedef BlockScan DigitScanT; + + // Downsweep type + typedef AgentRadixSortDownsweep BlockDownsweepT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD + }; + + // + // Process input tiles + // + + // Shared memory storage + __shared__ union + { + typename BlockUpsweepT::TempStorage upsweep; + typename BlockDownsweepT::TempStorage downsweep; + struct + { + volatile OffsetT reverse_counts_in[RADIX_DIGITS]; + volatile OffsetT reverse_counts_out[RADIX_DIGITS]; + typename DigitScanT::TempStorage scan; + }; + + } temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + OffsetT num_items = segment_end - segment_begin; + + // Check if empty segment + if (num_items <= 0) + return; + + // Upsweep + BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); + upsweep.ProcessRegion(segment_begin, segment_end); + + CTA_SYNC(); + + // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) + OffsetT bin_count[BINS_TRACKED_PER_THREAD]; + upsweep.ExtractCounts(bin_count); + + CTA_SYNC(); + + if (IS_DESCENDING) + { + // Reverse bin counts + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; + } + } + + // Scan + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads) + DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + bin_offset[track] += segment_begin; + } + + if (IS_DESCENDING) + { + // Reverse bin offsets + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; + } + } + + CTA_SYNC(); + + // Downsweep + BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); + downsweep.ProcessRegion(segment_begin, segment_end); +} + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +/** + * Tuning policy for kernel specialization + */ +template < + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + // Dominant-sized key/value type + typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT; + + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep 
policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) + }; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef DownsweepPolicyKeys DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + + + }; + + + /// SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.1B 32b segmented keys/s (TitanX) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM60 (GP100) + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + + }; + + + /// SM61 (GP104) + struct Policy610 : ChainedPolicy<610, Policy610, Policy600> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.3B 32b segmented keys/s (1080) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortUpsweepPolicy UpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM62 (Tegra, less RF) + struct Policy620 : ChainedPolicy<620, Policy620, Policy610> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM70 (GV100) + struct Policy700 : ChainedPolicy<700, Policy700, Policy620> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 8.7B 32b segmented keys/s (GV100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// MaxPolicy + typedef Policy700 MaxPolicy; + + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
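+
+    // Usage sketch (illustrative only; buffer names below are placeholders).  This dispatcher
+    // is normally reached through the public front-end; the DoubleBuffer overloads set
+    // is_overwrite_okay, which lets the sort ping-pong between the two user buffers:
+    //
+    //     int num_items = 1 << 20;
+    //     cub::DoubleBuffer<int>   d_keys(d_key_buf, d_key_alt_buf);
+    //     cub::DoubleBuffer<float> d_values(d_value_buf, d_value_alt_buf);
+    //
+    //     void*  d_temp_storage     = NULL;
+    //     size_t temp_storage_bytes = 0;
+    //     cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+    //         d_keys, d_values, num_items);                      // size query only
+    //     cudaMalloc(&d_temp_storage, temp_storage_bytes);
+    //     cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+    //         d_keys, d_values, num_items);                      // sort
+    //
+    //     // Sorted results are referenced by d_keys.Current() / d_values.Current()
+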
+ int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + begin_bit(begin_bit), + end_bit(end_bit), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version), + is_overwrite_okay(is_overwrite_okay) + {} + + + //------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block to sort in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Log single_tile_kernel configuration + if (debug_synchronous) + _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_keys.Current(), + d_keys.Alternate(), + d_values.Current(), + d_values.Alternate(), + num_items, + begin_bit, + end_bit); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update selector + d_keys.selector ^= 1; + d_values.selector ^= 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation + //------------------------------------------------------------------------------ + + /** + * Invoke a three-kernel sorting pass at the current bit. 
+ */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + OffsetT *d_spine, + int spine_length, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log upsweep_kernel configuration + if (debug_synchronous) + _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, + pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + pass_config.upsweep_kernel<<>>( + d_keys_in, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); + + // Invoke scan_kernel + pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>( + d_spine, + spine_length); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log downsweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream, + pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); + + // Invoke downsweep_kernel + pass_config.downsweep_kernel<<>>( + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + + /// Pass configuration structure + template < + typename UpsweepKernelT, + typename ScanKernelT, + typename DownsweepKernelT> + struct PassConfig + { + UpsweepKernelT upsweep_kernel; + KernelConfig upsweep_config; + ScanKernelT scan_kernel; + KernelConfig scan_config; + DownsweepKernelT downsweep_kernel; + KernelConfig downsweep_config; + int radix_bits; + int radix_digits; + int max_downsweep_grid_size; + GridEvenShare even_share; + + /// Initialize pass configuration + template < + typename UpsweepPolicyT, + typename ScanPolicyT, + typename DownsweepPolicyT> + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig( + UpsweepKernelT upsweep_kernel, + ScanKernelT scan_kernel, + DownsweepKernelT downsweep_kernel, + int ptx_version, + int sm_count, + int num_items) + { + cudaError error = cudaSuccess; + do + { + 
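+                // Note on the grid sizing computed below: max_downsweep_grid_size oversubscribes the
+                // device by a small constant factor so tail effects are amortized, i.e.
+                // max_downsweep_grid_size = downsweep sm_occupancy * sm_count * CUB_SUBSCRIPTION_FACTOR.
+                // For illustration only (assumed numbers): with 8 resident CTAs/SM on 20 SMs and a
+                // subscription factor of 7, that is 8 * 20 * 7 = 1120 thread blocks, which the
+                // even-share descriptor then maps onto tiles of the input.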
this->upsweep_kernel = upsweep_kernel; + this->scan_kernel = scan_kernel; + this->downsweep_kernel = downsweep_kernel; + radix_bits = DownsweepPolicyT::RADIX_BITS; + radix_digits = 1 << radix_bits; + + if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; + if (CubDebug(error = scan_config.Init(scan_kernel))) break; + if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; + + max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version); + + even_share.DispatchInit( + num_items, + max_downsweep_grid_size, + CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); + + } + while (0); + return error; + } + + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel + typename ScanKernelT, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + DownsweepKernelT alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)upsweep_kernel; + (void)alt_upsweep_kernel; + (void)scan_kernel; + (void)downsweep_kernel; + (void)alt_downsweep_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular and alternate-digit kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig< + typename ActivePolicyT::UpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::DownsweepPolicy>( + upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break; + + if ((error = alt_pass_config.template InitPassConfig< + typename ActivePolicyT::AltUpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::AltDownsweepPolicy>( + alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break; + + // Get maximum spine length + int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); + int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; + + // Temporary storage allocation requirements + void* allocations[3]; + size_t allocation_sizes[3] = + { + spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms + (is_overwrite_okay) ? 
0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size + int num_bits = end_bit - begin_bit; + int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits; + bool is_num_passes_odd = num_passes & 1; + int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); + + // Alias the temporary storage allocations + OffsetT *d_spine = static_cast(allocations[0]); + + DoubleBuffer d_keys_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[1]), + (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_keys.Alternate()); + + DoubleBuffer d_values_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[2]), + (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[2]) : d_values.Alternate()); + + // Run first pass, consuming from the input's current buffers + int current_bit = begin_bit; + if (CubDebug(error = InvokePass( + d_keys.Current(), d_keys_remaining_passes.Current(), + d_values.Current(), d_values_remaining_passes.Current(), + d_spine, spine_length, current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + + // Run remaining passes + while (current_bit < end_bit) + { + if (CubDebug(error = InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_spine, spine_length, current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break;; + + // Invert selectors + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceRadixSortSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, + DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, + RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items, ///< [in] Number of items to sort + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSegmentedRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Parameter members + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
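+
+    // Usage sketch (illustrative only; array names and contents below are placeholders).
+    // Segment i spans [d_begin_offsets[i], d_end_offsets[i]); with a single offsets array,
+    // d_offsets and d_offsets + 1 can serve as the begin/end sequences:
+    //
+    //     int  num_items = 7, num_segments = 3;
+    //     int* d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+    //     int* d_keys_out;
+    //     int* d_offsets;         // e.g., [0, 3, 3, 7]
+    //
+    //     void*  d_temp_storage     = NULL;
+    //     size_t temp_storage_bytes = 0;
+    //     cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
+    //         d_keys_in, d_keys_out, num_items, num_segments, d_offsets, d_offsets + 1);
+    //     cudaMalloc(&d_temp_storage, temp_storage_bytes);
+    //     cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
+    //         d_keys_in, d_keys_out, num_items, num_segments, d_offsets, d_offsets + 1);
+    //
+    //     // d_keys_out now holds [6, 7, 8,  0, 3, 5, 9]  (segment 1 is empty)
+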
+ int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructors + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + begin_bit(begin_bit), + end_bit(end_bit), + is_overwrite_okay(is_overwrite_okay), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Multi-segment invocation + //------------------------------------------------------------------------------ + + /// Invoke a three-kernel sorting pass at the current bit. + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log kernel configuration + if (debug_synchronous) + _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + num_segments, pass_config.segmented_config.block_threads, (long long) stream, + pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); + + pass_config.segmented_kernel<<>>( + d_keys_in, d_keys_out, + d_values_in, d_values_out, + d_begin_offsets, d_end_offsets, num_segments, + current_bit, pass_bits); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + /// PassConfig data structure + template + struct PassConfig + { + SegmentedKernelT segmented_kernel; + KernelConfig segmented_config; + int radix_bits; + int radix_digits; + + /// Initialize pass configuration + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel) + { + this->segmented_kernel = segmented_kernel; + this->radix_bits = SegmentedPolicyT::RADIX_BITS; + this->radix_digits = 1 << radix_bits; + + return CubDebug(segmented_config.Init(segmented_kernel)); + } + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SegmentedKernelT> ///< Function type of cub::DeviceSegmentedRadixSortKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization 
of cub::DeviceSegmentedRadixSortKernel + SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_kernel; + (void)alt_segmented_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Init regular and alternate kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig(segmented_kernel))) break; + if ((error = alt_pass_config.template InitPassConfig(alt_segmented_kernel))) break; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + if (temp_storage_bytes == 0) + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size + int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; + int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; + int num_bits = end_bit - begin_bit; + int num_passes = (num_bits + radix_bits - 1) / radix_bits; + bool is_num_passes_odd = num_passes & 1; + int max_alt_passes = (num_passes * radix_bits) - num_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); + + DoubleBuffer d_keys_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), + (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[0]) : d_keys.Alternate()); + + DoubleBuffer d_values_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[1]), + (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_values.Alternate()); + + // Run first pass, consuming from the input's current buffers + int current_bit = begin_bit; + + if (CubDebug(error = InvokePass( + d_keys.Current(), d_keys_remaining_passes.Current(), + d_values.Current(), d_values_remaining_passes.Current(), + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + + // Run remaining passes + while (current_bit < end_bit) + { + if (CubDebug(error = InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; + + // Invert selectors and update current bit + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedRadixSortKernel, + DeviceSegmentedRadixSortKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + + /// Internal dispatch routine + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
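The segmented dispatcher is normally reached through cub::DeviceSegmentedRadixSort. A hedged sketch, assuming the common convention of an offsets array with num_segments + 1 entries so that segment i spans [d_offsets[i], d_offsets[i+1]); names are illustrative:

    #include <cub/cub.cuh>

    cudaError_t sort_segments_example(const int* d_keys_in, int* d_keys_out,
                                      int num_items, int num_segments,
                                      const int* d_offsets,   // num_segments + 1 entries
                                      cudaStream_t stream)
    {
        void*  d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Begin offsets alias d_offsets; end offsets are the same array shifted by one.
        cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                                d_keys_in, d_keys_out, num_items, num_segments,
                                                d_offsets, d_offsets + 1,
                                                0, sizeof(int) * 8, stream);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        cudaError_t error = cub::DeviceSegmentedRadixSort::SortKeys(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, num_items, num_segments,
            d_offsets, d_offsets + 1,
            0, sizeof(int) * 8, stream);
        cudaFree(d_temp_storage);
        return error;
    }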
+ { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, num_segments, d_begin_offsets, d_end_offsets, + begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/dispatch/dispatch_reduce.cuh b/dnn/src/cuda/cub/device/dispatch/dispatch_reduce.cuh new file mode 100644 index 00000000..e9d1b7ac --- /dev/null +++ b/dnn/src/cuda/cub/device/dispatch/dispatch_reduce.cuh @@ -0,0 +1,882 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "../../agent/agent_reduce.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block. + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block + ReductionOpT reduction_op) ///< [in] Binary reduction functor +{ + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); + + // Output result + if (threadIdx.x == 0) + d_out[blockIdx.x] = block_aggregate; +} + + +/** + * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. 
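The two kernels form a classic two-pass reduction: the multi-block kernel writes one privatized partial per thread block, and the single-tile kernel then folds those partials (plus the initial value) into the final aggregate. A deliberately simplified CUDA sketch of that structure, not the CUB kernels themselves (fixed 256-thread blocks, sum only, illustrative names):

    // Pass 1: one partial sum per block, written to block_out[blockIdx.x].
    __global__ void partial_sums(const float* in, float* block_out, int n)
    {
        __shared__ float smem[256];
        int tid = threadIdx.x;
        int idx = blockIdx.x * blockDim.x + tid;
        smem[tid] = (idx < n) ? in[idx] : 0.0f;
        __syncthreads();
        for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
            if (tid < stride) smem[tid] += smem[tid + stride];
            __syncthreads();
        }
        if (tid == 0) block_out[blockIdx.x] = smem[0];
    }

    // Pass 2: a single block reduces the per-block partials.
    __global__ void final_sum(const float* block_in, float* out, int num_blocks)
    {
        __shared__ float smem[256];
        int tid = threadIdx.x;
        float v = 0.0f;
        for (int i = tid; i < num_blocks; i += blockDim.x) v += block_in[i];
        smem[tid] = v;
        __syncthreads();
        for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
            if (tid < stride) smem[tid] += smem[tid + stride];
            __syncthreads();
        }
        if (tid == 0) *out = smem[0];
    }

    // d_partials must hold one float per block, mirroring the privatized
    // per-block reductions that CUB places in its temporary storage.
    void two_pass_sum(const float* d_in, float* d_partials, float* d_out,
                      int n, cudaStream_t stream)
    {
        int num_blocks = (n + 255) / 256;
        if (num_blocks == 0) num_blocks = 1;
        partial_sums<<<num_blocks, 256, 0, stream>>>(d_in, d_partials, n);
        final_sum<<<1, 256, 0, stream>>>(d_partials, d_out, num_blocks);
    }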
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OuputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceReduceSingleTileKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OuputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) + *d_out = init; + return; + } + + // Consume input tiles + OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + OffsetT(0), + num_items); + + // Output result + if (threadIdx.x == 0) + *d_out = reduction_op(init, block_aggregate); +} + + +/// Normalize input iterator to segment offset +template +__device__ __forceinline__ +void NormalizeReductionOutput( + T &/*val*/, + OffsetT /*base_offset*/, + IteratorT /*itr*/) +{} + + +/// Normalize input iterator to segment offset (specialized for arg-index) +template +__device__ __forceinline__ +void NormalizeReductionOutput( + KeyValuePairT &val, + OffsetT base_offset, + ArgIndexInputIterator /*itr*/) +{ + val.key -= base_offset; +} + + +/** + * Segmented reduction (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + + // Check if empty problem + if (segment_begin == segment_end) + { + if (threadIdx.x == 0) + d_out[blockIdx.x] = init; + return; + } + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + segment_begin, + segment_end); + + // Normalize as needed + NormalizeReductionOutput(block_aggregate, segment_begin, d_in); + + if (threadIdx.x == 0) + d_out[blockIdx.x] = reduction_op(init, block_aggregate);; +} + + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template < + typename OuputT, ///< Data type + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DeviceReducePolicy +{ + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + // ReducePolicy + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM35 + struct 
Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + /// SM60 + struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 16, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// MaxPolicy + typedef Policy600 MaxPolicy; + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchReduce : + DeviceReducePolicy< + typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type, // ... else the output iterator's value type + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + // Data type of output iterator + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
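ChainedPolicy consumes the Policy130 through Policy600 ladder above: starting from MaxPolicy it walks down to the policy whose architecture matches the device's PTX version and then calls the dispatcher's templated Invoke<ActivePolicyT>(). A much-simplified, hypothetical illustration of that selection pattern (ToyPolicy350, ToyPolicy600, and ToyDispatch are made up and are not CUB types):

    #include <cstdio>

    // Hypothetical tuning policies, analogous to the per-architecture policies above.
    struct ToyPolicy350 { static const int BLOCK_THREADS = 256, ITEMS_PER_THREAD = 20; };
    struct ToyPolicy600 { static const int BLOCK_THREADS = 256, ITEMS_PER_THREAD = 16; };

    // Dispatch functor with a templated Invoke, like DispatchReduce::Invoke<ActivePolicyT>().
    struct ToyDispatch {
        template <typename ActivePolicyT>
        void Invoke() const {
            int block_threads    = ActivePolicyT::BLOCK_THREADS;
            int items_per_thread = ActivePolicyT::ITEMS_PER_THREAD;
            std::printf("launch with %d threads x %d items per thread\n",
                        block_threads, items_per_thread);
        }
    };

    // Walk from the highest architecture downward to the first one that fits.
    void invoke_for_ptx(int ptx_version, const ToyDispatch& dispatch)
    {
        if (ptx_version >= 600) dispatch.Invoke<ToyPolicy600>();
        else                    dispatch.Invoke<ToyPolicy350>();  // chain continues downward in CUB
    }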
+ size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_items(num_items), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block block to reduce in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke single_reduce_sweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_in, + d_out, + num_items, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation (two-pass) + 
//------------------------------------------------------------------------------ + + /// Invoke two-passes to reduce + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename ReduceKernelT, ///< Function type of cub::DeviceReduceKernel + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void) reduce_kernel; + (void) single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular kernel configuration + KernelConfig reduce_config; + if (CubDebug(error = reduce_config.Init(reduce_kernel))) break; + int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; + + // Even-share work distribution + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version); + GridEvenShare even_share; + even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); + + // Temporary storage allocation requirements + void* allocations[1]; + size_t allocation_sizes[1] = + { + max_blocks * sizeof(OutputT) // bytes needed for privatized block reductions + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + OutputT *d_block_reductions = (OutputT*) allocations[0]; + + // Get grid size for device_reduce_sweep_kernel + int reduce_grid_size = even_share.grid_size; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + reduce_grid_size, + ActivePolicyT::ReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, + reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + reduce_kernel<<>>( + d_in, + d_block_reductions, + num_items, + even_share, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke DeviceReduceSingleTileKernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_block_reductions, + d_out, + 
reduce_grid_size, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceReduceSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceReduceKernel, + DeviceReduceSingleTileKernel); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
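For completeness, a hedged sketch of reaching this dispatcher through the public cub::DeviceReduce::Reduce wrapper, which supplies the same reduction_op and init arguments listed above (MaxOp and the function name are illustrative, not CUB code):

    #include <cub/cub.cuh>
    #include <cfloat>

    // Illustrative reduction functor: element-wise maximum.
    struct MaxOp {
        __device__ __forceinline__ float operator()(float a, float b) const {
            return (a > b) ? a : b;
        }
    };

    cudaError_t device_max_example(const float* d_in, float* d_out,
                                   int num_items, cudaStream_t stream)
    {
        void*  d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Size query, then the real run: the same two-phase idiom as the other dispatchers.
        cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                                  d_in, d_out, num_items, MaxOp(), -FLT_MAX, stream);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cudaError_t error = cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                                                      d_in, d_out, num_items, MaxOp(), -FLT_MAX, stream);
        cudaFree(d_temp_storage);
        return error;
    }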
+ { + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchSegmentedReduce : + DeviceReducePolicy< + typename std::iterator_traits::value_type, + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_reduce_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Init kernel configuration + KernelConfig segmented_reduce_config; + if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, + segmented_reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + segmented_reduce_kernel<<>>( + d_in, + d_out, + d_begin_offsets, + d_end_offsets, + num_segments, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedReduceKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine 
for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + if (num_segments <= 0) + return cudaSuccess; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, + num_segments, d_begin_offsets, d_end_offsets, + reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh b/dnn/src/cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh new file mode 100644 index 00000000..6f4837b7 --- /dev/null +++ b/dnn/src/cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -0,0 +1,554 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
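A hedged caller-side sketch of the segmented reduction handled by DispatchSegmentedReduce, via cub::DeviceSegmentedReduce::Sum; as in the segmented sort example earlier, d_offsets is assumed to hold num_segments + 1 entries so that segment i spans [d_offsets[i], d_offsets[i+1]):

    #include <cub/cub.cuh>

    // Writes one aggregate per segment to d_out[0 .. num_segments-1].
    cudaError_t segmented_sum_example(const float* d_in, float* d_out,
                                      int num_segments, const int* d_offsets,
                                      cudaStream_t stream)
    {
        void*  d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
                                        d_in, d_out, num_segments,
                                        d_offsets, d_offsets + 1, stream);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cudaError_t error = cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
                                                            d_in, d_out, num_segments,
                                                            d_offsets, d_offsets + 1, stream);
        cudaFree(d_temp_storage);
        return error;
    }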
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_reduce_by_key.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicyT tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) +__global__ void DeviceReduceByKeyKernel( + KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op, ///< ValueT reduction operator + OffsetT num_items) ///< Total number of items to select from +{ + // Thread block type for reducing tiles of value segments + typedef AgentReduceByKey< + AgentReduceByKeyPolicyT, + KeysInputIteratorT, + UniqueOutputIteratorT, + 
ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT> + AgentReduceByKeyT; + + // Shared memory for AgentReduceByKey + __shared__ typename AgentReduceByKeyT::TempStorage temp_storage; + + // Process tiles + AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey + */ +template < + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchReduceByKey +{ + //------------------------------------------------------------------------- + // Types and constants + //------------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)), + COMBINED_INPUT_BYTES = sizeof(KeyOutputT) + sizeof(ValueOutputT), + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + //------------------------------------------------------------------------- + // Tuning policies + //------------------------------------------------------------------------- + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM11 + struct Policy110 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + ReduceByKeyPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_by_key_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); 
+ } + else if (ptx_version >= 130) + { + reduce_by_key_config.template Init(); + } + else + { + reduce_by_key_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduce-by-key using the + * specified kernel functions. + */ + template < + typename ScanInitKernelT, ///< Function type of cub::DeviceScanInitKernel + typename ReduceByKeyKernelT> ///< Function type of cub::DeviceReduceByKeyKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel + KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_keys_in; + (void)d_unique_out; + (void)d_values_in; + (void)d_aggregates_out; + (void)d_num_runs_out; + (void)equality_op; + (void)reduction_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)reduce_by_key_kernel; + (void)reduce_by_key_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for reduce_by_key_kernel + int reduce_by_key_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_by_key_sm_occupancy, // out + reduce_by_key_kernel, + reduce_by_key_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log reduce_by_key_kernel configuration + if (debug_synchronous) 
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy); + + // Invoke reduce_by_key_kernel + reduce_by_key_kernel<<>>( + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + tile_state, + start_tile, + equality_op, + reduction_op, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig reduce_by_key_config; + InitConfigs(ptx_version, reduce_by_key_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + equality_op, + reduction_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceReduceByKeyKernel, + reduce_by_key_config))) break; + } + while (0); + + return error; + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/dispatch/dispatch_rle.cuh b/dnn/src/cuda/cub/device/dispatch/dispatch_rle.cuh new file mode 100644 index 00000000..98c3681f --- /dev/null +++ b/dnn/src/cuda/cub/device/dispatch/dispatch_rle.cuh @@ -0,0 +1,538 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_rle.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) +__global__ void DeviceRleSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + 
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentRle< + AgentRlePolicyT, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + EqualityOpT, + OffsetT> AgentRleT; + + // Shared memory for AgentRle + __shared__ typename AgentRleT::TempStorage temp_storage; + + // Process tiles + AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_runs_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceRle + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRleDispatch +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The input value type + typedef typename std::iterator_traits::value_type T; + + // The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 96, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig& device_rle_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that 
match the device's PTX version + if (ptx_version >= 350) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 300) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 200) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 130) + { + device_rle_config.template Init(); + } + else + { + device_rle_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within AgentRlePolicyT. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool store_warp_time_slicing; + BlockScanAlgorithm scan_algorithm; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = AgentRlePolicyT::BLOCK_THREADS; + items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD; + load_policy = AgentRlePolicyT::LOAD_ALGORITHM; + store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING; + scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_warp_time_slicing, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide run-length-encode using the + * specified kernel functions. + */ + template < + typename DeviceScanInitKernelPtr, ///< Function type of cub::DeviceScanInitKernel + typename DeviceRleSweepKernelPtr> ///< Function type of cub::DeviceRleSweepKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int ptx_version, ///< [in] PTX version of dispatch kernels + DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel + KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log device_scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors + device_scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for device_rle_sweep_kernel + int device_rle_kernel_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + device_rle_kernel_sm_occupancy, // out + device_rle_sweep_kernel, + device_rle_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log device_rle_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, 
device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); + + // Invoke device_rle_sweep_kernel + device_rle_sweep_kernel<<>>( + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + tile_status, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig device_rle_config; + InitConfigs(ptx_version, device_rle_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceRleSweepKernel, + device_rle_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/dispatch/dispatch_scan.cuh b/dnn/src/cuda/cub/device/dispatch/dispatch_scan.cuh new file mode 100644 index 00000000..3ef720a4 --- /dev/null +++ b/dnn/src/cuda/cub/device/dispatch/dispatch_scan.cuh @@ -0,0 +1,563 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
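 *
 * (A minimal sketch of how the DeviceRleDispatch defined above is typically
 *  reached; the wrapper name cub::DeviceRunLengthEncode::NonTrivialRuns and the
 *  device pointers are assumptions used for illustration only.)
 *
 *    void*  d_temp_storage     = NULL;
 *    size_t temp_storage_bytes = 0;
 *    cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
 *        d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);  // sizing pass
 *    cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *    cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
 *        d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);  // encoding pass
 *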
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_scan.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_arch.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT> ///< Tile status interface type +__global__ void DeviceScanInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); +} + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT, ///< Tile status interface type + typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected +__global__ void DeviceCompactInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles, ///< [in] Number of tiles + NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + // Initialize d_num_selected_out + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + *d_num_selected_out = 0; +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanTileStateT, ///< Tile status interface type + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) + typename 
OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS)) +__global__ void DeviceScanKernel( + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + ScanOpT scan_op, ///< Binary scan functor + InitValueT init_value, ///< Initial value to seed the exclusive scan + OffsetT num_items) ///< Total number of scan items for the entire problem +{ + // Thread block type for scanning input tiles + typedef AgentScan< + ScanPolicyT, + InputIteratorT, + OutputIteratorT, + ScanOpT, + InitValueT, + OffsetT> AgentScanT; + + // Shared memory for AgentScan + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Process tiles + AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchScan +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM600 + struct Policy600 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 15, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM520 + struct Policy520 + { + // Titan X: 32.47B items/s @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM35 + struct Policy350 + { + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING> + ScanPolicyT; + }; + + /// SM30 + struct Policy300 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(256, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM20 + struct Policy200 + { + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM13 + struct Policy130 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(96, 21, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanPolicyT; + }; + + /// SM10 + struct Policy100 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(64, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 520) + typedef Policy520 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_kernel_config) + { + #if (CUB_PTX_ARCH > 0) + 
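        // When this function is compiled in a device pass, CUB_PTX_ARCH is nonzero and the
        // scan policy was already fixed at compile time (PtxAgentScanPolicy), so the runtime
        // ptx_version argument is unused; the cast below only silences that warning.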
(void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_kernel_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 520) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 350) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_kernel_config.template Init(); + } + else + { + scan_kernel_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename ScanSweepKernelPtrT> ///< Function type of cub::DeviceScanKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ScanSweepKernelPtrT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel + KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_out; + (void)scan_op; + (void)init_value; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)scan_kernel; + (void)scan_kernel_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_sm_occupancy, // out + scan_kernel, + scan_kernel_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, 
scan_sm_occupancy); + + // Invoke scan_kernel + scan_kernel<<>>( + d_in, + d_out, + tile_state, + start_tile, + scan_op, + init_value, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Get kernel kernel dispatch configurations + KernelConfig scan_kernel_config; + InitConfigs(ptx_version, scan_kernel_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceScanInitKernel, + DeviceScanKernel, + scan_kernel_config))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/dispatch/dispatch_select_if.cuh b/dnn/src/cuda/cub/device/dispatch/dispatch_select_if.cuh new file mode 100644 index 00000000..60b33133 --- /dev/null +++ b/dnn/src/cuda/cub/device/dispatch/dispatch_select_if.cuh @@ -0,0 +1,542 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
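 *
 * (A minimal sketch of the prefix-scan entry point that DispatchScan above
 *  ultimately serves; cub::DeviceScan::ExclusiveSum and the device pointers are
 *  illustrative assumptions.)
 *
 *    void*  d_temp_storage     = NULL;
 *    size_t temp_storage_bytes = 0;
 *    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
 *        d_in, d_out, num_items);                  // query temp storage size
 *    cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
 *        d_in, d_out, num_items);                  // run the scan
 *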
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_select_if.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename ScanTileStateT, ///< Tile status interface type + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) +__global__ void DeviceSelectSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + SelectOpT 
select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentSelectIf< + AgentSelectIfPolicyT, + InputIteratorT, + FlagsInputIteratorT, + SelectedOutputIteratorT, + SelectOpT, + EqualityOpT, + OffsetT, + KEEP_REJECTS> AgentSelectIfT; + + // Shared memory for AgentSelectIf + __shared__ typename AgentSelectIfT::TempStorage temp_storage; + + // Process tiles + AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_selected_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct DispatchSelectIf +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 10, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectIfPolicyT; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + SelectIfPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &select_if_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template 
Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + select_if_config.template Init(); + } + else if (ptx_version >= 300) + { + select_if_config.template Init(); + } + else if (ptx_version >= 200) + { + select_if_config.template Init(); + } + else if (ptx_version >= 130) + { + select_if_config.template Init(); + } + else + { + select_if_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide selection using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel + KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_flags; + (void)d_selected_out; + (void)d_num_selected_out; + (void)select_op; + (void)equality_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)scan_init_kernel; + (void)select_if_kernel; + (void)select_if_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = select_if_config.block_threads * select_if_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke scan_init_kernel to initialize tile descriptors + scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_selected_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for select_if_kernel + int range_select_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + range_select_sm_occupancy, // out + select_if_kernel, + select_if_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log select_if_kernel configuration + if (debug_synchronous) _CubLog("Invoking 
select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy); + + // Invoke select_if_kernel + select_if_kernel<<>>( + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + tile_status, + select_op, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig select_if_config; + InitConfigs(ptx_version, select_if_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + select_op, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceSelectSweepKernel, + select_if_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/device/dispatch/dispatch_spmv_orig.cuh b/dnn/src/cuda/cub/device/dispatch/dispatch_spmv_orig.cuh new file mode 100644 index 00000000..ab9c5346 --- /dev/null +++ b/dnn/src/cuda/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -0,0 +1,834 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include + +#include "../../agent/single_pass_scan_operators.cuh" +#include "../../agent/agent_segment_fixup.cuh" +#include "../../agent/agent_spmv_orig.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * SpMV kernel entry points + *****************************************************************************/ + +/** + * Spmv search kernel. Identifies merge path starting coordinates for each tile. + */ +template < + typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +__global__ void DeviceSpmv1ColKernel( + SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +{ + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); + + int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (row_idx < spmv_params.num_rows) + { + OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; + OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; + + ValueT value = 0.0; + if (end_nonzero_idx != nonzero_idx) + { + value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; + } + + spmv_params.d_vector_y[row_idx] = value; + } +} + + +/** + * Spmv search kernel. 
Identifies merge path starting coordinates for each tile. + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + typename SpmvParamsT> ///< SpmvParams type +__global__ void DeviceSpmvSearchKernel( + int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) + CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates + SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle +{ + /// Constants + enum + { + BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + typedef CacheModifiedInputIterator< + SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + // Find the starting coordinate for all tiles (plus the end coordinate of the last one) + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_merge_tiles + 1) + { + OffsetT diagonal = (tile_idx * TILE_ITEMS); + CoordinateT tile_coordinate; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coordinate); + + // Output starting offset + d_tile_coordinates[tile_idx] = tile_coordinate; + } +} + + +/** + * Spmv agent entry point + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ScanTileStateT, ///< Tile status interface type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 + bool HAS_BETA> ///< Whether the input parameter Beta is 0 +__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) +__global__ void DeviceSpmvKernel( + SpmvParams spmv_params, ///< [in] SpMV input parameter bundle + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_tiles, ///< [in] Number of merge tiles + ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel + int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) +{ + // Spmv agent type specialization + typedef AgentSpmv< + SpmvPolicyT, + ValueT, + OffsetT, + HAS_ALPHA, + HAS_BETA> + AgentSpmvT; + + // Shared memory for AgentSpmv + __shared__ typename AgentSpmvT::TempStorage temp_storage; + + AgentSpmvT(temp_storage, spmv_params).ConsumeTile( + d_tile_coordinates, + d_tile_carry_pairs, + num_tiles); + + // Initialize fixup tile status + tile_state.InitializeStatus(num_segment_fixup_tiles); + +} + + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename OffsetT, ///< Signed integer type for global offsets + typename ScanTileStateT> ///< Tile status interface type 
+__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) +__global__ void DeviceSegmentFixupKernel( + PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block + AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates + OffsetT num_items, ///< [in] Total number of items to select from + int num_tiles, ///< [in] Total number of tiles for the entire problem + ScanTileStateT tile_state) ///< [in] Tile status interface +{ + // Thread block type for reducing tiles of value segments + typedef AgentSegmentFixup< + AgentSegmentFixupPolicyT, + PairsInputIteratorT, + AggregatesOutputIteratorT, + cub::Equality, + cub::Sum, + OffsetT> + AgentSegmentFixupT; + + // Shared memory for AgentSegmentFixup + __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; + + // Process tiles + AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange( + num_items, + num_tiles, + tile_state); +} + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + */ +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSpmv +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // SpmvParams bundle type + typedef SpmvParams SpmvParamsT; + + // 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM11 + struct Policy110 + { + typedef AgentSpmvPolicy< + 128, + 1, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM20 + struct Policy200 + { + typedef AgentSpmvPolicy< + 96, + 18, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_RAKING> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + + /// SM30 + struct Policy300 + { + typedef AgentSpmvPolicy< + 96, + 6, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + /// SM35 + struct Policy350 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 96 : 128, + (sizeof(ValueT) > 4) ? 4 : 7, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + (sizeof(ValueT) > 4) ? 
true : false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + /// SM37 + struct Policy370 + { + + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 128 : 128, + (sizeof(ValueT) > 4) ? 9 : 14, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM50 + struct Policy500 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 6 : 7, + LOAD_LDG, + LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + LOAD_LDG, + (sizeof(ValueT) > 4) ? true : false, + (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_RAKING_MEMOIZE> + SegmentFixupPolicyT; + }; + + + /// SM60 + struct Policy600 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 5 : 7, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 370) + typedef Policy370 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; + struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &spmv_config, + KernelConfig &segment_fixup_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + spmv_config.template Init(); + segment_fixup_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 500) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 370) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 350) + { + spmv_config.template Init(); + 
segment_fixup_config.template Init(); + } + else if (ptx_version >= 300) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + + } + else if (ptx_version >= 200) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduction using the + * specified kernel functions. + * + * If the input is larger than a single tile, this method uses two-passes of + * kernel invocations. + */ + template < + typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel + typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel + typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel + typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel + SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel + SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel + SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel + KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for + KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + cudaError error = cudaSuccess; + do + { + if (spmv_params.num_cols == 1) + { + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + temp_storage_bytes = 1; + break; + } + + // Get search/init grid dims + int degen_col_kernel_block_size = INIT_KERNEL_THREADS; + int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size; + + if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", + degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_1col_kernel<<>>( + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + break; + } + + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Total number of spmv work items + int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; + + // Tile sizes of kernels + int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; + int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; + + // Number of tiles for kernels + unsigned int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size; + unsigned int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size; + + // Get SM occupancy for kernels + int spmv_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + spmv_sm_occupancy, + spmv_kernel, + spmv_config.block_threads))) break; + + int segment_fixup_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + segment_fixup_sm_occupancy, + segment_fixup_kernel, + segment_fixup_config.block_threads))) break; + + // Get grid dimensions + dim3 spmv_grid_size( + CUB_MIN(num_merge_tiles, max_dim_x), + (num_merge_tiles + max_dim_x - 1) / max_dim_x, + 1); + + dim3 segment_fixup_grid_size( + CUB_MIN(num_segment_fixup_tiles, max_dim_x), + (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x, + 1); + + // Get the temporary storage allocation requirements + size_t allocation_sizes[3]; + if (CubDebug(error = 
ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break; // bytes needed for reduce-by-key tile status descriptors + allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs + allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + void* allocations[3]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break; + + // Alias the other allocations + KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs + CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates + + // Get search/init grid dims + int search_block_size = INIT_KERNEL_THREADS; + int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; + +#if (CUB_PTX_ARCH == 0) + // Init textures + if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; +#endif + + if (search_grid_size < sm_count) +// if (num_merge_tiles < spmv_sm_occupancy * sm_count) + { + // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords + d_tile_coordinates = NULL; + } + else + { + // Use separate search kernel if we have enough spmv tiles to saturate the device + + // Log spmv_search_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", + search_grid_size, search_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_search_kernel<<>>( + num_merge_tiles, + d_tile_coordinates, + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log spmv_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); + + // Invoke spmv_kernel + spmv_kernel<<>>( + spmv_params, + d_tile_coordinates, + d_tile_carry_pairs, + num_merge_tiles, + tile_state, + num_segment_fixup_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Run reduce-by-key fixup if necessary + if (num_merge_tiles > 1) + { + // Log segment_fixup_kernel configuration + if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); + + // Invoke segment_fixup_kernel + 
segment_fixup_kernel<<>>( + d_tile_carry_pairs, + spmv_params.d_vector_y, + num_merge_tiles, + num_segment_fixup_tiles, + tile_state); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + +#if (CUB_PTX_ARCH == 0) + // Free textures + if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break; +#endif + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig spmv_config, segment_fixup_config; + InitConfigs(ptx_version, spmv_config, segment_fixup_config); + + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmv1ColKernel, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/grid/grid_barrier.cuh b/dnn/src/cuda/cub/grid/grid_barrier.cuh new file mode 100644 index 00000000..461fb442 --- /dev/null +++ b/dnn/src/cuda/cub/grid/grid_barrier.cuh @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ + +#pragma once + +#include "../util_debug.cuh" +#include "../util_namespace.cuh" +#include "../thread/thread_load.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ +class GridBarrier +{ +protected : + + typedef unsigned int SyncFlag; + + // Counters in global device memory + SyncFlag* d_sync; + +public: + + /** + * Constructor + */ + GridBarrier() : d_sync(NULL) {} + + + /** + * Synchronize + */ + __device__ __forceinline__ void Sync() const + { + volatile SyncFlag *d_vol_sync = d_sync; + + // Threadfence and syncthreads to make sure global writes are visible before + // thread-0 reports in with its sync counter + __threadfence(); + CTA_SYNC(); + + if (blockIdx.x == 0) + { + // Report in ourselves + if (threadIdx.x == 0) + { + d_vol_sync[blockIdx.x] = 1; + } + + CTA_SYNC(); + + // Wait for everyone else to report in + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + while (ThreadLoad(d_sync + peer_block) == 0) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + + // Let everyone know it's safe to proceed + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + d_vol_sync[peer_block] = 0; + } + } + else + { + if (threadIdx.x == 0) + { + // Report in + d_vol_sync[blockIdx.x] = 1; + + // Wait for acknowledgment + while (ThreadLoad(d_sync + blockIdx.x) == 1) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + } + } +}; + + +/** + * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. + * + * Uses RAII for lifetime, i.e., device resources are reclaimed when + * the destructor is called. 
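+ *
+ * \par
+ * A minimal usage sketch (the kernel name and launch configuration below are
+ * illustrative only, not part of this header): the host sizes the barrier for
+ * the launch via Setup(), and every thread block calls Sync() at the
+ * rendezvous point.  The software barrier is only safe when all blocks of the
+ * launch are co-resident on the device.
+ * \code
+ * __global__ void SweepKernel(cub::GridBarrier barrier)   // illustrative kernel
+ * {
+ *     // ... phase 1: each block produces its partial results ...
+ *     barrier.Sync();      // all blocks rendezvous here before phase 2
+ *     // ... phase 2: safe to consume other blocks' phase-1 output ...
+ * }
+ *
+ * cub::GridBarrierLifetime barrier;
+ * int sweep_grid_size = 120;               // assumed small enough for co-residency
+ * barrier.Setup(sweep_grid_size);          // lazily allocates and zeroes the sync counters
+ * SweepKernel<<<sweep_grid_size, 256>>>(barrier);
+ * \endcode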
+ */ +class GridBarrierLifetime : public GridBarrier +{ +protected: + + // Number of bytes backed by d_sync + size_t sync_bytes; + +public: + + /** + * Constructor + */ + GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} + + + /** + * DeviceFrees and resets the progress counters + */ + cudaError_t HostReset() + { + cudaError_t retval = cudaSuccess; + if (d_sync) + { + CubDebug(retval = cudaFree(d_sync)); + d_sync = NULL; + } + sync_bytes = 0; + return retval; + } + + + /** + * Destructor + */ + virtual ~GridBarrierLifetime() + { + HostReset(); + } + + + /** + * Sets up the progress counters for the next kernel launch (lazily + * allocating and initializing them if necessary) + */ + cudaError_t Setup(int sweep_grid_size) + { + cudaError_t retval = cudaSuccess; + do { + size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); + if (new_sync_bytes > sync_bytes) + { + if (d_sync) + { + if (CubDebug(retval = cudaFree(d_sync))) break; + } + + sync_bytes = new_sync_bytes; + + // Allocate and initialize to zero + if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; + if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; + } + } while (0); + + return retval; + } +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/grid/grid_even_share.cuh b/dnn/src/cuda/cub/grid/grid_even_share.cuh new file mode 100644 index 00000000..f0b3a69a --- /dev/null +++ b/dnn/src/cuda/cub/grid/grid_even_share.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. 
Each thread block gets roughly the same number of fixed-size work units (grains). + */ + + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" +#include "grid_mapping.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridEvenShare is a descriptor utility for distributing input among + * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly + * the same number of input tiles. + * + * \par Overview + * Each thread block is assigned a consecutive sequence of input tiles. To help + * preserve alignment and eliminate the overhead of guarded loads for all but the + * last thread block, to GridEvenShare assigns one of three different amounts of + * work to a given thread block: "big", "normal", or "last". The "big" workloads + * are one scheduling grain larger than "normal". The "last" work unit for the + * last thread block may be partially-full if the input is not an even multiple of + * the scheduling grain size. + * + * \par + * Before invoking a child grid, a parent thread will typically construct an + * instance of GridEvenShare. The instance can be passed to child thread blocks + * which can initialize their per-thread block offsets using \p BlockInit(). + */ +template +struct GridEvenShare +{ +private: + + OffsetT total_tiles; + int big_shares; + OffsetT big_share_items; + OffsetT normal_share_items; + OffsetT normal_base_offset; + +public: + + /// Total number of input items + OffsetT num_items; + + /// Grid size in thread blocks + int grid_size; + + /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles + OffsetT block_offset; + + /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles + OffsetT block_end; + + /// Stride between input tiles + OffsetT block_stride; + + + /** + * \brief Constructor. + */ + __host__ __device__ __forceinline__ GridEvenShare() : + total_tiles(0), + big_shares(0), + big_share_items(0), + normal_share_items(0), + normal_base_offset(0), + num_items(0), + grid_size(0), + block_offset(0), + block_end(0), + block_stride(0) + {} + + + /** + * \brief Dispatch initializer. To be called prior prior to kernel launch. + */ + __host__ __device__ __forceinline__ void DispatchInit( + OffsetT num_items, ///< Total number of input items + int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) + int tile_items) ///< Number of data items per input tile + { + this->block_offset = num_items; // Initialize past-the-end + this->block_end = num_items; // Initialize past-the-end + this->num_items = num_items; + this->total_tiles = (num_items + tile_items - 1) / tile_items; + this->grid_size = CUB_MIN(total_tiles, max_grid_size); + OffsetT avg_tiles_per_block = total_tiles / grid_size; + this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks + this->normal_share_items = avg_tiles_per_block * tile_items; + this->normal_base_offset = big_shares * tile_items; + this->big_share_items = normal_share_items + tile_items; + } + + + /** + * \brief Initializes ranges for the specified thread block index. Specialized + * for a "raking" access pattern in which each thread block is assigned a + * consecutive sequence of input tiles. 
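+     *
+     * \par
+     * Worked example (illustrative numbers): with \p num_items = 1000,
+     * \p tile_items = 128 and \p max_grid_size = 3, DispatchInit() yields
+     * total_tiles = 8 and grid_size = 3; blocks 0 and 1 receive "big" shares
+     * of three tiles each, covering [0,384) and [384,768), while block 2
+     * receives the remaining [768,1000), whose final tile is only partially
+     * full.  The sketch below (kernel name is illustrative, and it assumes
+     * BlockInit() is instantiated with the same tile size given to
+     * DispatchInit()) shows the corresponding raking consumption loop:
+     * \code
+     * template <int TILE_ITEMS>
+     * __global__ void ConsumeKernel(cub::GridEvenShare<int> even_share)   // illustrative kernel
+     * {
+     *     even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_RAKE>();
+     *     for (int tile_offset = even_share.block_offset;
+     *          tile_offset < even_share.block_end;
+     *          tile_offset += even_share.block_stride)
+     *     {
+     *         // Consume the tile starting at tile_offset; the last tile of the
+     *         // last share may be partially full, so guard loads against num_items.
+     *     }
+     * }
+     * \endcode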
+ */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = TILE_ITEMS; + if (block_id < big_shares) + { + // This thread block gets a big share of grains (avg_tiles_per_block + 1) + block_offset = (block_id * big_share_items); + block_end = block_offset + big_share_items; + } + else if (block_id < total_tiles) + { + // This thread block gets a normal share of grains (avg_tiles_per_block) + block_offset = normal_base_offset + (block_id * normal_share_items); + block_end = CUB_MIN(num_items, block_offset + normal_share_items); + } + // Else default past-the-end + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. + */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = grid_size * TILE_ITEMS; + block_offset = (block_id * TILE_ITEMS); + block_end = num_items; + } + + + /** + * \brief Block-initialization, specialized for "strip mining" access + * pattern in which the input tiles assigned to each thread block are + * separated by a stride equal to the the extent of the grid. + */ + template < + int TILE_ITEMS, + GridMappingStrategy STRATEGY> + __device__ __forceinline__ void BlockInit() + { + BlockInit(blockIdx.x, Int2Type()); + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. + */ + template + __device__ __forceinline__ void BlockInit( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + this->block_offset = block_offset; + this->block_end = block_end; + this->block_stride = TILE_ITEMS; + } + + +}; + + + + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/grid/grid_mapping.cuh b/dnn/src/cuda/cub/grid/grid_mapping.cuh new file mode 100644 index 00000000..f0e9fded --- /dev/null +++ b/dnn/src/cuda/cub/grid/grid_mapping.cuh @@ -0,0 +1,113 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/****************************************************************************** + * Mapping policies + *****************************************************************************/ + + +/** + * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ +enum GridMappingStrategy +{ + /** + * \brief An a "raking" access pattern in which each thread block is + * assigned a consecutive sequence of input tiles + * + * \par Overview + * The input is evenly partitioned into \p p segments, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each segment is comprised of + * consecutive tiles, where a tile is a small, constant-sized unit of input + * to be processed to completion before the thread block terminates or + * obtains more work. The kernel invokes \p p thread blocks, each + * of which iteratively consumes a segment of n/p elements + * in tile-size increments. + */ + GRID_MAPPING_RAKE, + + /** + * \brief An a "strip mining" access pattern in which the input tiles assigned + * to each thread block are separated by a stride equal to the the extent of + * the grid. + * + * \par Overview + * The input is evenly partitioned into \p p sets, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each set is comprised of + * data tiles separated by stride \p tiles, where a tile is a small, + * constant-sized unit of input to be processed to completion before the + * thread block terminates or obtains more work. The kernel invokes \p p + * thread blocks, each of which iteratively consumes a segment of + * n/p elements in tile-size increments. + */ + GRID_MAPPING_STRIP_MINE, + + /** + * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. + * + * \par Overview + * The input is treated as a queue to be dynamically consumed by a grid of + * thread blocks. Work is atomically dequeued in tiles, where a tile is a + * unit of input to be processed to completion before the thread block + * terminates or obtains more work. The grid size \p p is constant, + * loosely corresponding to the number of thread blocks that may actively + * reside on the target device. 
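+     *
+     * \par
+     * For contrast (illustrative numbers): with \p p = 4 thread blocks and 16
+     * input tiles, GRID_MAPPING_RAKE assigns block 0 the consecutive tiles
+     * 0..3, GRID_MAPPING_STRIP_MINE assigns it the strided tiles 0, 4, 8, 12,
+     * and GRID_MAPPING_DYNAMIC assigns it whichever tiles it happens to
+     * dequeue at runtime.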
+ */ + GRID_MAPPING_DYNAMIC, +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/grid/grid_queue.cuh b/dnn/src/cuda/cub/grid/grid_queue.cuh new file mode 100644 index 00000000..9615b14d --- /dev/null +++ b/dnn/src/cuda/cub/grid/grid_queue.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridQueue is a descriptor utility for dynamic queue management. + */ + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_debug.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridQueue is a descriptor utility for dynamic queue management. + * + * \par Overview + * GridQueue descriptors provides abstractions for "filling" or + * "draining" globally-shared vectors. + * + * \par + * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, + * returning a unique offset for the calling thread to write its items. + * The GridQueue maintains the total "fill-size". The fill counter must be reset + * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that + * will be filling. + * + * \par + * Similarly, a "draining" GridQueue works by works by atomically-incrementing a + * zero-initialized counter, returning a unique offset for the calling thread to + * read its items. Threads can safely drain until the array's logical fill-size is + * exceeded. The drain counter must be reset using GridQueue::ResetDrain or + * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that + * will be filling. 
(For dynamic work distribution of existing data, the corresponding fill-size + * is simply the number of elements in the array.) + * + * \par + * Iterative work management can be implemented simply with a pair of flip-flopping + * work buffers, each with an associated set of fill and drain GridQueue descriptors. + * + * \tparam OffsetT Signed integer type for global offsets + */ +template +class GridQueue +{ +private: + + /// Counter indices + enum + { + FILL = 0, + DRAIN = 1, + }; + + /// Pair of counters + OffsetT *d_counters; + +public: + + /// Returns the device allocation size in bytes needed to construct a GridQueue instance + __host__ __device__ __forceinline__ + static size_t AllocationSize() + { + return sizeof(OffsetT) * 2; + } + + + /// Constructs an invalid GridQueue descriptor + __host__ __device__ __forceinline__ GridQueue() + : + d_counters(NULL) + {} + + + /// Constructs a GridQueue descriptor around the device storage allocation + __host__ __device__ __forceinline__ GridQueue( + void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). + : + d_counters((OffsetT*) d_storage) + {} + + + /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( + OffsetT fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = fill_size; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + OffsetT counters[2]; + counters[FILL] = fill_size; + counters[DRAIN] = 0; + return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); +#endif + } + + + /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. + __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// Returns the fill-size established by the parent or by the previous kernel. + __host__ __device__ __forceinline__ cudaError_t FillSize( + OffsetT &fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + fill_size = d_counters[FILL]; + return cudaSuccess; +#else + return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); +#endif + } + + + /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. + __device__ __forceinline__ OffsetT Drain(OffsetT num_items) + { + return atomicAdd(d_counters + DRAIN, num_items); + } + + + /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. 
+ __device__ __forceinline__ OffsetT Fill(OffsetT num_items) + { + return atomicAdd(d_counters + FILL, num_items); + } +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Reset grid queue (call with 1 block of 1 thread) + */ +template +__global__ void FillAndResetDrainKernel( + GridQueue grid_queue, + OffsetT num_items) +{ + grid_queue.FillAndResetDrain(num_items); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/dnn/src/cuda/cub/host/mutex.cuh b/dnn/src/cuda/cub/host/mutex.cuh new file mode 100644 index 00000000..ff7ec90d --- /dev/null +++ b/dnn/src/cuda/cub/host/mutex.cuh @@ -0,0 +1,171 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
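As a usage illustration (not part of the vendored CUB sources): a minimal sketch of the fill/drain protocol of the GridQueue descriptor above. The kernel name, launch shape, and problem size are hypothetical; the vendored cub/ directory is assumed to be on the include path and the file compiled with nvcc.

#include <cub/grid/grid_queue.cuh>

// Hypothetical kernel: every thread reserves one item index from the drain
// counter and processes it.
__global__ void DrainKernel(cub::GridQueue<int> queue, const float *d_in, float *d_out)
{
    int idx = queue.Drain(1);       // atomicAdd on the DRAIN counter
    d_out[idx] = 2.0f * d_in[idx];  // arbitrary per-item work
}

int main()
{
    const int num_items = 1024;
    float *d_in, *d_out;
    cudaMalloc(&d_in, num_items * sizeof(float));   // input left uninitialized for brevity
    cudaMalloc(&d_out, num_items * sizeof(float));

    // Back the descriptor with the two-counter device allocation it requires.
    void *d_storage;
    cudaMalloc(&d_storage, cub::GridQueue<int>::AllocationSize());
    cub::GridQueue<int> queue(d_storage);

    // The host sets the fill-size and zeroes the drain counter before the
    // draining kernel runs, as the documentation above requires.
    queue.FillAndResetDrain(num_items);

    DrainKernel<<<num_items / 256, 256>>>(queue, d_in, d_out);
    cudaDeviceSynchronize();

    cudaFree(d_storage);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}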
+ * + ******************************************************************************/ + +/** + * \file + * Simple portable mutex + */ + + +#pragma once + +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + #include +#else + #if defined(_WIN32) || defined(_WIN64) + #include + + #define WIN32_LEAN_AND_MEAN + #define NOMINMAX + #include + #undef WIN32_LEAN_AND_MEAN + #undef NOMINMAX + + /** + * Compiler read/write barrier + */ + #pragma intrinsic(_ReadWriteBarrier) + + #endif +#endif + +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Simple portable mutex + * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) + * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) + */ +struct Mutex +{ +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + + std::mutex mtx; + + void Lock() + { + mtx.lock(); + } + + void Unlock() + { + mtx.unlock(); + } + + void TryLock() + { + mtx.try_lock(); + } + +#else //__cplusplus > 199711L + + #if defined(_MSC_VER) + + // Microsoft VC++ + typedef long Spinlock; + + #else + + // GNU g++ + typedef int Spinlock; + + /** + * Compiler read/write barrier + */ + __forceinline__ void _ReadWriteBarrier() + { + __sync_synchronize(); + } + + /** + * Atomic exchange + */ + __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) + { + // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier + _ReadWriteBarrier(); + return __sync_lock_test_and_set(Target, Value); + } + + /** + * Pause instruction to prevent excess processor bus usage + */ + __forceinline__ void YieldProcessor() + { + } + + #endif // defined(_MSC_VER) + + /// Lock member + volatile Spinlock lock; + + /** + * Constructor + */ + Mutex() : lock(0) {} + + /** + * Return when the specified spinlock has been acquired + */ + __forceinline__ void Lock() + { + while (1) + { + if (!_InterlockedExchange(&lock, 1)) return; + while (lock) YieldProcessor(); + } + } + + + /** + * Release the specified spinlock + */ + __forceinline__ void Unlock() + { + _ReadWriteBarrier(); + lock = 0; + } + +#endif // __cplusplus > 199711L + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/dnn/src/cuda/cub/iterator/arg_index_input_iterator.cuh b/dnn/src/cuda/cub/iterator/arg_index_input_iterator.cuh new file mode 100644 index 00000000..95a84a57 --- /dev/null +++ b/dnn/src/cuda/cub/iterator/arg_index_input_iterator.cuh @@ -0,0 +1,259 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
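A host-only sketch (again not from the vendored sources) of the Mutex wrapper above: under C++11 it forwards Lock()/Unlock() to std::mutex, so it can guard shared host-side state. The worker function and counter are hypothetical; the include path assumes the vendored cub/ tree is visible to the compiler.

#include <cstdio>
#include <thread>
#include <cub/host/mutex.cuh>

static cub::Mutex g_mutex;    // wraps std::mutex when C++11 is available
static long g_counter = 0;

static void Worker()
{
    for (int i = 0; i < 100000; ++i)
    {
        g_mutex.Lock();
        ++g_counter;          // critical section
        g_mutex.Unlock();
    }
}

int main()
{
    std::thread a(Worker), b(Worker);
    a.join();
    b.join();
    std::printf("counter = %ld\n", g_counter);   // expected: 200000
    return 0;
}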
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#include + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). + * + * \par Overview + * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. + * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose + * \p key field is \p i and whose \p value field is itr[i]. + * - Can be used with any data type. + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. 
+ * + * \par Snippet + * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto + * dereference an array of doubles + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::ArgIndexInputIterator itr(d_in); + * + * // Within device code: + * typedef typename cub::ArgIndexInputIterator::value_type Tuple; + * Tuple item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 8.0 @ 0 + * + * itr = itr + 6; + * item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 9.0 @ 6 + * + * \endcode + * + * \tparam InputIteratorT The value type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) + */ +template < + typename InputIteratorT, + typename OffsetT = ptrdiff_t, + typename OutputValueT = typename std::iterator_traits::value_type> +class ArgIndexInputIterator +{ +public: + + // Required iterator traits + typedef ArgIndexInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef KeyValuePair value_type; ///< The type of the element the iterator can point to + typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to + typedef value_type reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + InputIteratorT itr; + difference_type offset; + +public: + + /// Constructor + __host__ __device__ __forceinline__ ArgIndexInputIterator( + InputIteratorT itr, ///< Input iterator to wrap + difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator + : + itr(itr), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + value_type retval; + retval.value = itr[offset]; + retval.key = offset; + return retval; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(itr, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(itr, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ 
self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((itr == rhs.itr) && (offset == rhs.offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((itr != rhs.itr) || (offset != rhs.offset)); + } + + /// Normalize + __host__ __device__ __forceinline__ void normalize() + { + itr += offset; + offset = 0; + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/iterator/cache_modified_input_iterator.cuh b/dnn/src/cuda/cub/iterator/cache_modified_input_iterator.cuh new file mode 100644 index 00000000..b4ad91e2 --- /dev/null +++ b/dnn/src/cuda/cub/iterator/cache_modified_input_iterator.cuh @@ -0,0 +1,240 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
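To make the documented snippet concrete, here is a self-contained sketch (hypothetical kernel name PrintPair, hypothetical 7-element input) that dereferences the ArgIndexInputIterator defined above on the device and prints the resulting index/value pair.

#include <cstdio>
#include <cub/iterator/arg_index_input_iterator.cuh>

__global__ void PrintPair(cub::ArgIndexInputIterator<const double*, int> itr)
{
    typedef cub::ArgIndexInputIterator<const double*, int>::value_type Pair;
    Pair p = itr[6];                        // key = 6, value = d_in[6]
    printf("%f @ %d\n", p.value, p.key);    // prints "9.000000 @ 6" for the data below
}

int main()
{
    const double h_in[7] = {8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0};
    double *d_in;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    cub::ArgIndexInputIterator<const double*, int> itr(d_in);   // host-side construction
    PrintPair<<<1, 1>>>(itr);
    cudaDeviceSynchronize();

    cudaFree(d_in);
    return 0;
}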
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. + * + * \par Overview + * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by reading \p ValueType values through loads modified by \p MODIFIER. + * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", + * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto + * dereference a device array of double using the "ldg" PTX load modifier + * (i.e., load values through texture cache). + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::CacheModifiedInputIterator itr(d_in); + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * \endcode + * + * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedInputIterator +{ +public: + + // Required iterator traits + typedef CacheModifiedInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + +public: + + /// Wrapped native pointer + ValueType* ptr; + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedInputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) 
+ {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __device__ __forceinline__ reference operator*() const + { + return ThreadLoad(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __device__ __forceinline__ reference operator[](Distance n) const + { + return ThreadLoad(ptr + n); + } + + /// Structure dereference + __device__ __forceinline__ pointer operator->() + { + return &ThreadLoad(ptr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/iterator/cache_modified_output_iterator.cuh b/dnn/src/cuda/cub/iterator/cache_modified_output_iterator.cuh new file mode 100644 index 00000000..c3e3321d --- /dev/null +++ b/dnn/src/cuda/cub/iterator/cache_modified_output_iterator.cuh @@ -0,0 +1,254 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
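A short sketch of the wrapper above (hypothetical kernel ScaleKernel and launcher LaunchScale): reads are issued through the LOAD_LDG cache modifier simply by substituting the iterator for a raw pointer; only the dereference itself must happen in device code.

#include <cub/iterator/cache_modified_input_iterator.cuh>

__global__ void ScaleKernel(
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, float> in,   // dereference on device only
    float *out,
    int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = 2.0f * in[i];   // each read goes through ThreadLoad<LOAD_LDG>
}

void LaunchScale(const float *d_in, float *d_out, int n)
{
    // Constructing and passing the iterator on the host is fine.
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, float> in(d_in);
    ScaleKernel<<<(n + 255) / 256, 256>>>(in, d_out, n);
}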
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. + * + * \par Overview + * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by writing \p ValueType values through stores modified by \p MODIFIER. + * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", + * "STORE_CG", "STORE_CS", "STORE_WT", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to + * dereference a device array of doubles using the "wt" PTX load modifier + * (i.e., write-through to system memory). 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_out; // e.g., [, , , , , , ] + * + * // Create an iterator wrapper + * cub::CacheModifiedOutputIterator itr(d_out); + * + * // Within device code: + * itr[0] = 8.0; + * itr[1] = 66.0; + * itr[55] = 24.0; + * + * \endcode + * + * \par Usage Considerations + * - Can only be dereferenced within device code + * + * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheStoreModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedOutputIterator +{ +private: + + // Proxy object + struct Reference + { + ValueType* ptr; + + /// Constructor + __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} + + /// Assignment + __device__ __forceinline__ ValueType operator =(ValueType val) + { + ThreadStore(ptr, val); + return val; + } + }; + +public: + + // Required iterator traits + typedef CacheModifiedOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef Reference reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType* ptr; + +public: + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedOutputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return Reference(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + 
{ + return Reference(ptr + n); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/iterator/constant_input_iterator.cuh b/dnn/src/cuda/cub/iterator/constant_input_iterator.cuh new file mode 100644 index 00000000..1e0a9104 --- /dev/null +++ b/dnn/src/cuda/cub/iterator/constant_input_iterator.cuh @@ -0,0 +1,235 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input generator for dereferencing a sequence of homogeneous values + * + * \par Overview + * - Read references to a ConstantInputIteratorTiterator always return the supplied constant + * of type \p ValueType. + * - Can be used with any data type. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. + * - Compatible with Thrust API v1.7 or newer. 
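The output-side counterpart works the same way. A minimal sketch (hypothetical kernel FillKernel and launcher LaunchFill) using the STORE_WT write-through modifier mentioned in the overview above:

#include <cub/iterator/cache_modified_output_iterator.cuh>

__global__ void FillKernel(
    cub::CacheModifiedOutputIterator<cub::STORE_WT, float> out,
    int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = float(i);   // the assignment is routed through ThreadStore<STORE_WT>
}

void LaunchFill(float *d_out, int n)
{
    cub::CacheModifiedOutputIterator<cub::STORE_WT, float> out(d_out);
    FillKernel<<<(n + 255) / 256, 256>>>(out, n);
}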
+ * + * \par Snippet + * The code snippet below illustrates the use of \p ConstantInputIteratorTto + * dereference a sequence of homogeneous doubles. + * \par + * \code + * #include // or equivalently + * + * cub::ConstantInputIterator itr(5.0); + * + * printf("%f\n", itr[0]); // 5.0 + * printf("%f\n", itr[1]); // 5.0 + * printf("%f\n", itr[2]); // 5.0 + * printf("%f\n", itr[50]); // 5.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class ConstantInputIterator +{ +public: + + // Required iterator traits + typedef ConstantInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + OffsetT offset; +#ifdef _WIN32 + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ ConstantInputIterator( + ValueType val, ///< Starting value for the iterator instance to report + OffsetT offset = 0) ///< Base offset + : + val(val), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const + { + return val; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const 
self_type& rhs) + { + return (offset == rhs.offset) && ((val == rhs.val)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset) || (val!= rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "," << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/iterator/counting_input_iterator.cuh b/dnn/src/cuda/cub/iterator/counting_input_iterator.cuh new file mode 100644 index 00000000..7f49348d --- /dev/null +++ b/dnn/src/cuda/cub/iterator/counting_input_iterator.cuh @@ -0,0 +1,228 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + +/** + * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. + * + * \par Overview + * - After initializing a CountingInputIteratorTto a certain integer \p base, read references + * at \p offset will return the value \p base + \p offset. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. + * - Compatible with Thrust API v1.7 or newer. 
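A tiny sketch of the ConstantInputIterator described above; since it can be dereferenced on the host as well, a plain loop is enough to show that every offset reports the same constant (compile as a .cu file so the CUDA qualifiers are defined).

#include <cstdio>
#include <cub/iterator/constant_input_iterator.cuh>

int main()
{
    cub::ConstantInputIterator<double> ones(1.0);   // reports 1.0 at every offset
    double sum = 0.0;
    for (int i = 0; i < 4; ++i)
        sum += ones[i];
    std::printf("%f\n", sum);    // 4.000000
    return 0;
}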
+ * + * \par Snippet + * The code snippet below illustrates the use of \p CountingInputIteratorTto + * dereference a sequence of incrementing integers. + * \par + * \code + * #include // or equivalently + * + * cub::CountingInputIterator itr(5); + * + * printf("%d\n", itr[0]); // 5 + * printf("%d\n", itr[1]); // 6 + * printf("%d\n", itr[2]); // 7 + * printf("%d\n", itr[50]); // 55 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class CountingInputIterator +{ +public: + + // Required iterator traits + typedef CountingInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + +public: + + /// Constructor + __host__ __device__ __forceinline__ CountingInputIterator( + const ValueType &val) ///< Starting value for the iterator instance to report + : + val(val) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + val++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + val++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val + (ValueType) n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + val += (ValueType) n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val - (ValueType) n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + val -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return (difference_type) (val - other.val); + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return val + (ValueType) n; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (val == rhs.val); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (val != rhs.val); + } + + /// ostream 
operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "]"; + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/iterator/discard_output_iterator.cuh b/dnn/src/cuda/cub/iterator/discard_output_iterator.cuh new file mode 100644 index 00000000..28473e5f --- /dev/null +++ b/dnn/src/cuda/cub/iterator/discard_output_iterator.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
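Similarly, a minimal sketch of the CountingInputIterator defined above: it synthesizes the sequence base, base+1, ... on the fly, so it can stand in for an explicit index array without allocating one.

#include <cstdio>
#include <cub/iterator/counting_input_iterator.cuh>

int main()
{
    cub::CountingInputIterator<int> indices(100);   // base value 100
    std::printf("%d %d %d\n", indices[0], indices[1], indices[25]);   // 100 101 125
    return 0;
}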
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A discard iterator + */ +template +class DiscardOutputIterator +{ +public: + + // Required iterator traits + typedef DiscardOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef void reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + OffsetT offset; + +#if defined(_WIN32) || !defined(_WIN64) + // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ DiscardOutputIterator( + OffsetT offset = 0) ///< Base offset + : + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ self_type& operator*() + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ self_type& operator[](Distance n) + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return; + } + + /// Assignment to self (no-op) + __host__ __device__ __forceinline__ void operator=(self_type const& 
other) + { + offset = other.offset; + } + + /// Assignment to anything else (no-op) + template + __host__ __device__ __forceinline__ void operator=(T const&) + {} + + /// Cast to void* operator + __host__ __device__ __forceinline__ operator void*() const { return NULL; } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (offset == rhs.offset); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/iterator/tex_obj_input_iterator.cuh b/dnn/src/cuda/cub/iterator/tex_obj_input_iterator.cuh new file mode 100644 index 00000000..b99103ec --- /dev/null +++ b/dnn/src/cuda/cub/iterator/tex_obj_input_iterator.cuh @@ -0,0 +1,310 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. 
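A brief sketch of the DiscardOutputIterator above (hypothetical kernel WriteSomewhere and launcher LaunchDiscard): writes through it compile but are dropped, which makes it a convenient sink when only one of an algorithm's outputs is actually needed.

#include <cub/iterator/discard_output_iterator.cuh>

__global__ void WriteSomewhere(cub::DiscardOutputIterator<int> sink, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        sink[i] = i;   // accepted and silently discarded
}

void LaunchDiscard(int n)
{
    cub::DiscardOutputIterator<int> sink;
    WriteSomewhere<<<(n + 255) / 256, 256>>>(sink, n);
}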
Uses newer Kepler-style texture objects. + * + * \par Overview + * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be + * created by the host thread, but can be used by any descendant kernel. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIteratorTto + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexObjInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... + * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + typename OffsetT = ptrdiff_t> +class TexObjInputIterator +{ +public: + + // Required iterator traits + typedef TexObjInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + // Largest texture word we can use in device + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per T + enum { + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + +private: + + T* ptr; + difference_type tex_offset; + cudaTextureObject_t tex_obj; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TexObjInputIterator() + : + ptr(NULL), + tex_offset(0), + tex_obj(0) + {} + + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + { + this->ptr = const_cast::Type *>(ptr); + this->tex_offset = tex_offset; + + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); + cudaResourceDesc res_desc; + cudaTextureDesc tex_desc; + 
memset(&res_desc, 0, sizeof(cudaResourceDesc)); + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = this->ptr; + res_desc.res.linear.desc = channel_desc; + res_desc.res.linear.sizeInBytes = bytes; + tex_desc.readMode = cudaReadModeElementType; + return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return cudaDestroyTextureObject(tex_obj); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Move array of uninitialized words, then alias and assign to return value + TextureWord words[TEXTURE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch( + tex_obj, + (tex_offset * TEXTURE_MULTIPLE) + i); + } + + // Load from words + return *reinterpret_cast(words); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/iterator/tex_ref_input_iterator.cuh b/dnn/src/cuda/cub/iterator/tex_ref_input_iterator.cuh new file mode 100644 index 00000000..95d0ffbc --- /dev/null +++ b/dnn/src/cuda/cub/iterator/tex_ref_input_iterator.cuh @@ -0,0 +1,374 @@ +/****************************************************************************** + * Copyright 
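Rounding out the TexObjInputIterator above, a sketch (hypothetical kernel ReadThroughTex and launcher Run) of the bind / read / unbind sequence from the documented snippet; d_in is assumed to come from cudaMalloc so it satisfies the texture alignment requirement.

#include <cub/iterator/tex_obj_input_iterator.cuh>

__global__ void ReadThroughTex(cub::TexObjInputIterator<float> itr, float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = itr[i];   // fetched through the texture object on the device
}

void Run(const float *d_in, float *d_out, int n)
{
    cub::TexObjInputIterator<float> itr;
    itr.BindTexture(d_in, n * sizeof(float));   // host-side bind
    ReadThroughTex<<<(n + 255) / 256, 256>>>(itr, d_out, n);
    cudaDeviceSynchronize();
    itr.UnbindTexture();                        // host-side unbind
}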
(c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE) // This iterator is compatible with CUDA 5.5 and newer + +#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Static file-scope Tesla/Fermi-style texture references + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +// Anonymous namespace +namespace { + +/// Global texture reference specialized by type +template +struct IteratorTexRef +{ + /// And by unique ID + template + struct TexId + { + // Largest texture word we can use in device + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per T + enum { + DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord), + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + + // Texture reference type + typedef texture TexRef; + + // Texture reference + static TexRef ref; + + /// Bind texture + static cudaError_t BindTexture(void *d_in, size_t &offset) + { + if (d_in) + { + cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); + ref.channelDesc = tex_desc; + return (CubDebug(cudaBindTexture(&offset, ref, d_in))); + } + + return cudaSuccess; + } + + /// Unbind texture + static cudaError_t UnbindTexture() + { + return 
CubDebug(cudaUnbindTexture(ref)); + } + + /// Fetch element + template + static __device__ __forceinline__ T Fetch(Distance tex_offset) + { + DeviceWord temp[DEVICE_MULTIPLE]; + TextureWord *words = reinterpret_cast(temp); + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i); + } + + return reinterpret_cast(temp); + } + }; +}; + +// Texture reference definitions +template +template +typename IteratorTexRef::template TexId::TexRef IteratorTexRef::template TexId::ref = 0; + + +} // Anonymous namespace + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references. + * + * \par Overview + * - TexRefInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture + * reference. Only one TexRefInputIteratorTinstance can be bound at any given time for a + * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host + * thread, and (4) compilation .o unit. + * - With regard to nested/dynamic parallelism, TexRefInputIteratorTiterators may only be + * created by the host thread and used by a top-level kernel (i.e. the one which is launched + * from the host). + * - Compatible with Thrust API v1.7 or newer. + * - Compatible with CUDA toolkit v5.5 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIteratorTto + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexRefInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... 
+ * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + int UNIQUE_ID, + typename OffsetT = ptrdiff_t> +class TexRefInputIterator +{ +public: + + // Required iterator traits + typedef TexRefInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + T* ptr; + difference_type tex_offset; + + // Texture reference wrapper (old Tesla/Fermi-style textures) + typedef typename IteratorTexRef::template TexId TexId; + +public: +/* + /// Constructor + __host__ __device__ __forceinline__ TexRefInputIterator() + : + ptr(NULL), + tex_offset(0) + {} +*/ + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + { + this->ptr = const_cast::Type *>(ptr); + size_t offset; + cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset); + this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); + return retval; + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return TexId::UnbindTexture(); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Use the texture reference + return TexId::Fetch(tex_offset); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ 
__device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + +#endif // CUDA_VERSION diff --git a/dnn/src/cuda/cub/iterator/transform_input_iterator.cuh b/dnn/src/cuda/cub/iterator/transform_input_iterator.cuh new file mode 100644 index 00000000..dad1f500 --- /dev/null +++ b/dnn/src/cuda/cub/iterator/transform_input_iterator.cuh @@ -0,0 +1,252 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
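The UNIQUE_ID parameter of the TexRefInputIterator defined above names a distinct static texture reference, so two instances of the same value type can be bound independently in one translation unit. A minimal sketch of that pattern, assuming a toolkit where legacy texture references are still available; the include path, pointer names, and sizes are illustrative, not part of this file.

    #include <cub/iterator/tex_ref_input_iterator.cuh>   // assumed include path
    #include <cuda_runtime.h>

    void BindTwoArrays()
    {
        float *d_a = NULL, *d_b = NULL;
        cudaMalloc(&d_a, 1024 * sizeof(float));
        cudaMalloc(&d_b, 1024 * sizeof(float));

        // Distinct UNIQUE_IDs select distinct underlying texture references.
        cub::TexRefInputIterator<float, 0> itr_a;
        cub::TexRefInputIterator<float, 1> itr_b;
        itr_a.BindTexture(d_a, 1024 * sizeof(float));
        itr_b.BindTexture(d_b, 1024 * sizeof(float));

        // ... launch kernels that read itr_a[i] and itr_b[i] through texture cache ...

        itr_a.UnbindTexture();
        itr_b.UnbindTexture();
        cudaFree(d_a);
        cudaFree(d_b);
    }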
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for transforming dereferenced values. + * + * \par Overview + * - TransformInputIteratorTwraps a unary conversion functor of type \p + * ConversionOp and a random-access input iterator of type InputIteratorT, + * using the former to produce references of type \p ValueType from the latter. + * - Can be used with any data type. + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TransformInputIteratorTto + * dereference an array of integers, tripling the values and converting them to doubles. + * \par + * \code + * #include // or equivalently + * + * // Functor for tripling integer values and converting to doubles + * struct TripleDoubler + * { + * __host__ __device__ __forceinline__ + * double operator()(const int &a) const { + * return double(a * 3); + * } + * }; + * + * // Declare, allocate, and initialize a device array + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * TripleDoubler conversion_op; + * + * // Create an iterator wrapper + * cub::TransformInputIterator itr(d_in, conversion_op); + * + * // Within device code: + * printf("%f\n", itr[0]); // 24.0 + * printf("%f\n", itr[1]); // 18.0 + * printf("%f\n", itr[6]); // 27.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). 
+ * \tparam InputIteratorT The type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * + */ +template < + typename ValueType, + typename ConversionOp, + typename InputIteratorT, + typename OffsetT = ptrdiff_t> +class TransformInputIterator +{ +public: + + // Required iterator traits + typedef TransformInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ConversionOp conversion_op; + InputIteratorT input_itr; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TransformInputIterator( + InputIteratorT input_itr, ///< Input iterator to wrap + ConversionOp conversion_op) ///< Conversion functor to wrap + : + conversion_op(conversion_op), + input_itr(input_itr) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + input_itr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + input_itr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return conversion_op(*input_itr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(input_itr + n, conversion_op); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + input_itr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(input_itr - n, conversion_op); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + input_itr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return input_itr - other.input_itr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return conversion_op(input_itr[n]); + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &conversion_op(*input_itr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (input_itr == rhs.input_itr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (input_itr != rhs.input_itr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + + 
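To make the TransformInputIterator usage above concrete, here is a self-contained sketch with all template arguments spelled out: a raw int pointer is wrapped with a conversion functor and the wrapped iterator is fed to a device-wide reduction from elsewhere in CUB. The include path and the SumTripled wrapper are illustrative assumptions.

    #include <cub/cub.cuh>   // assumed include path for this vendored copy

    // Functor for tripling integer values and converting to doubles
    struct TripleDoubler
    {
        __host__ __device__ __forceinline__
        double operator()(const int &a) const { return double(a * 3); }
    };

    cudaError_t SumTripled(const int *d_in, double *d_out, int num_items)
    {
        // Spell out ValueType, ConversionOp, and InputIteratorT explicitly.
        cub::TransformInputIterator<double, TripleDoubler, const int*>
            itr(d_in, TripleDoubler());

        // Standard two-phase temp-storage pattern for cub::DeviceReduce::Sum.
        void   *d_temp     = NULL;
        size_t  temp_bytes = 0;
        cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_out, num_items);
        cudaMalloc(&d_temp, temp_bytes);
        cudaError_t err = cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_out, num_items);
        cudaFree(d_temp);
        return err;
    }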
+/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/thread/thread_load.cuh b/dnn/src/cuda/cub/thread/thread_load.cuh new file mode 100644 index 00000000..b1ca412f --- /dev/null +++ b/dnn/src/cuda/cub/thread/thread_load.cuh @@ -0,0 +1,438 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for reading memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory load operations. + */ +enum CacheLoadModifier +{ + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit load using cache-global modifier: + * int *d_in; + * int val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 16-bit load using default modifier + * short *d_in; + * short val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 256-bit load using cache-volatile modifier + * double4 *d_in; + * double4 val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 96-bit load using cache-streaming modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); + * \endcode + * + * \tparam MODIFIER [inferred] CacheLoadModifier enumeration + * \tparam InputIteratorT [inferred] Input iterator type \iterator + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated load iteration (inductive case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const *ptr, T *vals) + { + vals[COUNT] = ThreadLoad(ptr + COUNT); + IterateThreadLoad::template Load(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) + { + vals[COUNT] = itr[COUNT]; + IterateThreadLoad::Dereference(itr, vals); + } +}; + + +/// Helper structure for templated load iteration (termination case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ uint4 ThreadLoad(uint4 const *ptr) \ + { \ + uint4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ + "=r"(retval.x), \ + "=r"(retval.y), \ + "=r"(retval.z), \ + "=r"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ + { \ + ulonglong2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ + "=l"(retval.x), \ + "=l"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ + { \ + ushort4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ + "=h"(retval.x), \ + "=h"(retval.y), \ + "=h"(retval.z), \ + "=h"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ + { \ + uint2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ + "=r"(retval.x), \ + "=r"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ + { \ + unsigned long long retval; \ + asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ + "=l"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint (4B) ThreadLoad 
specialization for the given Cache load modifier + */ +#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ + { \ + unsigned int retval; \ + asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ + "=r"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " ld."#ptx_modifier".u8 datum, [%1];" \ + " cvt.u16.u8 %0, datum;" \ + "}" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return (unsigned char) retval; \ + } + + +/** + * Define powers-of-two ThreadLoad specializations for the given Cache load modifier + */ +#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ + _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + + +/** + * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_LOAD_ALL(LOAD_CA, ca) + _CUB_LOAD_ALL(LOAD_CG, cg) + _CUB_LOAD_ALL(LOAD_CS, cs) + _CUB_LOAD_ALL(LOAD_CV, cv) +#else + _CUB_LOAD_ALL(LOAD_CA, global) + // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 + _CUB_LOAD_ALL(LOAD_CG, volatile.global) + _CUB_LOAD_ALL(LOAD_CS, global) + _CUB_LOAD_ALL(LOAD_CV, volatile.global) +#endif + +#if CUB_PTX_ARCH >= 350 + _CUB_LOAD_ALL(LOAD_LDG, global.nc) +#else + _CUB_LOAD_ALL(LOAD_LDG, global) +#endif + + +// Macro cleanup +#undef _CUB_LOAD_ALL +#undef _CUB_LOAD_1 +#undef _CUB_LOAD_2 +#undef _CUB_LOAD_4 +#undef _CUB_LOAD_8 +#undef _CUB_LOAD_16 + + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( + InputIteratorT itr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *itr; +} + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *ptr; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + T retval = *reinterpret_cast(ptr); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); +/* + VolatileWord 
words[VOLATILE_MULTIPLE]; + + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + + return *reinterpret_cast(words); +*/ + + T retval; + VolatileWord *words = reinterpret_cast(&retval); + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Apply tags for partial-specialization + return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadLoad definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T const *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + typedef typename UnitWord::DeviceWord DeviceWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( + reinterpret_cast(const_cast(ptr)), + words); + + return *reinterpret_cast(words); +} + + +/** + * ThreadLoad definition for generic modifiers + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) +{ + // Apply tags for partial-specialization + return ThreadLoad( + itr, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/thread/thread_operators.cuh b/dnn/src/cuda/cub/thread/thread_operators.cuh new file mode 100644 index 00000000..76cd800f --- /dev/null +++ b/dnn/src/cuda/cub/thread/thread_operators.cuh @@ -0,0 +1,317 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
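Referring back to the cache-modified ThreadLoad defined in thread_load.cuh above, a minimal device-side sketch of an explicitly modified load; the kernel name and indexing scheme are illustrative.

    #include <cub/thread/thread_load.cuh>   // assumed include path

    __global__ void CopyThroughTextureCache(const int *d_in, int *d_out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
        {
            // ld.global.nc on sm_35+, plain global load on older architectures.
            int val = cub::ThreadLoad<cub::LOAD_LDG>(d_in + i);
            d_out[i] = val;
        }
    }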
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Simple binary operator functor types + */ + +/****************************************************************************** + * Simple functor operators + ******************************************************************************/ + +#pragma once + +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \brief Default equality functor + */ +struct Equality +{ + /// Boolean equality operator, returns (a == b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a == b; + } +}; + + +/** + * \brief Default inequality functor + */ +struct Inequality +{ + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a != b; + } +}; + + +/** + * \brief Inequality functor (wraps equality functor) + */ +template +struct InequalityWrapper +{ + /// Wrapped equality operator + EqualityOp op; + + /// Constructor + __host__ __device__ __forceinline__ + InequalityWrapper(EqualityOp op) : op(op) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) + { + return !op(a, b); + } +}; + + +/** + * \brief Default sum functor + */ +struct Sum +{ + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return a + b; + } +}; + + +/** + * \brief Default max functor + */ +struct Max +{ + /// Boolean max operator, returns (a > b) ? a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MAX(a, b); + } +}; + + +/** + * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) + */ +struct ArgMax +{ + /// Boolean max operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default min functor + */ +struct Min +{ + /// Boolean min operator, returns (a < b) ? 
a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MIN(a, b); + } +}; + + +/** + * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) + */ +struct ArgMin +{ + /// Boolean min operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default cast functor + */ +template +struct CastOp +{ + /// Cast operator, returns (B) a + template + __host__ __device__ __forceinline__ B operator()(const A &a) const + { + return (B) a; + } +}; + + +/** + * \brief Binary operator wrapper for switching non-commutative scan arguments + */ +template +class SwizzleScanOp +{ +private: + + /// Wrapped scan operator + ScanOp scan_op; + +public: + + /// Constructor + __host__ __device__ __forceinline__ + SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} + + /// Switch the scan arguments + template + __host__ __device__ __forceinline__ + T operator()(const T &a, const T &b) + { + T _a(a); + T _b(b); + + return scan_op(_b, _a); + } +}; + + +/** + * \brief Reduce-by-segment functor. + * + * Given two cub::KeyValuePair inputs \p a and \p b and a + * binary associative combining operator \p f(const T &x, const T &y), + * an instance of this functor returns a cub::KeyValuePair whose \p key + * field is a.key + b.key, and whose \p value field + * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. + * + * ReduceBySegmentOp is an associative, non-commutative binary combining operator + * for input sequences of cub::KeyValuePair pairings. Such + * sequences are typically used to represent a segmented set of values to be reduced + * and a corresponding set of {0,1}-valued integer "head flags" demarcating the + * first value of each segment. + * + */ +template ///< Binary reduction operator to apply to values +struct ReduceBySegmentOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval; + retval.key = first.key + second.key; + retval.value = (second.key) ? 
+ second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate + op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate + return retval; + } +}; + + + +template ///< Binary reduction operator to apply to values +struct ReduceByKeyOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval = second; + + if (first.key == second.key) + retval.value = op(first.value, retval.value); + + return retval; + } +}; + + + + + + + +/** @} */ // end group UtilModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/thread/thread_reduce.cuh b/dnn/src/cuda/cub/thread/thread_reduce.cuh new file mode 100644 index 00000000..4c13688f --- /dev/null +++ b/dnn/src/cuda/cub/thread/thread_reduce.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
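To make the KeyValuePair-based functors of thread_operators.cuh above concrete, a small sketch (compiled as CUDA) of how ArgMax and ReduceBySegmentOp combine partial results. KeyValuePair comes from util_type.cuh; the function name and literal values are illustrative.

    #include <cub/thread/thread_operators.cuh>   // assumed include paths
    #include <cub/util_type.cuh>

    void CombinePartials()
    {
        // ArgMax keeps the value and offset (key) of the larger item,
        // preferring the smaller offset on ties.
        cub::KeyValuePair<int, float> a(3, 7.0f), b(5, 9.0f);
        cub::KeyValuePair<int, float> winner = cub::ArgMax()(a, b);   // {key 5, value 9.0f}

        // ReduceBySegmentOp: a non-zero key in the second operand marks a
        // segment reset, so the running value restarts instead of accumulating.
        cub::ReduceBySegmentOp<cub::Sum> seg_op;
        cub::KeyValuePair<int, int> x(0, 4), y(0, 6), z(1, 2);
        cub::KeyValuePair<int, int> xy  = seg_op(x, y);   // key 0, value 4 + 6 = 10
        cub::KeyValuePair<int, int> xyz = seg_op(xy, z);  // key 1, value resets to 2

        (void)winner; (void)xyz;
    }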
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential reduction over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + +/** + * Sequential reduction over statically-sized array types + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix, ///< [in] Prefix to seed reduction with + Int2Type /*length*/) +{ + T retval = prefix; + + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + retval = reduction_op(retval, input[i]); + + return retval; +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + T prefix = input[0]; + return ThreadReduce(input + 1, reduction_op, prefix); +} + + +/** + * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Serial reduction with the specified operator + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. 
+ * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + return ThreadReduce((T*) input, reduction_op); +} + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/thread/thread_scan.cuh b/dnn/src/cuda/cub/thread/thread_scan.cuh new file mode 100644 index 00000000..8d67549a --- /dev/null +++ b/dnn/src/cuda/cub/thread/thread_scan.cuh @@ -0,0 +1,268 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
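A short device-side sketch of the per-thread reduction defined in thread_reduce.cuh above; note that the helpers live in the cub::internal namespace. The kernel and the items-per-thread count are illustrative.

    #include <cub/thread/thread_reduce.cuh>      // assumed include paths
    #include <cub/thread/thread_operators.cuh>

    __global__ void PerThreadMax(const float *d_in, float *d_out)
    {
        // Each thread reduces a statically-sized register array with cub::Max.
        float items[4];
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            items[i] = d_in[threadIdx.x * 4 + i];

        d_out[threadIdx.x] = cub::internal::ThreadReduce(items, cub::Max());
    }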
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential prefix scan over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \name Sequential prefix scan over statically-sized array types + * @{ + */ + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T inclusive, + T exclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(exclusive, input[i]); + output[i] = exclusive; + exclusive = inclusive; + } + + return inclusive; +} + + + +/** + * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = prefix; + T exclusive = inclusive; + + return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) 
+{ + return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + + + + + + + + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T inclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(inclusive, input[i]); + output[i] = inclusive; + } + + return inclusive; +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + T inclusive = input[0]; + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. 
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + +//@} end member group + +/** @} */ // end group UtilModule + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/thread/thread_search.cuh b/dnn/src/cuda/cub/thread/thread_search.cuh new file mode 100644 index 00000000..3099080a --- /dev/null +++ b/dnn/src/cuda/cub/thread/thread_search.cuh @@ -0,0 +1,154 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
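Likewise, a minimal sketch of the sequential per-thread scans from thread_scan.cuh above, showing the inclusive form and the prefix-seeded exclusive form; the kernel and the ITEMS constant are illustrative.

    #include <cub/thread/thread_scan.cuh>        // assumed include paths
    #include <cub/thread/thread_operators.cuh>

    __global__ void PerThreadPrefixSums(const int *d_in, int *d_incl, int *d_excl)
    {
        const int ITEMS = 4;
        int items[ITEMS], incl[ITEMS], excl[ITEMS];

        #pragma unroll
        for (int i = 0; i < ITEMS; ++i)
            items[i] = d_in[threadIdx.x * ITEMS + i];

        // incl[i] = items[0] + ... + items[i]
        cub::internal::ThreadScanInclusive(items, incl, cub::Sum());

        // excl[0] = 0, excl[i] = items[0] + ... + items[i-1]
        cub::internal::ThreadScanExclusive(items, excl, cub::Sum(), 0);

        #pragma unroll
        for (int i = 0; i < ITEMS; ++i)
        {
            d_incl[threadIdx.x * ITEMS + i] = incl[i];
            d_excl[threadIdx.x * ITEMS + i] = excl[i];
        }
    }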
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential search + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Computes the begin offsets into A and B for the specific diagonal + */ +template < + typename AIteratorT, + typename BIteratorT, + typename OffsetT, + typename CoordinateT> +__host__ __device__ __forceinline__ void MergePathSearch( + OffsetT diagonal, + AIteratorT a, + BIteratorT b, + OffsetT a_len, + OffsetT b_len, + CoordinateT& path_coordinate) +{ + /// The value type of the input iterator + typedef typename std::iterator_traits::value_type T; + + OffsetT split_min = CUB_MAX(diagonal - b_len, 0); + OffsetT split_max = CUB_MIN(diagonal, a_len); + + while (split_min < split_max) + { + OffsetT split_pivot = (split_min + split_max) >> 1; + if (a[split_pivot] <= b[diagonal - split_pivot - 1]) + { + // Move candidate split range up A, down B + split_min = split_pivot + 1; + } + else + { + // Move candidate split range up B, down A + split_max = split_pivot; + } + } + + path_coordinate.x = CUB_MIN(split_min, a_len); + path_coordinate.y = diagonal - split_min; +} + + + +/** + * \brief Returns the offset of the first value within \p input which does not compare less than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT LowerBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (input[retval + half] < val) + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + else + { + num_items = half; + } + } + + return retval; +} + + +/** + * \brief Returns the offset of the first value within \p input which compares greater than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT UpperBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (val < input[retval + half]) + { + num_items = half; + } + else + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + } + + return retval; +} + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/thread/thread_store.cuh b/dnn/src/cuda/cub/thread/thread_store.cuh new file mode 100644 index 00000000..ec20b36f --- /dev/null +++ b/dnn/src/cuda/cub/thread/thread_store.cuh @@ -0,0 +1,422 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
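As a quick illustration of the search helpers in thread_search.cuh above, the sketch below uses LowerBound and UpperBound to locate the span of a key in a sorted, register-resident array; the kernel name and data are illustrative.

    #include <cub/thread/thread_search.cuh>   // assumed include path

    __global__ void CountFives(int *d_count)
    {
        // Sorted per-thread data; the key 5 appears twice.
        int sorted[8] = {0, 1, 3, 5, 5, 7, 8, 9};

        int lo = cub::LowerBound(sorted, 8, 5);   // first index not less than 5 -> 3
        int hi = cub::UpperBound(sorted, 8, 5);   // first index greater than 5  -> 5

        d_count[threadIdx.x] = hi - lo;           // number of occurrences of 5 -> 2
    }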
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for writing memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory store operations. + */ +enum CacheStoreModifier +{ + STORE_DEFAULT, ///< Default (no modifier) + STORE_WB, ///< Cache write-back all coherent levels + STORE_CG, ///< Cache at global level + STORE_CS, ///< Cache streaming (likely to be accessed once) + STORE_WT, ///< Cache write-through (to system memory) + STORE_VOLATILE, ///< Volatile shared (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. 
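+ *
+ * \par
+ * In its simplest form the cache modifier is supplied as the first template
+ * argument (a minimal sketch added for illustration, not part of the original
+ * CUB documentation; \p d_out and \p val follow the example below):
+ * \code
+ * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
+ * \endcode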
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit store using cache-global modifier: + * int *d_out; + * int val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 16-bit store using default modifier + * short *d_out; + * short val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 256-bit store using write-through modifier + * double4 *d_out; + * double4 val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 96-bit store using cache-streaming cache modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * \endcode + * + * \tparam MODIFIER [inferred] CacheStoreModifier enumeration + * \tparam InputIteratorT [inferred] Output iterator type \iterator + * \tparam T [inferred] Data type of output value + */ +template < + CacheStoreModifier MODIFIER, + typename OutputIteratorT, + typename T> +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated store iteration (inductive case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T *ptr, T *vals) + { + ThreadStore(ptr + COUNT, vals[COUNT]); + IterateThreadStore::template Store(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) + { + ptr[COUNT] = vals[COUNT]; + IterateThreadStore::Dereference(ptr, vals); + } + +}; + +/// Helper structure for templated store iteration (termination case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y), \ + "r"(val.z), \ + "r"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(ulonglong2* ptr, ulonglong2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val.x), \ + "l"(val.y)); \ + } + + +/** + * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val.x), \ + "h"(val.y), \ + "h"(val.z), \ + "h"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint2* ptr, uint2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned long long* ptr, unsigned long long val) \ + { \ + asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val)); \ + } + +/** + * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_4(cub_modifier, ptx_modifier) \ + template<> \ + 
__device__ __forceinline__ void ThreadStore(unsigned int* ptr, unsigned int val) \ + { \ + asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val)); \ + } + + +/** + * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned short* ptr, unsigned short val) \ + { \ + asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val)); \ + } + + +/** + * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ + { \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " cvt.u8.u16 datum, %1;" \ + " st."#ptx_modifier".u8 [%0], datum;" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"((unsigned short) val)); \ + } + +/** + * Define powers-of-two ThreadStore specializations for the given Cache load modifier + */ +#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ + _CUB_STORE_16(cub_modifier, ptx_modifier) \ + _CUB_STORE_8(cub_modifier, ptx_modifier) \ + _CUB_STORE_4(cub_modifier, ptx_modifier) \ + _CUB_STORE_2(cub_modifier, ptx_modifier) \ + _CUB_STORE_1(cub_modifier, ptx_modifier) \ + + +/** + * Define ThreadStore specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_STORE_ALL(STORE_WB, wb) + _CUB_STORE_ALL(STORE_CG, cg) + _CUB_STORE_ALL(STORE_CS, cs) + _CUB_STORE_ALL(STORE_WT, wt) +#else + _CUB_STORE_ALL(STORE_WB, global) + _CUB_STORE_ALL(STORE_CG, global) + _CUB_STORE_ALL(STORE_CS, global) + _CUB_STORE_ALL(STORE_WT, volatile.global) +#endif + + +// Macro cleanup +#undef _CUB_STORE_ALL +#undef _CUB_STORE_1 +#undef _CUB_STORE_2 +#undef _CUB_STORE_4 +#undef _CUB_STORE_8 +#undef _CUB_STORE_16 + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ void ThreadStore( + OutputIteratorT itr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *itr = val; +} + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *ptr = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + *reinterpret_cast(ptr) = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + // Create a temporary using shuffle-words, then store using volatile-words + typedef typename UnitWord::VolatileWord VolatileWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + VolatileWord words[VOLATILE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; + + IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on 
pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadStore definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Create a temporary using shuffle-words, then store using device-words + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; + + IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for generic modifiers + */ +template +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) +{ + ThreadStore( + itr, + val, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/util_allocator.cuh b/dnn/src/cuda/cub/util_allocator.cuh new file mode 100644 index 00000000..0e6dd048 --- /dev/null +++ b/dnn/src/cuda/cub/util_allocator.cuh @@ -0,0 +1,708 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. 
The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" +#include "util_debug.cuh" + +#include +#include + +#include "host/mutex.cuh" +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/****************************************************************************** + * CachingDeviceAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing cached + * device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for + * reuse within other streams when all prior work submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. 
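+ *
+ * \par
+ * A minimal usage sketch (an editor's illustration rather than part of the
+ * original CUB documentation), assuming the default configuration described
+ * in the next paragraph:
+ * \code
+ * cub::CachingDeviceAllocator allocator;          // default bins: 512B .. 2MB
+ * void *d_scratch = NULL;
+ * // A 1000-byte request is rounded up to the 4KB bin (8^4 bytes)
+ * CubDebugExit(allocator.DeviceAllocate(&d_scratch, 1000));
+ * // ... run kernels on the default stream that use d_scratch ...
+ * CubDebugExit(allocator.DeviceFree(d_scratch));  // block is cached for reuse
+ * \endcode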
+ * + * \par + * For example, the default-constructed CachingDeviceAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ +struct CachingDeviceAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer and device) + BlockDescriptor(void *d_ptr, int device) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Constructor (suitable for searching maps for a range of suitable blocks, given a device) + BlockDescriptor(int device) : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.bytes < b.bytes); + else + return (a.device < b.device); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each device + typedef std::map GpuCachedBytes; + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round 
up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + cub::Mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingDeviceAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. 
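+ *
+ * \par
+ * For instance (an editor's illustration; \p allocator as in the sketch
+ * further above):
+ * \code
+ * allocator.SetMaxCachedBytes(4 * 1024 * 1024);  // cap the per-device cache at 4MB
+ * \endcode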
+ * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + cudaError_t SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.Lock(); + + if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.Unlock(); + + return cudaSuccess; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + int device, ///< [in] Device on which to place the allocation + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + device = entrypoint_device; + } + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key(device); + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.Lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks on the same device in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->device == device) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + if ((active_stream == block_itr->associated_stream) || + (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) + { + // Reuse existing cache block. Insert into live blocks. 
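+                    // Either the block was last associated with this very stream (so stream
+                    // ordering already serializes the reuse behind the earlier work), or its
+                    // ready_event has completed, meaning everything submitted to its previous
+                    // stream before the block was freed has finished. In both cases the block
+                    // can safely be handed out again.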
+ found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].live += search_key.bytes; + + if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.Unlock(); + } + + // Allocate the block if necessary + if (!found) + { + // Set runtime's current device to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + // Attempt to allocate + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) + { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + device, (long long) search_key.bytes, (long long) search_key.associated_stream); + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.Lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key(device); + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) + { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. 
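+                    // (This eviction loop releases every cached block on the device, not just
+                    // blocks in the requested bin, before the cudaMalloc below is retried.)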
+ if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[device].free -= block_itr->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.Unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; + } + + // Create ready event + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.Lock(); + live_blocks.insert(search_key); + cached_bytes[device].live += search_key.bytes; + mutex.Unlock(); + + if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + return error; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the current device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); + } + + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
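+ *
+ * \par
+ * For example (an editor's illustration; the device ordinal and pointer name
+ * are hypothetical):
+ * \code
+ * CubDebugExit(allocator.DeviceFree(1, d_ptr_on_device_1));
+ * \endcode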
+ */ + cudaError_t DeviceFree( + int device, + void* d_ptr) + { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Lock + mutex.Lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr, device); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + + if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + } + + // Unlock + mutex.Unlock(); + + // First set to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + if (recached) + { + // Insert the ready event in the associated stream (must have current device set properly) + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + } + else + { + // Free the allocation from the runtime and cleanup the event. + if (CubDebug(error = cudaFree(d_ptr))) return error; + if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + + if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
+ */ + cudaError_t DeviceFree( + void* d_ptr) + { + return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); + } + + + /** + * \brief Frees all cached device allocations on all devices + */ + cudaError_t FreeAllCached() + { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.Lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) + { + if (CubDebug(error = cudaSetDevice(begin->device))) break; + current_device = begin->device; + } + + // Free device memory + if (CubDebug(error = cudaFree(begin->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[current_device].free -= begin->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); + + cached_blocks.erase(begin); + } + + mutex.Unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Destructor + */ + virtual ~CachingDeviceAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/util_arch.cuh b/dnn/src/cuda/cub/util_arch.cuh new file mode 100644 index 00000000..28d81e7c --- /dev/null +++ b/dnn/src/cuda/cub/util_arch.cuh @@ -0,0 +1,151 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Static architectural properties by SM version. + */ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) + #define CUB_USE_COOPERATIVE_GROUPS +#endif + +/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). +#ifndef CUB_PTX_ARCH + #ifndef __CUDA_ARCH__ + #define CUB_PTX_ARCH 0 + #else + #define CUB_PTX_ARCH __CUDA_ARCH__ + #endif +#endif + + +/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. +#ifndef CUB_RUNTIME_FUNCTION + #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) + #define CUB_RUNTIME_ENABLED + #define CUB_RUNTIME_FUNCTION __host__ __device__ + #else + #define CUB_RUNTIME_FUNCTION __host__ + #endif +#endif + + +/// Number of threads per warp +#ifndef CUB_LOG_WARP_THREADS + #define CUB_LOG_WARP_THREADS(arch) \ + (5) + #define CUB_WARP_THREADS(arch) \ + (1 << CUB_LOG_WARP_THREADS(arch)) + + #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) + #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) +#endif + + +/// Number of smem banks +#ifndef CUB_LOG_SMEM_BANKS + #define CUB_LOG_SMEM_BANKS(arch) \ + ((arch >= 200) ? \ + (5) : \ + (4)) + #define CUB_SMEM_BANKS(arch) \ + (1 << CUB_LOG_SMEM_BANKS(arch)) + + #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) + #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) +#endif + + +/// Oversubscription factor +#ifndef CUB_SUBSCRIPTION_FACTOR + #define CUB_SUBSCRIPTION_FACTOR(arch) \ + ((arch >= 300) ? \ + (5) : \ + ((arch >= 200) ? \ + (3) : \ + (10))) + #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) +#endif + + +/// Prefer padding overhead vs X-way conflicts greater than this threshold +#ifndef CUB_PREFER_CONFLICT_OVER_PADDING + #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ + ((arch >= 300) ? \ + (1) : \ + (4)) + #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) +#endif + + +/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data. Minimum of two warps. +#ifndef CUB_SCALED_BLOCK_THREADS + #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + (CUB_MIN( \ + NOMINAL_4B_BLOCK_THREADS, \ + CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ + 2, \ + (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) +#endif + +/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data. 
Minimum 1 item per thread +#ifndef CUB_SCALED_ITEMS_PER_THREAD + #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + CUB_MAX( \ + 1, \ + (sizeof(T) < 4) ? \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 : \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)) +#endif + +/// Define both nominal threads-per-block and items-per-thread +#ifndef CUB_SCALED_GRANULARITIES + #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ + CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ + CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) +#endif + + + +#endif // Do not document + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/util_debug.cuh b/dnn/src/cuda/cub/util_debug.cuh new file mode 100644 index 00000000..3ad832e7 --- /dev/null +++ b/dnn/src/cuda/cub/util_debug.cuh @@ -0,0 +1,145 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Error and event logging routines. + * + * The following macros definitions are supported: + * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
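+ *
+ * A typical host-side error-checking sketch (an editor's illustration, not part
+ * of the original CUB documentation; \p d_ptr and \p bytes are assumed to be
+ * declared by the caller):
+ * \code
+ * // Exits the process on error; the message is printed when CUB_STDERR is defined
+ * CubDebugExit(cudaMalloc(&d_ptr, bytes));
+ * // Returns the error so the caller can recover
+ * cudaError_t error = CubDebug(cudaPeekAtLastError());
+ * \endcode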
+ */ + +#pragma once + +#include +#include "util_namespace.cuh" +#include "util_arch.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/// CUB error reporting macro (prints error messages to stderr) +#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) + #define CUB_STDERR +#endif + + + +/** + * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. + * + * \return The CUDA error. + */ +__host__ __device__ __forceinline__ cudaError_t Debug( + cudaError_t error, + const char* filename, + int line) +{ + (void)filename; + (void)line; +#ifdef CUB_STDERR + if (error) + { + #if (CUB_PTX_ARCH == 0) + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fflush(stderr); + #elif (CUB_PTX_ARCH >= 200) + printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); + #endif + } +#endif + return error; +} + + +/** + * \brief Debug macro + */ +#ifndef CubDebug + #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) +#endif + + +/** + * \brief Debug macro with exit + */ +#ifndef CubDebugExit + #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } +#endif + + +/** + * \brief Log macro for printf statements. + */ +#if !defined(_CubLog) + #if !(defined(__clang__) && defined(__CUDA__)) + #if (CUB_PTX_ARCH == 0) + #define _CubLog(format, ...) printf(format,__VA_ARGS__); + #elif (CUB_PTX_ARCH >= 200) + #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); + #endif + #else + // XXX shameless hack for clang around variadic printf... + // Compilies w/o supplying -std=c++11 but shows warning, + // so we sielence them :) + #pragma clang diagnostic ignored "-Wc++11-extensions" + #pragma clang diagnostic ignored "-Wunnamed-type-template-args" + template + inline __host__ __device__ void va_printf(char const* format, Args const&... args) + { + #ifdef __CUDA_ARCH__ + printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); + #else + printf(format, args...); + #endif + } + #ifndef __CUDA_ARCH__ + #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); + #else + #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); + #endif + #endif +#endif + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/util_device.cuh b/dnn/src/cuda/cub/util_device.cuh new file mode 100644 index 00000000..a5f3b614 --- /dev/null +++ b/dnn/src/cuda/cub/util_device.cuh @@ -0,0 +1,347 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Properties of a given CUDA device and the corresponding PTX bundle + */ + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_namespace.cuh" +#include "util_macro.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). + */ +template +__host__ __device__ __forceinline__ +cudaError_t AliasTemporaries( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation + void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed + size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed +{ + const int ALIGN_BYTES = 256; + const int ALIGN_MASK = ~(ALIGN_BYTES - 1); + + // Compute exclusive prefix sum over allocation requests + size_t allocation_offsets[ALLOCATIONS]; + size_t bytes_needed = 0; + for (int i = 0; i < ALLOCATIONS; ++i) + { + size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; + allocation_offsets[i] = bytes_needed; + bytes_needed += allocation_bytes; + } + bytes_needed += ALIGN_BYTES - 1; + + // Check if the caller is simply requesting the size of the storage allocation + if (!d_temp_storage) + { + temp_storage_bytes = bytes_needed; + return cudaSuccess; + } + + // Check if enough storage provided + if (temp_storage_bytes < bytes_needed) + { + return CubDebug(cudaErrorInvalidValue); + } + + // Alias + d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); + for (int i = 0; i < ALLOCATIONS; ++i) + { + allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; + } + + return cudaSuccess; +} + + +/** + * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device + */ +template +__global__ void EmptyKernel(void) { } + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) +{ + struct Dummy + { + /// Type definition of the EmptyKernel kernel entry point + typedef void (*EmptyKernelPtr)(); + + /// Force EmptyKernel to be generated if this class is used + CUB_RUNTIME_FUNCTION __forceinline__ + EmptyKernelPtr Empty() + { + return EmptyKernel; + } + }; + + +#ifndef CUB_RUNTIME_ENABLED + (void)ptx_version; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#elif (CUB_PTX_ARCH > 0) + + ptx_version = CUB_PTX_ARCH; + return cudaSuccess; + +#else + + cudaError_t error = cudaSuccess; + do + { + cudaFuncAttributes empty_kernel_attrs; + if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; + ptx_version = empty_kernel_attrs.ptxVersion * 10; + } + while (0); + + return error; + +#endif +} + + +/** + * \brief Retrieves the SM version (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)sm_version; + (void)device_ordinal; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#else + + cudaError_t error = cudaSuccess; + do + { + // Fill in SM version + int major, minor; + if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; + if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; + sm_version = major * 100 + minor * 10; + } + while (0); + + return error; + +#endif +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Synchronize the stream if specified + */ +CUB_RUNTIME_FUNCTION __forceinline__ +static cudaError_t SyncStream(cudaStream_t stream) +{ +#if (CUB_PTX_ARCH == 0) + return cudaStreamSynchronize(stream); +#else + (void)stream; + // Device can't yet sync on a specific 
stream + return cudaDeviceSynchronize(); +#endif +} + + +/** + * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. + * + * \par Snippet + * The code snippet below illustrates the use of the MaxSmOccupancy function. + * \par + * \code + * #include // or equivalently + * + * template + * __global__ void ExampleKernel() + * { + * // Allocate shared memory for BlockScan + * __shared__ volatile T buffer[4096]; + * + * ... + * } + * + * ... + * + * // Determine SM occupancy for ExampleKernel specialized for unsigned char + * int max_sm_occupancy; + * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); + * + * // max_sm_occupancy <-- 4 on SM10 + * // max_sm_occupancy <-- 8 on SM20 + * // max_sm_occupancy <-- 12 on SM35 + * + * \endcode + * + */ +template +CUB_RUNTIME_FUNCTION __forceinline__ +cudaError_t MaxSmOccupancy( + int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM + KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy + int block_threads, ///< [in] Number of threads per thread block + int dynamic_smem_bytes = 0) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)dynamic_smem_bytes; + (void)block_threads; + (void)kernel_ptr; + (void)max_sm_occupancy; + + // CUDA API calls not supported from this device + return CubDebug(cudaErrorInvalidConfiguration); + +#else + + return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( + &max_sm_occupancy, + kernel_ptr, + block_threads, + dynamic_smem_bytes); + +#endif // CUB_RUNTIME_ENABLED +} + + +/****************************************************************************** + * Policy management + ******************************************************************************/ + +/** + * Kernel dispatch configuration + */ +struct KernelConfig +{ + int block_threads; + int items_per_thread; + int tile_size; + int sm_occupancy; + + CUB_RUNTIME_FUNCTION __forceinline__ + KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init(KernelPtrT kernel_ptr) + { + block_threads = AgentPolicyT::BLOCK_THREADS; + items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); + return retval; + } +}; + + + +/// Helper for dispatching into a policy chain +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int ptx_version, FunctorT &op) + { + if (ptx_version < PTX_VERSION) { + return PrevPolicyT::Invoke(ptx_version, op); + } + return op.template Invoke(); + } +}; + +/// Helper for dispatching into a policy chain (end-of-chain specialization) +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef PolicyT ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) { + return op.template Invoke(); + } +}; + + + + +#endif // Do not 
document + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/util_macro.cuh b/dnn/src/cuda/cub/util_macro.cuh new file mode 100644 index 00000000..ff863654 --- /dev/null +++ b/dnn/src/cuda/cub/util_macro.cuh @@ -0,0 +1,103 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Common C/C++ macro utilities + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +#ifndef CUB_ALIGN + #if defined(_WIN32) || defined(_WIN64) + /// Align struct + #define CUB_ALIGN(bytes) __declspec(align(32)) + #else + /// Align struct + #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) + #endif +#endif + +#ifndef CUB_MAX + /// Select maximum(a, b) + #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) +#endif + +#ifndef CUB_MIN + /// Select minimum(a, b) + #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) +#endif + +#ifndef CUB_QUOTIENT_FLOOR + /// Quotient of x/y rounded down to nearest integer + #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) +#endif + +#ifndef CUB_QUOTIENT_CEILING + /// Quotient of x/y rounded up to nearest integer + #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) +#endif + +#ifndef CUB_ROUND_UP_NEAREST + /// x rounded up to the nearest multiple of y + #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) +#endif + +#ifndef CUB_ROUND_DOWN_NEAREST + /// x rounded down to the nearest multiple of y + #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) +#endif + + +#ifndef CUB_STATIC_ASSERT + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + #define CUB_CAT_(a, b) a ## b + #define CUB_CAT(a, b) CUB_CAT_(a, b) + #endif // DOXYGEN_SHOULD_SKIP_THIS + + /// Static assert + #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] +#endif + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/util_namespace.cuh b/dnn/src/cuda/cub/util_namespace.cuh new file mode 100644 index 00000000..c8991d08 --- /dev/null +++ b/dnn/src/cuda/cub/util_namespace.cuh @@ -0,0 +1,46 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Place-holder for prefixing the cub namespace + */ + +#pragma once + +// For example: +//#define CUB_NS_PREFIX namespace thrust{ namespace detail { +//#define CUB_NS_POSTFIX } } + +#ifndef CUB_NS_PREFIX +#define CUB_NS_PREFIX +#endif + +#ifndef CUB_NS_POSTFIX +#define CUB_NS_POSTFIX +#endif diff --git a/dnn/src/cuda/cub/util_ptx.cuh b/dnn/src/cuda/cub/util_ptx.cuh new file mode 100644 index 00000000..582ca0d8 --- /dev/null +++ b/dnn/src/cuda/cub/util_ptx.cuh @@ -0,0 +1,758 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * PTX intrinsics + */ + + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" +#include "util_debug.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilPtx + * @{ + */ + + +/****************************************************************************** + * PTX helper macros + ******************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Register modifier for pointer-types (for inlining PTX assembly) + */ +#if defined(_WIN64) || defined(__LP64__) + #define __CUB_LP64__ 1 + // 64-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "l" + #define _CUB_ASM_PTR_SIZE_ "u64" +#else + #define __CUB_LP64__ 0 + // 32-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "r" + #define _CUB_ASM_PTR_SIZE_ "u32" +#endif + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Inlined PTX intrinsics + ******************************************************************************/ + +/** + * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHR_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x >> shift) + addend; +#endif + return ret; +} + + +/** + * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHL_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x << shift) + addend; +#endif + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Bitfield-extract. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type /*byte_len*/) +{ + unsigned int bits; +#if CUB_PTX_ARCH >= 200 + asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); +#else + const unsigned int MASK = (1 << num_bits) - 1; + bits = (source >> bit_start) & MASK; +#endif + return bits; +} + + +/** + * Bitfield-extract for 64-bit types. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type<8> /*byte_len*/) +{ + const unsigned long long MASK = (1ull << num_bits) - 1; + return (source >> bit_start) & MASK; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits) +{ + return BFE(source, bit_start, num_bits, Int2Type()); +} + + +/** + * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. 
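+ *
+ * \par Snippet
+ * A minimal sketch of BFE and BFI used together (the values shown are
+ * illustrative assumptions):
+ * \par
+ * \code
+ * unsigned int word = 0x12345678;
+ *
+ * // Extract the 8 bits starting at bit-offset 8 (the 0x56 byte)
+ * unsigned int field = BFE(word, 8, 8);      // field  <-- 0x56
+ *
+ * // Overwrite that same bit range with 0xAB
+ * unsigned int result;
+ * BFI(result, word, 0xAB, 8, 8);             // result <-- 0x1234AB78
+ * \endcode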
+ */ +__device__ __forceinline__ void BFI( + unsigned int &ret, + unsigned int x, + unsigned int y, + unsigned int bit_start, + unsigned int num_bits) +{ +#if CUB_PTX_ARCH >= 200 + asm ("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); +#else + x <<= bit_start; + unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; + unsigned int MASK_Y = ~MASK_X; + ret = (y & MASK_Y) | (x & MASK_X); +#endif +} + + +/** + * \brief Three-operand add. Returns \p x + \p y + \p z. + */ +__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) +{ +#if CUB_PTX_ARCH >= 200 + asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); +#else + x = x + y + z; +#endif + return x; +} + + +/** + * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. + * + * \par + * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: + * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes + * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within + * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} + * + * \par Snippet + * The code snippet below illustrates byte-permute. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * int a = 0x03020100; + * int b = 0x07060504; + * int index = 0x00007531; + * + * int selected = PRMT(a, b, index); // 0x07050301 + * + * \endcode + * + */ +__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) +{ + int ret; + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Sync-threads barrier. 
+ */ +__device__ __forceinline__ void BAR(int count) +{ + asm volatile("bar.sync 1, %0;" : : "r"(count)); +} + +/** + * CTA barrier + */ +__device__ __forceinline__ void CTA_SYNC() +{ + __syncthreads(); +} + + +/** + * CTA barrier with predicate + */ +__device__ __forceinline__ int CTA_SYNC_AND(int p) +{ + return __syncthreads_and(p); +} + + +/** + * Warp barrier + */ +__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + __syncwarp(member_mask); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __any_sync(member_mask, predicate); +#else + return ::__any(predicate); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __all_sync(member_mask, predicate); +#else + return ::__all(predicate); +#endif +} + + +/** + * Warp ballot + */ +__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __ballot_sync(member_mask, predicate); +#else + return __ballot(predicate); +#endif +} + +/** + * Warp synchronous shfl_up + */ +__device__ __forceinline__ +unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.up.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); +#endif + return word; +} + +/** + * Warp synchronous shfl_down + */ +__device__ __forceinline__ +unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.down.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); +#endif + return word; +} + +/** + * Warp synchronous shfl_idx + */ +__device__ __forceinline__ +unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.idx.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags)); +#endif + return word; +} + +/** + * Floating point multiply. (Mantissa LSB rounds towards zero.) + */ +__device__ __forceinline__ float FMUL_RZ(float a, float b) +{ + float d; + asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); + return d; +} + + +/** + * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
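+ *
+ * A minimal sketch of the round-toward-zero helpers (variable names are
+ * illustrative assumptions):
+ * \code
+ * float prod  = FMUL_RZ(a, b);       // a * b   with the mantissa LSB rounded toward zero
+ * float fused = FFMA_RZ(a, b, c);    // a * b + c as a single fused multiply-add, same rounding
+ * \endcode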
+ */ +__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) +{ + float d; + asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); + return d; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Terminates the calling thread + */ +__device__ __forceinline__ void ThreadExit() { + asm volatile("exit;"); +} + + +/** + * \brief Abort execution and generate an interrupt to the host CPU + */ +__device__ __forceinline__ void ThreadTrap() { + asm volatile("trap;"); +} + + +/** + * \brief Returns the row-major linear thread identifier for a multidimensional thread block + */ +__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) +{ + return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + + threadIdx.x; +} + + +/** + * \brief Returns the warp lane ID of the calling thread + */ +__device__ __forceinline__ unsigned int LaneId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); + return ret; +} + + +/** + * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. + */ +__device__ __forceinline__ unsigned int WarpId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); + return ret; +} + +/** @} */ // end group UtilPtx + + + + +/** + * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) + * \ingroup WarpModule + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * predecessor of its predecessor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from two ranks below + * double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. 
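+ *
+ * \par
+ * A self-contained sketch of the same pattern (the kernel name, the
+ * <cub/util_ptx.cuh> include path and the full-warp 0xffffffff mask are
+ * assumptions for illustration):
+ * \code
+ * #include <cub/util_ptx.cuh>
+ *
+ * __global__ void ShuffleUpExample(double *d_data)
+ * {
+ *     // One input item per thread
+ *     double thread_data = d_data[threadIdx.x];
+ *
+ *     // Each lane reads the value held two lanes below it;
+ *     // lanes 0 and 1 keep their own value
+ *     double peer_data = cub::ShuffleUp<32>(thread_data, 2, 0, 0xffffffff);
+ *
+ *     d_data[threadIdx.x] = peer_data;
+ * }
+ * \endcode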
+ * + */ +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleUp( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative down-offset of the peer to read from + int first_thread, ///< [in] Index of first lane in logical warp (typically 0) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) + * \ingroup WarpModule + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * successor of its successor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from two ranks below + * double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. 
+ * + */ +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleDown( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative up-offset of the peer to read from + int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input + * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, + * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. + * + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from thread 0 + * double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. 
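+ *
+ * \par
+ * A self-contained sketch of the broadcast pattern (kernel name, include path
+ * and full-warp mask are illustrative assumptions):
+ * \code
+ * #include <cub/util_ptx.cuh>
+ *
+ * __global__ void BroadcastExample(double *d_data)
+ * {
+ *     double thread_data = d_data[threadIdx.x];
+ *
+ *     // Every lane in the warp receives lane 0's value
+ *     double lane0_data = cub::ShuffleIndex<32>(thread_data, 0, 0xffffffff);
+ *
+ *     d_data[threadIdx.x] = lane0_data;
+ * }
+ * \endcode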
+ * + */ +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleIndex( + T input, ///< [in] The value to broadcast + int src_lane, ///< [in] Which warp lane is to do the broadcasting + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], + src_lane, + SHFL_C, + member_mask); + + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], + src_lane, + SHFL_C, + member_mask); + + output_alias[WORD] = shuffle_word; + } + + return output; +} + + + +/** + * Compute a 32b mask of threads having the same least-significant + * LABEL_BITS of \p label as the calling thread. + */ +template +inline __device__ unsigned int MatchAny(unsigned int label) +{ + unsigned int retval; + + // Extract masks of common threads for each bit + #pragma unroll + for (int BIT = 0; BIT < LABEL_BITS; ++BIT) + { + unsigned int mask; + unsigned int current_bit = 1 << BIT; + asm ("{\n" + " .reg .pred p;\n" + " and.b32 %0, %1, %2;" + " setp.eq.u32 p, %0, %2;\n" +#ifdef CUB_USE_COOPERATIVE_GROUPS + " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" +#else + " vote.ballot.b32 %0, p;\n" +#endif + " @!p not.b32 %0, %0;\n" + "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); + + // Remove peers who differ + retval = (BIT == 0) ? mask : retval & mask; + } + + return retval; + +// // VOLTA match +// unsigned int retval; +// asm ("{\n" +// " match.any.sync.b32 %0, %1, 0xffffffff;\n" +// "}\n" : "=r"(retval) : "r"(label)); +// return retval; + +} + + + + + + + + + + + + + + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/util_type.cuh b/dnn/src/cuda/cub/util_type.cuh new file mode 100644 index 00000000..0ba41e1e --- /dev/null +++ b/dnn/src/cuda/cub/util_type.cuh @@ -0,0 +1,1167 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Common type manipulation (metaprogramming) utilities + */ + +#pragma once + +#include +#include +#include + +#if (__CUDACC_VER_MAJOR__ >= 9) + #include +#endif + +#include "util_macro.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" + + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + + +/****************************************************************************** + * Type equality + ******************************************************************************/ + +/** + * \brief Type selection (IF ? ThenType : ElseType) + */ +template +struct If +{ + /// Conditional type result + typedef ThenType Type; // true +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct If +{ + typedef ElseType Type; // false +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Conditional types + ******************************************************************************/ + +/** + * \brief Type equality test + */ +template +struct Equals +{ + enum { + VALUE = 0, + NEGATE = 1 + }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Equals +{ + enum { + VALUE = 1, + NEGATE = 0 + }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Static math + ******************************************************************************/ + +/** + * \brief Statically determine log2(N), rounded up. + * + * For example: + * Log2<8>::VALUE // 3 + * Log2<3>::VALUE // 2 + */ +template +struct Log2 +{ + /// Static logarithm value + enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Log2 +{ + enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case + COUNT : + COUNT - 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Statically determine if N is a power-of-two + */ +template +struct PowerOfTwo +{ + enum { VALUE = ((N & (N - 1)) == 0) }; +}; + + + +/****************************************************************************** + * Pointer vs. iterator detection + ******************************************************************************/ + +/** + * \brief Pointer vs. 
iterator + */ +template +struct IsPointer +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsPointer +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Qualifier detection + ******************************************************************************/ + +/** + * \brief Volatile modifier test + */ +template +struct IsVolatile +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsVolatile +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Qualifier removal + ******************************************************************************/ + +/** + * \brief Removes \p const and \p volatile qualifiers from type \p Tp. + * + * For example: + * typename RemoveQualifiers::Type // int; + */ +template +struct RemoveQualifiers +{ + /// Type without \p const and \p volatile qualifiers + typedef Up Type; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + + +/****************************************************************************** + * Marker types + ******************************************************************************/ + +/** + * \brief A simple "NULL" marker type + */ +struct NullType +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template + __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; } + + __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; } + + __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; } + +#endif // DOXYGEN_SHOULD_SKIP_THIS +}; + + +/** + * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) + */ +template +struct Int2Type +{ + enum {VALUE = A}; +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/****************************************************************************** + * Size and alignment + ******************************************************************************/ + +/// Structure alignment +template +struct AlignBytes +{ + struct Pad + { + T val; + char byte; + }; + + enum + { + /// The "true CUDA" alignment of T in bytes + ALIGN_BYTES = sizeof(Pad) - sizeof(T) + }; + + /// The "truly aligned" type + typedef T Type; +}; + +// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree +// with device C++ compilers (EDG) on types passed as template parameters through +// kernel functions + +#define __CUB_ALIGN_BYTES(t, b) \ + template <> struct AlignBytes \ + { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; }; + +__CUB_ALIGN_BYTES(short4, 8) +__CUB_ALIGN_BYTES(ushort4, 8) +__CUB_ALIGN_BYTES(int2, 8) +__CUB_ALIGN_BYTES(uint2, 8) +__CUB_ALIGN_BYTES(long long, 8) +__CUB_ALIGN_BYTES(unsigned long long, 8) +__CUB_ALIGN_BYTES(float2, 8) +__CUB_ALIGN_BYTES(double, 8) +#ifdef _WIN32 + __CUB_ALIGN_BYTES(long2, 8) + __CUB_ALIGN_BYTES(ulong2, 8) +#else + __CUB_ALIGN_BYTES(long2, 16) + __CUB_ALIGN_BYTES(ulong2, 16) +#endif +__CUB_ALIGN_BYTES(int4, 16) +__CUB_ALIGN_BYTES(uint4, 16) 
+__CUB_ALIGN_BYTES(float4, 16) +__CUB_ALIGN_BYTES(long4, 16) +__CUB_ALIGN_BYTES(ulong4, 16) +__CUB_ALIGN_BYTES(longlong2, 16) +__CUB_ALIGN_BYTES(ulonglong2, 16) +__CUB_ALIGN_BYTES(double2, 16) +__CUB_ALIGN_BYTES(longlong4, 16) +__CUB_ALIGN_BYTES(ulonglong4, 16) +__CUB_ALIGN_BYTES(double4, 16) + +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; + + +/// Unit-words of data movement +template +struct UnitWord +{ + enum { + ALIGN_BYTES = AlignBytes::ALIGN_BYTES + }; + + template + struct IsMultiple + { + enum { + UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, + IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0) + }; + }; + + /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned int, + typename If::IS_MULTIPLE, + unsigned short, + unsigned char>::Type>::Type ShuffleWord; + + /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned long long, + ShuffleWord>::Type VolatileWord; + + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + ulonglong2, + VolatileWord>::Type DeviceWord; + + /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + uint4, + typename If::IS_MULTIPLE, + uint2, + ShuffleWord>::Type>::Type TextureWord; +}; + + +// float2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint2 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef unsigned long long DeviceWord; +#endif + typedef float2 TextureWord; +}; + +// float4 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint4 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef ulonglong2 DeviceWord; +#endif + typedef float4 TextureWord; +}; + + +// char2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef unsigned short ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef unsigned short VolatileWord; + typedef short DeviceWord; +#else + typedef unsigned short VolatileWord; + typedef unsigned short DeviceWord; +#endif + typedef unsigned short TextureWord; +}; + + +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Vector type inference utilities. + ******************************************************************************/ + +/** + * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. 
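+ *
+ * \par
+ * A minimal sketch of the mapping (the user-defined MyItem type is an
+ * assumption for illustration):
+ * \code
+ * // A matching built-in CUDA vector type is reused when one exists ...
+ * typedef CubVector<int, 4>::Type   IntQuad;     // int4
+ * typedef CubVector<float, 2>::Type FloatPair;   // float2
+ *
+ * // ... otherwise the generic CubVector supplies the x/y/z/w fields itself
+ * struct MyItem { int a; };
+ * CubVector<MyItem, 2> pair;
+ * pair.x.a = 1;
+ * pair.y.a = 2;
+ * \endcode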
+ */ +template struct CubVector; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +enum +{ + /// The maximum number of elements in CUDA vector types + MAX_VEC_ELEMENTS = 4, +}; + + +/** + * Generic vector-1 type + */ +template +struct CubVector +{ + T x; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-2 type + */ +template +struct CubVector +{ + T x; + T y; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-3 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-4 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + T w; + + typedef T BaseType; + typedef CubVector Type; +}; + + +/** + * Macro for expanding partially-specialized built-in vector types + */ +#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ + \ + template<> struct CubVector : short_type##1 \ + { \ + typedef base_type BaseType; \ + typedef short_type##1 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##2 \ + { \ + typedef base_type BaseType; \ + typedef short_type##2 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##3 \ + { \ + typedef base_type BaseType; \ + typedef short_type##3 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##4 \ + { \ + typedef base_type BaseType; \ + typedef short_type##4 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + retval.w = w + other.w; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + retval.w = w - other.w; \ + return retval; \ + } \ + }; + + + +// Expand CUDA vector types for built-in primitives +CUB_DEFINE_VECTOR_TYPE(char, char) +CUB_DEFINE_VECTOR_TYPE(signed char, char) +CUB_DEFINE_VECTOR_TYPE(short, short) +CUB_DEFINE_VECTOR_TYPE(int, int) +CUB_DEFINE_VECTOR_TYPE(long, long) +CUB_DEFINE_VECTOR_TYPE(long long, longlong) +CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) +CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) +CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) +CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) 
+CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) +CUB_DEFINE_VECTOR_TYPE(float, float) +CUB_DEFINE_VECTOR_TYPE(double, double) +CUB_DEFINE_VECTOR_TYPE(bool, uchar) + +// Undefine macros +#undef CUB_DEFINE_VECTOR_TYPE + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Wrapper types + ******************************************************************************/ + +/** + * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions + */ +template +struct Uninitialized +{ + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + WORDS = sizeof(T) / sizeof(DeviceWord) + }; + + /// Backing storage + DeviceWord storage[WORDS]; + + /// Alias + __host__ __device__ __forceinline__ T& Alias() + { + return reinterpret_cast(*this); + } +}; + + +/** + * \brief A key identifier paired with a corresponding value + */ +template < + typename _Key, + typename _Value +#if defined(_WIN32) && !defined(_WIN64) + , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) + , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) +#endif // #if defined(_WIN32) && !defined(_WIN64) + > +struct KeyValuePair +{ + typedef _Key Key; ///< Key data type + typedef _Value Value; ///< Value data type + + Key key; ///< Item key + Value value; ///< Item value + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#if defined(_WIN32) && !defined(_WIN64) + +/** + * Win32 won't do 16B alignment. 
This can present two problems for + * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: + * 1) If a smaller-aligned item were to be listed first, the host compiler places the + * should-be-16B item at too early an offset (and disagrees with device compiler) + * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size + * of the struct wrong (and disagrees with device compiler) + * + * So we put the larger-should-be-aligned item first, and explicitly pad the + * end of the struct + */ + +/// Smaller key specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Value value; // Value has larger would-be alignment and goes first + Key key; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + + +/// Smaller value specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Key key; // Key has larger would-be alignment and goes first + Value value; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#endif // #if defined(_WIN32) && !defined(_WIN64) + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * \brief A wrapper for passing simple static arrays as kernel parameters + */ +template +struct ArrayWrapper +{ + + /// Statically-sized array of type \p T + T array[COUNT]; + + /// Constructor + __host__ __device__ __forceinline__ ArrayWrapper() {} +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. + * + * Many multi-pass computations require a pair of "ping-pong" storage + * buffers (e.g., one for reading from and the other for writing to, and then + * vice-versa for the subsequent pass). This structure wraps a set of device + * buffers and a "selector" member to track which is "current". 
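+ *
+ * \par
+ * A minimal usage sketch (the buffer names are assumptions for illustration):
+ * \code
+ * int *d_key_buf;        // device allocation holding the input keys
+ * int *d_key_alt_buf;    // device allocation of the same size, used as scratch
+ *
+ * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+ *
+ * // A multi-pass primitive may flip d_keys.selector on each pass; afterwards
+ * // Current() names whichever buffer holds the final result and Alternate()
+ * // names the scratch buffer.
+ * int *d_result = d_keys.Current();
+ * \endcode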
+ */ +template +struct DoubleBuffer +{ + /// Pair of device buffer pointers + T *d_buffers[2]; + + /// Selector into \p d_buffers (i.e., the active/valid buffer) + int selector; + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer() + { + selector = 0; + d_buffers[0] = NULL; + d_buffers[1] = NULL; + } + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer( + T *d_current, ///< The currently valid buffer + T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current + { + selector = 0; + d_buffers[0] = d_current; + d_buffers[1] = d_alternate; + } + + /// \brief Return pointer to the currently valid buffer + __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } + + /// \brief Return pointer to the currently invalid buffer + __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; } + +}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + + +/** + * \brief Defines a structure \p detector_name that is templated on type \p T. The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name + */ +#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ + template \ + struct detector_name \ + { \ + template \ + static char& test(typename C::nested_type_name*); \ + template \ + static int& test(...); \ + enum \ + { \ + VALUE = sizeof(test(0)) < sizeof(int) \ + }; \ + }; + + + +/****************************************************************************** + * Simple enable-if (similar to Boost) + ******************************************************************************/ + +/** + * \brief Simple enable-if (similar to Boost) + */ +template +struct EnableIf +{ + /// Enable-if type for SFINAE dummy variables + typedef T Type; +}; + + +template +struct EnableIf {}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + +/** + * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or bool operator()(const T& a, const T&b, unsigned int idx) + */ +template +struct BinaryOpHasIdxParam +{ +private: +/* + template struct SFINAE1 {}; + template struct SFINAE2 {}; + template struct SFINAE3 {}; + template struct SFINAE4 {}; +*/ + template struct SFINAE5 {}; + template struct SFINAE6 {}; + template struct SFINAE7 {}; + template struct SFINAE8 {}; +/* + template static char Test(SFINAE1 *); + template static char Test(SFINAE2 *); + template static char Test(SFINAE3 *); + template static char Test(SFINAE4 *); +*/ + template __host__ __device__ static char Test(SFINAE5 *); + template __host__ __device__ static char Test(SFINAE6 *); + template __host__ __device__ static char Test(SFINAE7 *); + template __host__ __device__ static char Test(SFINAE8 *); + + template static int Test(...); + +public: + + /// Whether the functor BinaryOp has a third unsigned int index param + static const bool HAS_PARAM = sizeof(Test(NULL)) == sizeof(char); +}; + + + + +/****************************************************************************** + * Simple type traits utilities. 
+ * + * For example: + * Traits::CATEGORY // SIGNED_INTEGER + * Traits::NULL_TYPE // true + * Traits::CATEGORY // NOT_A_NUMBER + * Traits::PRIMITIVE; // false + * + ******************************************************************************/ + +/** + * \brief Basic type traits categories + */ +enum Category +{ + NOT_A_NUMBER, + SIGNED_INTEGER, + UNSIGNED_INTEGER, + FLOATING_POINT +}; + + +/** + * \brief Basic type traits + */ +template +struct BaseTraits +{ + /// Category + static const Category CATEGORY = _CATEGORY; + enum + { + PRIMITIVE = _PRIMITIVE, + NULL_TYPE = _NULL_TYPE, + }; +}; + + +/** + * Basic type traits (unsigned primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = UNSIGNED_INTEGER; + static const UnsignedBits LOWEST_KEY = UnsignedBits(0); + static const UnsignedBits MAX_KEY = UnsignedBits(-1); + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key; + } + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key; + } + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + + +/** + * Basic type traits (signed primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = SIGNED_INTEGER; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static const UnsignedBits LOWEST_KEY = HIGH_BIT; + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + +template +struct FpLimits; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ float Max() { + return FLT_MAX; + } + + static __host__ __device__ __forceinline__ float Lowest() { + return FLT_MAX * float(-1); + } +}; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ double Max() { + return DBL_MAX; + } + + static __host__ __device__ __forceinline__ double Lowest() { + return DBL_MAX * double(-1); + } +}; + + +#if (__CUDACC_VER_MAJOR__ >= 9) +template <> +struct FpLimits<__half> +{ + static __host__ __device__ __forceinline__ __half Max() { + unsigned short max_word = 0x7BFF; + return reinterpret_cast<__half&>(max_word); + } + + static __host__ __device__ __forceinline__ __half Lowest() { + unsigned short lowest_word = 0xFBFF; + return reinterpret_cast<__half&>(lowest_word); + } +}; +#endif + + +/** + * Basic type traits (fp primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = FLOATING_POINT; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + 
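+
+    // Radix-sort key twiddling for IEEE floating-point types (see TwiddleIn /
+    // TwiddleOut below): negative values have every bit flipped, non-negative
+    // values have only the sign bit flipped, so the resulting unsigned bit
+    // patterns compare in the same ascending order as the original values.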
static const UnsignedBits LOWEST_KEY = UnsignedBits(-1); + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; + return key ^ mask; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1); + return key ^ mask; + }; + + static __host__ __device__ __forceinline__ T Max() { + return FpLimits::Max(); + } + + static __host__ __device__ __forceinline__ T Lowest() { + return FpLimits::Lowest(); + } +}; + + +/** + * \brief Numeric type traits + */ +template struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +#if (__CUDACC_VER_MAJOR__ >= 9) + template <> struct NumericTraits<__half> : BaseTraits {}; +#endif + +template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; + + + +/** + * \brief Type traits + */ +template +struct Traits : NumericTraits::Type> {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/version b/dnn/src/cuda/cub/version new file mode 100644 index 00000000..27f9cd32 --- /dev/null +++ b/dnn/src/cuda/cub/version @@ -0,0 +1 @@ +1.8.0 diff --git a/dnn/src/cuda/cub/warp/specializations/warp_reduce_shfl.cuh b/dnn/src/cuda/cub/warp/specializations/warp_reduce_shfl.cuh new file mode 100644 index 00000000..bbbf37e5 --- /dev/null +++ b/dnn/src/cuda/cub/warp/specializations/warp_reduce_shfl.cuh @@ -0,0 +1,541 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_ptx.cuh" +#include "../../util_type.cuh" +#include "../../util_macro.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp reduction steps + STEPS = Log2::VALUE, + + /// Number of logical warps in a PTX warp + LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, + + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + + }; + + template + struct IsInteger + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + + /// Shared memory storage layout type + typedef NullType TempStorage; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + /// Lane index in logical warp + unsigned int lane_id; + + /// Logical warp index in 32-thread physical warp + unsigned int warp_id; + + /// 32-thread physical warp member mask of logical warp + unsigned int member_mask; + + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ WarpReduceShfl( + TempStorage &/*temp_storage*/) + { + lane_id = LaneId(); + warp_id = 0; + member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); + 
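+
+        // For logical warps narrower than the physical warp, re-map this
+        // thread's warp/lane indices into its logical sub-warp and shift the
+        // member mask so it covers only that sub-warp's lanes.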
+ if (!IS_ARCH_WARP) + { + warp_id = lane_id / LOGICAL_WARP_THREADS; + lane_id = lane_id % LOGICAL_WARP_THREADS; + member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS); + } + } + + + //--------------------------------------------------------------------- + // Reduction steps + //--------------------------------------------------------------------- + + /// Reduction (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int ReduceStep( + unsigned int input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across fp32 types) + __device__ __forceinline__ float ReduceStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long ReduceStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across long long types) + __device__ __forceinline__ long long ReduceStep( + long long input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across double types) + __device__ __forceinline__ double ReduceStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. + SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); + + output.key = input.key; + output.value = ReduceStep( + input.value, + cub::Sum(), + last_lane, + offset, + Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key != other_key) + output.value = input.value; + + return output; + } + + + + /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. + SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } + + + /// Reduction step (generic) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T output = input; + + _T temp = ShuffleDown(output, offset, last_lane, member_mask); + + // Perform reduction op if valid + if (offset + lane_id <= last_lane) + output = reduction_op(input, temp); + + return output; + } + + + /// Reduction step (specialized for small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. 
+ ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + /// Reduction step (specialized for types other than small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + //--------------------------------------------------------------------- + // Templated inclusive scan iteration + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ReduceStep( + T& input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + Int2Type /*step*/) + { + input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); + + ReduceStep(input, reduction_op, last_lane, Int2Type()); + } + + template + __device__ __forceinline__ void ReduceStep( + T& /*input*/, ///< [in] Calling thread's input item. + ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator + int /*last_lane*/, ///< [in] Index of last lane in segment + Int2Type /*step*/) + {} + + + //--------------------------------------------------------------------- + // Reduction operations + //--------------------------------------------------------------------- + + /// Reduction + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + int last_lane = (ALL_LANES_VALID) ? + LOGICAL_WARP_THREADS - 1 : + valid_items - 1; + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } + + + /// Segmented reduction + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + // Get the start flags for each thread in the warp. 
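+ // --- Editor's note (illustrative sketch; not part of the upstream CUB source) ---
+ // The statements below locate the last lane of the calling thread's segment
+ // without touching shared memory:
+ //   1. WARP_BALLOT packs every lane's head/tail flag into one 32-bit word.
+ //   2. For head flags, shifting right by one turns each head into the tail of
+ //      the preceding segment.
+ //   3. LaneMaskGe() discards flags below the calling lane, and the top lane of
+ //      the logical warp is always OR-ed in as a terminating tail.
+ //   4. __clz(__brev(x)) returns the index of the lowest set bit, i.e. the
+ //      nearest tail at or above this lane.
+ // Worked example (8-thread logical warp, heads at lanes 0 and 5, caller = lane 2):
+ //   ballot of heads          = 0b00100001
+ //   >> 1  (tails)            = 0b00010000
+ //   & LaneMaskGe(), | bit 7  = 0b10010000
+ //   last_lane = __clz(__brev(0b10010000)) = 4   // so lanes 2..4 are reduced together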
+ int warp_flags = WARP_BALLOT(flag, member_mask); + + // Convert to tail-segmented + if (HEAD_SEGMENTED) + warp_flags >>= 1; + + // Mask out the bits below the current thread + warp_flags &= LaneMaskGe(); + + // Mask of physical lanes outside the logical warp and convert to logical lanemask + if (!IS_ARCH_WARP) + { + warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS); + } + + // Mask in the last lane of logical warp + warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1); + + // Find the next set flag + int last_lane = __clz(__brev(warp_flags)); + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/warp/specializations/warp_reduce_smem.cuh b/dnn/src/cuda/cub/warp/specializations/warp_reduce_smem.cuh new file mode 100644 index 00000000..7baa573b --- /dev/null +++ b/dnn/src/cuda/cub/warp/specializations/warp_reduce_smem.cuh @@ -0,0 +1,372 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
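+ *
+ * (Editor's note, an illustrative summary rather than upstream documentation:)
+ * WarpReduce selects this smem-based specialization when the SHFL path cannot be
+ * used, i.e. when the target architecture predates SM30 or LOGICAL_WARP_THREADS
+ * is not a power of two. Each reduction step publishes the lane's partial to a
+ * shared buffer, synchronizes the member mask, and folds in the peer value at
+ * lane_id + (1 << STEP), roughly:
+ *
+ * \code
+ * // per-step pattern (sketch)
+ * temp_storage.reduce[lane_id] = partial;
+ * WARP_SYNC(member_mask);
+ * if (lane_id + OFFSET < valid_items)
+ *     partial = reduction_op(partial, temp_storage.reduce[lane_id + OFFSET]);
+ * WARP_SYNC(member_mask);
+ * \endcode
+ *
+ * The buffer holds 1.5 warps-worth of elements so that the largest load index,
+ * lane_id + HALF_WARP_THREADS, stays in bounds without additional guards.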
+ */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + + /// FlagT status (when not using ballot) + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + /// Shared memory flag type + typedef unsigned char SmemFlag; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + struct _TempStorage + { + T reduce[WARP_SMEM_ELEMENTS]; + SmemFlag flags[WARP_SMEM_ELEMENTS]; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpReduceSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? 
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Regular reduction + //--------------------------------------------------------------------- + + /** + * Reduction step + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp, + int STEP> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + + return ReduceStep(input, valid_items, reduction_op, Int2Type()); + } + + + /** + * Reduction step (terminate) + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp /*reduction_op*/, ///< [in] Reduction operator + Int2Type /*step*/) + { + return input; + } + + + //--------------------------------------------------------------------- + // Segmented reduction + //--------------------------------------------------------------------- + + + /** + * Ballot-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + // Get the start flags for each thread in the warp. + int warp_flags = WARP_BALLOT(flag, member_mask); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. 
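+ // --- Editor's note (illustrative; not part of the upstream CUB source) ---
+ // Unlike the SHFL specialization, this variant computes an exclusive bound:
+ // next_flag below is the index of the first lane belonging to the *next*
+ // segment (tail flags are shifted left by one so they mark that lane), and the
+ // reduction loop only folds peers while OFFSET + lane_id < next_flag. If no
+ // flag is set above the caller, __clz(__brev(0)) yields 32, which CUB_MIN then
+ // clips to LOGICAL_WARP_THREADS for sub-32-thread logical warps.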
+ warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if (!IS_ARCH_WARP) + { + warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + } + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input into buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if (OFFSET + lane_id < next_flag) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + } + + return input; + } + + + /** + * Smem-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + enum + { + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + // Alias flags onto shared data storage + volatile SmemFlag *flag_storage = temp_storage.flags; + + SmemFlag flag_status = (flag) ? SET : UNSET; + + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Get peer from buffer + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + + WARP_SYNC(member_mask); + + // Share flag through buffer + flag_storage[lane_id] = flag_status; + + // Get peer flag from buffer + SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; + + // Update input if peer was in range + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + { + if (HEAD_SEGMENTED) + { + // Head-segmented + if ((flag_status & SEEN) == 0) + { + // Has not seen a more distant head flag + if (peer_flag_status & SET) + { + // Has now seen a head flag + flag_status |= SEEN; + } + else + { + // Peer is not a head flag: grab its count + input = reduction_op(input, peer_addend); + } + + // Update seen status to include that of peer + flag_status |= (peer_flag_status & SEEN); + } + } + else + { + // Tail-segmented. 
Simply propagate flag status + if (!flag_status) + { + input = reduction_op(input, peer_addend); + flag_status |= peer_flag_status; + } + + } + } + } + + return input; + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * Reduction + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op) ///< [in] Reduction operator + { + return ReduceStep(input, valid_items, reduction_op, Int2Type<0>()); + } + + + /** + * Segmented reduction + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Reduction operator + { + return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/warp/specializations/warp_scan_shfl.cuh b/dnn/src/cuda/cub/warp/specializations/warp_scan_shfl.cuh new file mode 100644 index 00000000..7f4e1c94 --- /dev/null +++ b/dnn/src/cuda/cub/warp/specializations/warp_scan_shfl.cuh @@ -0,0 +1,632 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_type.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + }; + + template + struct IntegerTraits + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + /// Shared memory storage layout type + struct TempStorage {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + /// Lane index in logical warp + unsigned int lane_id; + + /// Logical warp index in 32-thread physical warp + unsigned int warp_id; + + /// 32-thread physical warp member mask of logical warp + unsigned int member_mask; + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ WarpScanShfl( + TempStorage &/*temp_storage*/) + { + lane_id = LaneId(); + warp_id = 0; + member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); + + if (!IS_ARCH_WARP) + { + warp_id = lane_id / LOGICAL_WARP_THREADS; + lane_id = lane_id % LOGICAL_WARP_THREADS; + member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scan steps + //--------------------------------------------------------------------- + + /// Inclusive prefix scan step (specialized for summation across int32 types) + __device__ __forceinline__ int InclusiveScanStep( + int input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + /// Inclusive prefix scan step (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int InclusiveScanStep( + unsigned int input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp32 types) + __device__ __forceinline__ float InclusiveScanStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long InclusiveScanStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across long long types) + __device__ __forceinline__ long long InclusiveScanStep( + long long input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp64 types) + __device__ __forceinline__ double InclusiveScanStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + +/* + /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePairInclusiveScanStep( + KeyValuePair input, ///< [in] Calling thread's input item. + ReduceBySegmentOp scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } +*/ + + /// Inclusive prefix scan step (generic) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T temp = ShuffleUp(input, offset, first_lane, member_mask); + + // Perform scan op if from a valid peer + _T output = scan_op(temp, input); + if (static_cast(lane_id) < first_lane + offset) + output = input; + + return output; + } + + + /// Inclusive prefix scan step (specialized for small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + + /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. 
+ ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return ShuffleIndex(input, src_lane, member_mask); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + _T input, ///< [in] Calling thread's input item. + _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + // Iterate scan steps + int segment_first_lane = 0; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output = InclusiveScanStep( + inclusive_output, + scan_op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + + } + + /// Inclusive scan, specialized for reduce-value-by-key + template + __device__ __forceinline__ void InclusiveScan( + KeyValuePair input, ///< [in] Calling thread's input item. + KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ReduceByKeyOp scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); + + unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); + + // Mask away all lanes greater than ours + ballot = ballot & LaneMaskLe(); + + // Find index of first set bit + int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output.value = InclusiveScanStep( + inclusive_output.value, + scan_op.op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
+ { + InclusiveScan(input, inclusive_output, scan_op); + + // Grab aggregate from last warp lane + warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + + if (lane_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); + Update(input, inclusive, exclusive, scan_op, is_integer); + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); + Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/warp/specializations/warp_scan_smem.cuh b/dnn/src/cuda/cub/warp/specializations/warp_scan_smem.cuh new file mode 100644 index 00000000..3237fcbf --- /dev/null +++ b/dnn/src/cuda/cub/warp/specializations/warp_scan_smem.cuh @@ -0,0 +1,397 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
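+ *
+ * (Editor's note, an illustrative summary rather than upstream documentation:)
+ * The scan is a Hillis-Steele style inclusive scan over a shared buffer that is
+ * 1.5 logical-warps wide. Lane data lives at index HALF_WARP_THREADS + lane_id;
+ * for primitive summation the low half of the buffer is pre-filled with the
+ * identity (zero), so every lane can read its peer at
+ * HALF_WARP_THREADS + lane_id - OFFSET without a bounds check. Worked example
+ * with an 8-thread logical warp (HALF_WARP_THREADS == 4, 12 buffer slots):
+ * lane 0 stores its partial at slot 4 and, for OFFSET = 1, 2, 4, reads slots
+ * 3, 2 and 0, all of which still hold the identity, while lane 7 reads the live
+ * partials of lanes 6, 5 and 3.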
+ */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + }; + + /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars) + typedef typename If<((Equals::VALUE || Equals::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + typedef CellT _TempStorage[WARP_SMEM_ELEMENTS]; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpScanSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? 
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) + template < + bool HAS_IDENTITY, + int STEP, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &partial, + ScanOp scan_op, + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share partial into buffer + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); + + WARP_SYNC(member_mask); + + // Update partial if addend is in range + if (HAS_IDENTITY || (lane_id >= OFFSET)) + { + T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); + partial = scan_op(addend, partial); + } + WARP_SYNC(member_mask); + + ScanStep(partial, scan_op, Int2Type()); + } + + + /// Basic inclusive scan iteration(template unrolled, base-case specialization) + template < + bool HAS_IDENTITY, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &/*partial*/, + ScanOp /*scan_op*/, + Int2Type /*step*/) + {} + + + /// Inclusive prefix scan (specialized for summation across primitive types) + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + Sum scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + T identity = 0; + ThreadStore(&temp_storage[lane_id], (CellT) identity); + + WARP_SYNC(member_mask); + + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /// Inclusive prefix scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + if (lane_id == src_lane) + { + ThreadStore(temp_storage, (CellT) input); + } + + WARP_SYNC(member_mask); + + return (T)ThreadLoad(temp_storage); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOp scan_op) ///< [in] Binary scan operator + { + InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, scan_op); + + // Retrieve aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + if (lane_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT /*scan_op*/, + IsIntegerT /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + } + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T 
&exclusive, + T &warp_aggregate, + cub::Sum /*scan_o*/, + Int2Type /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + exclusive = inclusive - input; + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + // Broadcast warp aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + + // Update inclusive with initial value + inclusive = scan_op(initial_value, inclusive); + + // Get exclusive from exclusive + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); + + if (lane_id == 0) + exclusive = initial_value; + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/warp/warp_reduce.cuh b/dnn/src/cuda/cub/warp/warp_reduce.cuh new file mode 100644 index 00000000..189896b0 --- /dev/null +++ b/dnn/src/cuda/cub/warp/warp_reduce.cuh @@ -0,0 +1,612 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "specializations/warp_reduce_shfl.cuh" +#include "specializations/warp_reduce_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) + * + * \tparam T The reduction input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpReduce} + * \par + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + * \par + * The code snippet below illustrates a single warp sum reduction within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * ... 
+ * + * // Only the first warp performs a reduction + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sum to lane0 + * int aggregate = WarpReduce(temp_storage).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. + * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + }; + +public: + + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpReduceShfl, + WarpReduceSmem >::Type InternalWarpReduce; + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + +private: + + /// Shared memory storage layout type for WarpReduce + typedef typename InternalWarpReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + +public: + + /// \smemstorage{WarpReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. + */ + __device__ __forceinline__ WarpReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()) + {} + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Return the warp-wide sums to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); + } + + /** + * \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).Sum( + * thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is + * undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + // Determine if we don't need bounds checking + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( + * thread_data, head_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). 
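Read with the template arguments restored, the four-warp sum pattern documented above can be written as a complete kernel. A minimal sketch, assuming cub::WarpReduce<int>, a 128-thread block, and hypothetical names (warp_sum_kernel, d_in, d_warp_sums):

#include <cub/cub.cuh>

// Four logical warps per 128-thread block; each warp reduces its own 32 items.
__global__ void warp_sum_kernel(const int* d_in, int* d_warp_sums)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage[4];

    int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];

    // Warp-wide sum; only lane 0 of each warp receives a defined aggregate.
    int warp_id   = threadIdx.x / 32;
    int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);

    if (threadIdx.x % 32 == 0)
        d_warp_sums[blockIdx.x * 4 + warp_id] = aggregate;
}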
+ * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * + */ + template < + typename FlagT> + __device__ __forceinline__ T HeadSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return HeadSegmentedReduce(input, head_flag, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( + * thread_data, tail_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename FlagT> + __device__ __forceinline__ T TailSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return TailSegmentedReduce(input, tail_flag, cub::Sum()); + } + + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + /** + * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp max reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide reductions to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( + * thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, + * \p 95, and \p 127, respectively (and is undefined in other threads). 
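The segmented variants take the head/tail flag as an extra per-thread argument. A minimal single-warp sketch of HeadSegmentedSum, again assuming cub::WarpReduce<int> and hypothetical buffers d_in, d_flags, d_out:

#include <cub/cub.cuh>

// A segment starts wherever d_flags[i] != 0; lane 0 always starts a segment.
__global__ void head_segmented_sum_kernel(const int* d_in, const int* d_flags, int* d_out)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;

    if (threadIdx.x < 32) {
        int thread_data = d_in[threadIdx.x];
        int head_flag   = d_flags[threadIdx.x];

        // Each segment's sum is returned to that segment's first lane.
        int seg_sum = WarpReduce(temp_storage).HeadSegmentedSum(thread_data, head_flag);

        if (head_flag)
            d_out[threadIdx.x] = seg_sum;
    }
}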
+ * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); + } + + /** + * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).Reduce( + * thread_data, cub::Max(), valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is + * undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction operator + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( + * thread_data, head_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. 
(and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T HeadSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( + * thread_data, tail_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T TailSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); + } + + + + //@} end member group +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cub/warp/warp_scan.cuh b/dnn/src/cuda/cub/warp/warp_scan.cuh new file mode 100644 index 00000000..c7af0d34 --- /dev/null +++ b/dnn/src/cuda/cub/warp/warp_scan.cuh @@ -0,0 +1,936 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "specializations/warp_scan_shfl.cuh" +#include "specializations/warp_scan_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png) + * + * \tparam T The scan input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * - Supports non-commutative scan operators + * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. 
generic scan) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpScan} + * \par + * The code snippet below illustrates four concurrent warp prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, 3, ..., 31}. + * + * \par + * The code snippet below illustrates a single warp prefix sum within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for one warp + * __shared__ typename WarpScan::TempStorage temp_storage; + * ... + * + * // Only the first warp performs a prefix sum + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}. + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), + + /// Whether the data type is an integer (which has fully-associative addition) + IS_INTEGER = ((Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER)) + }; + + /// Internal specialization. 
Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpScanShfl, + WarpScanSmem >::Type InternalWarpScan; + + /// Shared memory storage layout type for WarpScan + typedef typename InternalWarpScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + unsigned int lane_id; + + + + /****************************************************************************** + * Public types + ******************************************************************************/ + +public: + + /// \smemstorage{WarpScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. + */ + __device__ __forceinline__ WarpScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + InclusiveScan(input, inclusive_output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); + } + + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan( + * thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. 
+ * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + initial_value, + Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Combination (inclusive & exclusive) prefix scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. 
Because no initial value is supplied, the \p exclusive_output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * The corresponding output \p exclusive_partial in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * The corresponding output \p exclusive_partial in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. 
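A minimal sketch of the combined inclusive/exclusive scan, assuming cub::WarpScan<int>, a single warp, INT_MIN as the seed of the max-scan, and hypothetical buffer names d_in, d_incl, d_excl:

#include <climits>
#include <cub/cub.cuh>

__global__ void warp_scan_kernel(const int* d_in, int* d_incl, int* d_excl)
{
    typedef cub::WarpScan<int> WarpScan;
    __shared__ typename WarpScan::TempStorage temp_storage;

    if (threadIdx.x < 32) {
        int thread_data = d_in[threadIdx.x];

        // Inclusive and exclusive max-scan partials in one call;
        // INT_MIN seeds the exclusive output of lane 0.
        int inclusive_partial, exclusive_partial;
        WarpScan(temp_storage).Scan(thread_data, inclusive_partial, exclusive_partial,
                                    INT_MIN, cub::Max());

        d_incl[threadIdx.x] = inclusive_partial;
        d_excl[threadIdx.x] = exclusive_partial;
    }
}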
+ * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Data exchange + *********************************************************************/ + //@{ + + /** + * \brief Broadcast the value \p input from warp-lanesrc_lane to all lanes in the warp + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the warp-wide broadcasts of values from + * lanes0 in each of four warps to all other threads in those warps. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Broadcast from lane0 in each warp to all other threads in the warp + * int warp_id = threadIdx.x / 32; + * thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p thread_data will be + * {0, 0, ..., 0} in warp0, + * {32, 32, ..., 32} in warp1, + * {64, 64, ..., 64} in warp2, etc. + */ + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return InternalWarpScan(temp_storage).Broadcast(input, src_lane); + } + + //@} end member group + +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/dnn/src/cuda/cuda_shfl_compat.cuh b/dnn/src/cuda/cuda_shfl_compat.cuh new file mode 100644 index 00000000..85ac9e12 --- /dev/null +++ b/dnn/src/cuda/cuda_shfl_compat.cuh @@ -0,0 +1,20 @@ +/** + * \file dnn/src/cuda/cuda_shfl_compat.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */
+#pragma once
+
+#if __CUDACC_VER_MAJOR__ >= 9
+#define __shfl(x, y, z) __shfl_sync(0xffffffffu, x, y, z)
+#define __shfl_up(x, y, z) __shfl_up_sync(0xffffffffu, x, y, z)
+#define __shfl_down(x, y, z) __shfl_down_sync(0xffffffffu, x, y, z)
+#define __shfl_xor(x, y, z) __shfl_xor_sync(0xffffffffu, x, y, z)
+#endif
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/cuda/cudnn_with_check.h b/dnn/src/cuda/cudnn_with_check.h
new file mode 100644
index 00000000..4511f99d
--- /dev/null
+++ b/dnn/src/cuda/cudnn_with_check.h
@@ -0,0 +1,18 @@
+/**
+ * \file dnn/src/cuda/cudnn_with_check.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include <cudnn.h>
+
+#if !(CUDNN_MAJOR >= 5)
+#error "cuDNN version must be at least 5."
+#endif
diff --git a/dnn/src/cuda/cudnn_wrapper.cpp b/dnn/src/cuda/cudnn_wrapper.cpp
new file mode 100644
index 00000000..e2025588
--- /dev/null
+++ b/dnn/src/cuda/cudnn_wrapper.cpp
@@ -0,0 +1,435 @@
+/**
+ * \file dnn/src/cuda/cudnn_wrapper.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "src/cuda/cudnn_wrapper.h"
+
+#include "src/common/utils.h"
+#include "src/cuda/utils.h"
+
+namespace {
+
+using namespace megdnn;
+
+cudnnDataType_t to_cudnn_dtype(DType type,
+                               const param::Convolution::Format format = {}) {
+    switch (type.enumv()) {
+        case DTypeEnum::Float32:
+            return CUDNN_DATA_FLOAT;
+        case DTypeEnum::Float16:
+            return CUDNN_DATA_HALF;
+#if CUDNN_MAJOR >= 7
+        case DTypeEnum::Int32:
+        case DTypeEnum::QuantizedS32:
+            return CUDNN_DATA_INT32;
+#endif
+#if CUDNN_MAJOR >= 6
+        case DTypeEnum::QuantizedS8: {
+            if (format == param::Convolution::Format::NCHW4)
+                return CUDNN_DATA_INT8x4;
+#if CUDNN_VERSION >= 7500
+            else if (format == param::Convolution::Format::NCHW32)
+                return CUDNN_DATA_INT8x32;
+#endif
+            else
+                return CUDNN_DATA_INT8;
+        }
+
+        case DTypeEnum::Int8: {
+            if (format == param::Convolution::Format::NCHW4)
+                return CUDNN_DATA_INT8x4;
+#if CUDNN_VERSION >= 7500
+            else if (format == param::Convolution::Format::NCHW32)
+                return CUDNN_DATA_INT8x32;
+#endif
+            else
+                return CUDNN_DATA_INT8;
+        }
+#endif
+        default:
+#if CUDNN_MAJOR >= 6
+            megdnn_throw(megdnn_mangle("dtype must be float16/float32/int8/int32"));
+#else
+            megdnn_throw(megdnn_mangle("dtype must be float16/float32"));
+#endif
+    }
+
+}
+
+cudnnTensorFormat_t to_cudnn_format(const param::Convolution::Format format) {
+    switch (format) {
+        case param::Convolution::Format::NCHW:
+            return CUDNN_TENSOR_NCHW;
+#if CUDNN_MAJOR >= 7
+        case param::Convolution::Format::NCHW4:
+        case param::Convolution::Format::NCHW32:
+            return CUDNN_TENSOR_NCHW_VECT_C;
+#endif
+        case param::Convolution::Format::NHWC:
+            return CUDNN_TENSOR_NHWC;
+        default:
+            megdnn_assert_internal(0);
+    }
+}
+
+} // namespace
+
+namespace megdnn {
+namespace cuda {
+
+cudnnDataType_t get_compute_type_fp16(
+        param::Convolution::ComputeMode comp_mode) {
+    using Param = param::Convolution;
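    // Note on the branches below: a DEFAULT compute mode asks for cuDNN's
    // TRUE_HALF_CONFIG (fp16 storage with fp16 accumulation), which requires
    // genuine fp16 arithmetic, i.e. compute capability 5.3 or later; on older
    // devices a warning is logged and the code falls back to fp32 accumulation.
    // A FLOAT32 compute mode always selects PSEUDO_HALF_CONFIG (fp16 storage,
    // fp32 accumulation).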
cudnnDataType_t compute_type; + if (comp_mode == Param::ComputeMode::DEFAULT) { + // TRUE_HALF_CONFIG + if (is_compute_capability_required(5, 3)) { + compute_type = CUDNN_DATA_HALF; + } else { + auto&& device_prop = current_device_prop(); + int major = device_prop.major, minor = device_prop.minor; + MEGDNN_MARK_USED_VAR(major); + MEGDNN_MARK_USED_VAR(minor); + megdnn_log_warn( + "TRUE_HALF_CONFIG only supported on architectures with " + "true fp16 support, i.e., compute capability 5.3 and " + "later (got %d.%d). Use PSEUDO_HALF_CONFIG instead", + major, minor); + compute_type = CUDNN_DATA_FLOAT; + } + } else { + megdnn_assert(comp_mode == Param::ComputeMode::FLOAT32); + // PSEUDO_HALF_CONFIG + compute_type = CUDNN_DATA_FLOAT; + } + return compute_type; +} + +TensorDesc::TensorDesc() { + cudnn_check(cudnnCreateTensorDescriptor(&desc)); +} + +TensorDesc::~TensorDesc() { + cudnn_check(cudnnDestroyTensorDescriptor(desc)); +} + +void TensorDesc::set(const TensorLayout& layout, + const param::Convolution::Format format) { + // Layout can be not contiguous; group conv needs it. + // megdnn_assert_contiguous(layout); + if (format == param::Convolution::Format::NCHW4 || + format == param::Convolution::Format::NCHW32) + megdnn_assert_eq_size_t(layout.ndim, 5_z); + else + megdnn_assert_eq_size_t(layout.ndim, 4_z); + + size_t c_pos, spatial_pos; + if (format == param::Convolution::Format::NCHW || + format == param::Convolution::Format::NCHW4 || + format == param::Convolution::Format::NCHW32) { + c_pos = 1; + spatial_pos = 2; + } else { + megdnn_assert(format == param::Convolution::Format::NHWC); + c_pos = 3; + spatial_pos = 1; + } + if (format == param::Convolution::Format::NCHW4) { + megdnn_assert(layout.is_physical_contiguous()); + cudnn_check(cudnnSetTensor4dDescriptor( + desc, to_cudnn_format(format), + to_cudnn_dtype(layout.dtype, format), layout.shape[0], + layout.shape[c_pos] * 4, layout.shape[spatial_pos + 0], + layout.shape[spatial_pos + 1])); + } else if (format == param::Convolution::Format::NCHW32) { + megdnn_assert(layout.is_physical_contiguous()); + cudnn_check(cudnnSetTensor4dDescriptor( + desc, to_cudnn_format(format), + to_cudnn_dtype(layout.dtype, format), layout.shape[0], + layout.shape[c_pos] * 32, layout.shape[spatial_pos + 0], + layout.shape[spatial_pos + 1])); + + } else { + cudnn_check(cudnnSetTensor4dDescriptorEx( + desc, to_cudnn_dtype(layout.dtype), layout.shape[0], + layout.shape[c_pos], layout.shape[spatial_pos + 0], + layout.shape[spatial_pos + 1], layout.stride[0], + layout.stride[c_pos], layout.stride[spatial_pos + 0], + layout.stride[spatial_pos + 1])); + } +} + +template +FilterDesc::FilterDesc() { + cudnn_check(cudnnCreateFilterDescriptor(&desc)); +} + +template +FilterDesc::~FilterDesc() { + cudnn_check(cudnnDestroyFilterDescriptor(desc)); +} + +template +void FilterDesc::set( + const typename ConvolutionBase::CanonizedFilterMeta& + filter_meta) { + megdnn_assert(filter_meta.spatial_ndim == 2); +#if CUDNN_VERSION < 7500 + megdnn_assert(filter_meta.dilation[0] == 1 && filter_meta.dilation[1] == 1); +#endif +#if CUDNN_MAJOR <= 6 + megdnn_assert(filter_meta.group == 1); +#endif + + // cuDNN version 6 or below filter_meta.group always is 1. + // So it is compatible for all cuDNN versions. 
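// ---------------------------------------------------------------------------
// Illustrative sketch (assumed usage; hypothetical helper name): choosing the
// cuDNN compute type for an fp16 convolution via get_compute_type_fp16().
// ComputeMode::DEFAULT requests TRUE_HALF_CONFIG (fp16 accumulation), which
// needs compute capability >= 5.3; otherwise the wrapper falls back to fp32
// accumulation (PSEUDO_HALF_CONFIG).
cudnnDataType_t pick_fp16_compute_type_example(bool prefer_true_half) {
    using Param = megdnn::param::Convolution;
    auto mode = prefer_true_half ? Param::ComputeMode::DEFAULT
                                 : Param::ComputeMode::FLOAT32;
    return megdnn::cuda::get_compute_type_fp16(mode);
}
// ---------------------------------------------------------------------------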
+ cudnn_check(cudnnSetFilter4dDescriptor( + desc, to_cudnn_dtype(filter_meta.dtype, filter_meta.format), + to_cudnn_format(filter_meta.format), + filter_meta.ocpg * filter_meta.group, // cudnn 6 group always be 1 + filter_meta.icpg, filter_meta.spatial[0], filter_meta.spatial[1])); +} + +template class FilterDesc; +template class FilterDesc; + +ConvDesc::ConvDesc() { + cudnn_check(cudnnCreateConvolutionDescriptor(&desc)); +#if CUDNN_VERSION >= 7000 + // cudnn enables tensor core when tensors have dataType = + // CUDNN_DATA_HALF, so it should be safe to enable globally + cudnn_check(cudnnSetConvolutionMathType(desc, CUDNN_TENSOR_OP_MATH)); +#endif +} + +ConvDesc::~ConvDesc() { + cudnn_check(cudnnDestroyConvolutionDescriptor(desc)); +} + +void ConvDesc::set(DType data_type, const param::Convolution& param, + const size_t nr_group) { + using Param = param::Convolution; + cudnnConvolutionMode_t mode; + switch (param.mode) { + case Param::Mode::CROSS_CORRELATION: + mode = CUDNN_CROSS_CORRELATION; + break; + case Param::Mode::CONVOLUTION: + mode = CUDNN_CONVOLUTION; + break; + default: + megdnn_throw(megdnn_mangle("conv mode must be conv or xcorr.")); + } + cudnnDataType_t compute_type; + MEGDNN_MARK_USED_VAR(compute_type); + if (data_type.enumv() == DTypeEnum::Float32) { + // FLOAT_CONFIG + compute_type = CUDNN_DATA_FLOAT; + } else if (data_type.enumv() == DTypeEnum::Float16) { + auto comp_mode = param.compute_mode; + compute_type = get_compute_type_fp16(comp_mode); +#if CUDNN_MAJOR >= 7 + } else if (data_type.category() == DTypeCategory::INT || + data_type.category() == DTypeCategory::QUANTIZED) { + compute_type = CUDNN_DATA_INT32; +#endif + } else { + megdnn_throw(megdnn_mangle("unspport data type for conv bias")); + } +#if CUDNN_MAJOR >= 7 + cudnn_check(cudnnSetConvolutionGroupCount(desc, nr_group)); +#else + megdnn_assert(nr_group == 1); +#endif + +#if CUDNN_MAJOR >= 6 + cudnn_check(cudnnSetConvolution2dDescriptor( + desc, param.pad_h, param.pad_w, param.stride_h, param.stride_w, + param.dilate_h, param.dilate_w, mode, compute_type)); +#else + cudnn_check(cudnnSetConvolution2dDescriptor( + desc, param.pad_h, param.pad_w, param.stride_h, param.stride_w, + param.dilate_h, param.dilate_w, mode)); +#endif +} + +PoolingDesc::PoolingDesc() { + cudnn_check(cudnnCreatePoolingDescriptor(&desc)); +} + +PoolingDesc::~PoolingDesc() { + cudnn_check(cudnnDestroyPoolingDescriptor(desc)); +} + +void PoolingDesc::set(const param::Pooling& param) { + cudnnPoolingMode_t mode; + switch (param.mode) { + case param::Pooling::Mode::MAX: + mode = CUDNN_POOLING_MAX; + break; + case param::Pooling::Mode::AVERAGE: + mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING: + mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + break; + } + cudnn_check(cudnnSetPooling2dDescriptor( + desc, mode, CUDNN_NOT_PROPAGATE_NAN, param.window_h, param.window_w, + param.pad_h, param.pad_w, param.stride_h, param.stride_w)); +} + +LRNDesc::LRNDesc() { + cudnn_check(cudnnCreateLRNDescriptor(&desc)); +} + +LRNDesc::~LRNDesc() { + cudnn_check(cudnnDestroyLRNDescriptor(desc)); +} + +void LRNDesc::set(const param::LRN& param) { + megdnn_assert(param.n & 1, "n is %u", param.n); + megdnn_assert(param.n >= CUDNN_LRN_MIN_N, "n is %u, CUDNN_LRN_MIN_N is %d", + param.n, CUDNN_LRN_MIN_N); + megdnn_assert(param.n <= CUDNN_LRN_MAX_N, "n is %u, CUDNN_LRN_MAX_N is %d", + param.n, CUDNN_LRN_MAX_N); + megdnn_assert(param.k >= CUDNN_LRN_MIN_K, "k is %f, CUDNN_LRN_MIN_K is %lf", + 
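// ---------------------------------------------------------------------------
// Illustrative sketch (assumed usage; hypothetical helper and assumed param
// defaults): building a grouped 3x3 convolution descriptor with
// ConvDesc::set(). On cuDNN >= 7 the group count is set on the descriptor
// itself; older versions only accept nr_group == 1, which the wrapper asserts.
void make_grouped_conv_desc_example(megdnn::cuda::ConvDesc& desc,
                                    size_t nr_group) {
    megdnn::param::Convolution param;   // CROSS_CORRELATION, stride 1 assumed
                                        // as the defaults
    param.pad_h = param.pad_w = 1;      // keep spatial size for a 3x3 kernel
    desc.set(megdnn::dtype::Float32(), param, nr_group);
}
// ---------------------------------------------------------------------------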
param.k, CUDNN_LRN_MIN_K); + megdnn_assert(param.beta >= CUDNN_LRN_MIN_BETA, + "beta is %f, CUDNN_LRN_MIN_BETA is %lf", param.beta, + CUDNN_LRN_MIN_BETA); + // Note that alpha is divided by n in the cudnn implementation, + // so we have to multiply alpha by n ahead of time. + cudnn_check(cudnnSetLRNDescriptor(desc, param.n, param.alpha * param.n, + param.beta, param.k)); +} + +BNParamDesc::BNParamDesc() { + cudnn_check(cudnnCreateTensorDescriptor(&desc)); +} + +void BNParamDesc::set(const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode) { + cudnn_check(cudnnDeriveBNTensorDescriptor(desc, xDesc, mode)); +} + +BNParamDesc::~BNParamDesc() { + cudnn_check(cudnnDestroyTensorDescriptor(desc)); +} + +Tensor3DDesc::Tensor3DDesc() { + cudnn_check(cudnnCreateTensorDescriptor(&desc)); +} + +Tensor3DDesc::~Tensor3DDesc() { + cudnn_check(cudnnDestroyTensorDescriptor(desc)); +} + +int sc(const size_t x) { + return static_cast(x); +} +void Tensor3DDesc::set(const TensorLayout& layout, bool is_ndhwc) { + megdnn_assert_eq_size_t(layout.ndim, 5_z); + size_t c_pos, spatial_pos; + if (is_ndhwc) { + c_pos = 4; + spatial_pos = 1; + } else { // ncdhw + c_pos = 1; + spatial_pos = 2; + } + const int dimA[] = {sc(layout.shape[0]), sc(layout.shape[c_pos]), + sc(layout.shape[spatial_pos + 0]), + sc(layout.shape[spatial_pos + 1]), + sc(layout.shape[spatial_pos + 2])}; + + const int strideA[] = {sc(layout.stride[0]), sc(layout.stride[c_pos]), + sc(layout.stride[spatial_pos + 0]), + sc(layout.stride[spatial_pos + 1]), + sc(layout.stride[spatial_pos + 2])}; + + cudnn_check(cudnnSetTensorNdDescriptor(desc, to_cudnn_dtype(layout.dtype), + 5, dimA, strideA)); +} + +Filter3DDesc::Filter3DDesc() { + cudnn_check(cudnnCreateFilterDescriptor(&desc)); +} + +Filter3DDesc::~Filter3DDesc() { + cudnn_check(cudnnDestroyFilterDescriptor(desc)); +} + +void Filter3DDesc::set( + const Convolution3DBase::CanonizedFilterMeta& filter_meta) { + megdnn_assert(filter_meta.spatial_ndim == 3); +#if CUDNN_MAJOR <= 6 + megdnn_assert(filter_meta.group == 1); +#endif + + // cuDNN version 6 or below filter_meta.group always is 1. + // So it is compatible for all cuDNN versions. 
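// ---------------------------------------------------------------------------
// Worked example for the alpha rescaling in LRNDesc::set() above (illustration
// only): cuDNN divides alpha by the window size n internally, so the wrapper
// passes alpha * n. With param.n = 5 and param.alpha = 1e-4f,
// cudnnSetLRNDescriptor receives 5e-4f and cuDNN applies 5e-4f / 5 = 1e-4f per
// element, i.e. exactly the alpha that MegDNN semantics require.
// ---------------------------------------------------------------------------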
+ const int filterDimA[] = { + sc(filter_meta.ocpg * + filter_meta.group), // cudnn 6 group always be 1 + sc(filter_meta.icpg), sc(filter_meta.spatial[0]), + sc(filter_meta.spatial[1]), sc(filter_meta.spatial[2])}; + + cudnn_check(cudnnSetFilterNdDescriptor( + desc, to_cudnn_dtype(DType::from_enum(filter_meta.dtype_enum)), + CUDNN_TENSOR_NCHW, 5, filterDimA)); +} + +Conv3DDesc::Conv3DDesc() { + cudnn_check(cudnnCreateConvolutionDescriptor(&desc)); + +#if CUDNN_MAJOR >= 7 + // cudnn enables tensor core when tensors have dataType = CUDNN_DATA_HALF, + // so it should be safe to enable globally + cudnn_check(cudnnSetConvolutionMathType(desc, CUDNN_TENSOR_OP_MATH)); +#endif +} + +Conv3DDesc::~Conv3DDesc() { + cudnn_check(cudnnDestroyConvolutionDescriptor(desc)); +} + +void Conv3DDesc::set(const param::Convolution3D& param, const size_t nr_group) { + cudnnConvolutionMode_t mode; + switch (param.mode) { + case param::Convolution3D::Mode::CROSS_CORRELATION: + mode = CUDNN_CROSS_CORRELATION; + break; + case param::Convolution3D::Mode::CONVOLUTION: + mode = CUDNN_CONVOLUTION; + break; + default: + megdnn_throw(megdnn_mangle("conv mode must be conv or xcorr.")); + } +#if CUDNN_MAJOR >= 7 + cudnn_check(cudnnSetConvolutionGroupCount(desc, nr_group)); +#else + megdnn_assert(nr_group == 1); +#endif + + const int padA[] = {sc(param.pad_d), sc(param.pad_h), sc(param.pad_w)}, + filterStrideA[] = {sc(param.stride_d), sc(param.stride_h), + sc(param.stride_w)}, + dilationA[] = {sc(param.dilate_d), sc(param.dilate_h), + sc(param.dilate_w)}; + // not use true half + // in CUDNN_MAJOR < 6, all elements in dilA shoule be 1 + cudnn_check(cudnnSetConvolutionNdDescriptor( + desc, 3, padA, filterStrideA, dilationA, mode, CUDNN_DATA_FLOAT)); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/cudnn_wrapper.h b/dnn/src/cuda/cudnn_wrapper.h new file mode 100644 index 00000000..c4ada5d2 --- /dev/null +++ b/dnn/src/cuda/cudnn_wrapper.h @@ -0,0 +1,111 @@ +/** + * \file dnn/src/cuda/cudnn_wrapper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/basic_types.h" +#include "megdnn/oprs/nn.h" +#include "src/cuda/cudnn_with_check.h" + +namespace megdnn { +namespace cuda { + +/*! + * \brief get compute_type of convolution operations + */ +cudnnDataType_t get_compute_type_fp16( + param::Convolution::ComputeMode comp_mode); + +class TensorDesc { + public: + TensorDesc(); + //! 
default layout is nchw + void set(const TensorLayout& layout, const param::Convolution::Format = + param::Convolution::Format::NCHW); + ~TensorDesc(); + cudnnTensorDescriptor_t desc; +}; + +template +class FilterDesc { + public: + FilterDesc(); + void set(const typename ConvolutionBase::CanonizedFilterMeta &meta); + ~FilterDesc(); + cudnnFilterDescriptor_t desc; +}; + +class ConvDesc { + public: + ConvDesc(); + void set(DType data_type, const param::Convolution& param, + const size_t nr_group); + ~ConvDesc(); + cudnnConvolutionDescriptor_t desc; +}; + +class PoolingDesc { + public: + PoolingDesc(); + void set(const param::Pooling ¶m); + ~PoolingDesc(); + cudnnPoolingDescriptor_t desc; +}; + +class LRNDesc { + public: + LRNDesc(); + void set(const param::LRN ¶m); + ~LRNDesc(); + cudnnLRNDescriptor_t desc; +}; + + +class BNParamDesc { + public: + BNParamDesc(); + void set(const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode); + ~BNParamDesc(); + cudnnTensorDescriptor_t desc; +}; + +// the classes below is used to deal with 3d situations +class Tensor3DDesc { + public: + Tensor3DDesc(); + //! default layout is NCDHW + void set(const TensorLayout &layout, bool is_ndhwc = false); + ~Tensor3DDesc(); + cudnnTensorDescriptor_t desc; +}; + +class Filter3DDesc { + public: + Filter3DDesc(); + void set(const Convolution3DBase::CanonizedFilterMeta &meta); + ~Filter3DDesc(); + cudnnFilterDescriptor_t desc; +}; + +class Conv3DDesc { + public: + Conv3DDesc(); + void set(const param::Convolution3D ¶m, const size_t nr_group); + ~Conv3DDesc(); + cudnnConvolutionDescriptor_t desc; +}; + + + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/cumsum/cumsum.cu b/dnn/src/cuda/cumsum/cumsum.cu new file mode 100644 index 00000000..d62e35a3 --- /dev/null +++ b/dnn/src/cuda/cumsum/cumsum.cu @@ -0,0 +1,36 @@ +/** + * \file dnn/src/cuda/cumsum/cumsum.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern_impl.cuinl" + +namespace megdnn { +namespace cuda { +namespace cumsum { + +#define INST_(T, Op, exclusive, reverse) \ + template void run_kern( \ + T*, void*, uint32_t, uint32_t, uint32_t, uint32_t, const Op&, \ + cudaStream_t) +#define INST(T) \ + INST_(T, SumOp, true, true); \ + INST_(T, SumOp, false, true); \ + INST_(T, SumOp, true, false); \ + INST_(T, SumOp, false, false); + +#define cb(DType) INST(typename DTypeTrait::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +} // namespace cumsum +} // namespace cuda +} // namespace megdnn + +// vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/cumsum/kern.cuh b/dnn/src/cuda/cumsum/kern.cuh new file mode 100644 index 00000000..34bc5e0a --- /dev/null +++ b/dnn/src/cuda/cumsum/kern.cuh @@ -0,0 +1,73 @@ +/** + * \file dnn/src/cuda/cumsum/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
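// ---------------------------------------------------------------------------
// Illustrative sketch (assumed usage; hypothetical helper): the wrapper
// classes declared above are RAII holders -- the constructor creates the
// underlying cuDNN descriptor and the destructor destroys it, so a stack
// object covers one cuDNN call.
void pooling_desc_example() {
    megdnn::cuda::PoolingDesc pooling;
    megdnn::param::Pooling p;        // MAX pooling assumed as the default mode
    p.window_h = p.window_w = 2;
    p.stride_h = p.stride_w = 2;
    p.pad_h = p.pad_w = 0;
    pooling.set(p);
    // pooling.desc can now be passed to cudnnPoolingForward(...)
}   // descriptor released here by ~PoolingDesc()
// ---------------------------------------------------------------------------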
+ */ + +#pragma once + +#include "src/cuda/utils.cuh" + +#include +#include + +namespace megdnn { +namespace cuda { +namespace cumsum { + +//! compute conventional sum of elements +template +struct SumOp { + const T* data; + typedef SumOp ContigOp; + + SumOp(const T* d) : data(d) {} + + __host__ __device__ static T init() { return T(0); } + __device__ static T apply(T lhs, T rhs) { return lhs + rhs; } + __device__ T visit(uint32_t idx) const { return data[idx]; } + + static SumOp make_contig(const T* data) { return SumOp(data); } +}; + +/*! + * \brief cumsum kernel launcher; defined in kern_impl.cuinl + * \tparam T output data type + * \tparam Op reduction operator class, which must provide following interface: + * typdef ContigOp + * static T init(): the identity element + * static T apply(T lhs, T rhs): the reduction operation + * T visit(uint32_t idx) const: access input + * static ContigOp make_contig(const T *data): make an Oo to continue + * reduction on temp buffer + * + * Note that Op::init() must be accessible from both host and device. + * + * In exclusive mode, Op::init() would be filled to the boundary + * + * The buffer in *op* and *dst* should not have identical memory addresses. + */ +template +void run_kern(T* dst, void* workspace, uint32_t workspace_size, uint32_t A, + uint32_t B, uint32_t C, const Op& op, cudaStream_t stream); + +/*! + * \brief get required workspace size for cumsum, in bytes + * \param item_size size of item; i.e. sizeof(T) in run_kern + * + * Note: cuda device must be set to the computing device before calling this + * function. + */ +uint32_t get_workspace_in_bytes(uint32_t A, uint32_t B, uint32_t C, + uint32_t item_size); + +} // namespace cumsum +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/cumsum/kern_helper.cuh b/dnn/src/cuda/cumsum/kern_helper.cuh new file mode 100644 index 00000000..48781836 --- /dev/null +++ b/dnn/src/cuda/cumsum/kern_helper.cuh @@ -0,0 +1,29 @@ +/** + * \file dnn/src/cuda/cumsum/kern_helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include + +namespace megdnn { +namespace cuda { +namespace cumsum { + +void get_BX_BY(uint32_t A, uint32_t B, uint32_t C, uint32_t& BX, uint32_t& BY); + +uint32_t get_workspace_bytes_for_cub_1d(uint32_t nr_item, uint32_t item_size); + +} // namespace cumsum +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/cumsum/kern_impl.cu b/dnn/src/cuda/cumsum/kern_impl.cu new file mode 100644 index 00000000..cd6414c0 --- /dev/null +++ b/dnn/src/cuda/cumsum/kern_impl.cu @@ -0,0 +1,93 @@ +/** + * \file dnn/src/cuda/cumsum/kern_impl.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
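// ---------------------------------------------------------------------------
// Illustrative sketch (assumed usage of the launcher declared above;
// hypothetical helper): exclusive, forward cumsum along the B axis of a float
// tensor viewed as (A, B, C), using the SumOp reduction. The template
// parameter order <T, Op, exclusive, reverse> is assumed from the INST_ macro
// instantiations in cumsum.cu.
void run_exclusive_cumsum_example(float* dst, const float* src, void* workspace,
                                  uint32_t workspace_size, uint32_t A,
                                  uint32_t B, uint32_t C, cudaStream_t stream) {
    using namespace megdnn::cuda::cumsum;
    SumOp<float> op(src);
    run_kern<float, SumOp<float>, /*exclusive=*/true, /*reverse=*/false>(
            dst, workspace, workspace_size, A, B, C, op, stream);
}
// ---------------------------------------------------------------------------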
+ */ + +#include "./kern.cuh" +#include "./kern_helper.cuh" +#include "./kern_impl.cuinl" + +using namespace megdnn::cuda; +using namespace cumsum::detail::cubwrap; + +namespace { + +template +struct FakeOp { + __device__ T visit(int) { return 0; } + __device__ static T apply(T, T) { return 0; } +}; + +template +uint32_t get_workspace_elems_for_cub_1d_with_dtype_reverse(uint32_t nr_item) { + typedef FakeOp Op; + Op op; + InputIterator inp_iter(op, nr_item); + OutputIterator out_iter(NULL, nr_item); + ScanOp scan_op; + + size_t wk_size0 = 0, wk_size1 = 0; + cuda_check(cub::DeviceScan::ExclusiveScan(NULL, wk_size0, inp_iter, + out_iter, scan_op, 0, nr_item)); + cuda_check(cub::DeviceScan::InclusiveScan(NULL, wk_size1, inp_iter, + out_iter, scan_op, nr_item)); + return std::max(wk_size0, wk_size1); +} + +template +uint32_t get_workspace_elems_for_cub_1d_with_dtype(uint32_t nr_item) { + return std::max(get_workspace_elems_for_cub_1d_with_dtype_reverse( + nr_item), + get_workspace_elems_for_cub_1d_with_dtype_reverse( + nr_item)); +} + +} // namespace + +uint32_t cumsum::get_workspace_bytes_for_cub_1d(uint32_t nr_item, + uint32_t item_size) { + switch (item_size) { +#define CASE(size, type) \ + case size: \ + return get_workspace_elems_for_cub_1d_with_dtype(nr_item) + CASE(1, uint8_t); + CASE(2, uint16_t); + CASE(4, uint32_t); + CASE(8, uint64_t); +#undef CASE + default: + report_error(megdnn_mangle("unsupported item size in cumsum")); + } +} + +uint32_t cumsum::get_workspace_in_bytes(uint32_t A, uint32_t B, uint32_t C, + uint32_t item_size) { + if (A == 1 && C == 1) { + return get_workspace_bytes_for_cub_1d(B, item_size); + } + uint32_t BX, BY; + get_BX_BY(A, B, C, BX, BY); + uint32_t BY2 = BY * 2; + uint32_t res = 0; + while (B > BY2) { + B = (B + BY2 - 1) / BY2; + res += A * B * C; + } + return res * item_size; +} + +void cumsum::get_BX_BY(uint32_t /* A */, uint32_t /* B */, uint32_t C, + uint32_t& BX, uint32_t& BY) { + BX = 1; + while (BX < C && BX * 2 <= 32) + BX *= 2; + BY = 512 / BX; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/cumsum/kern_impl.cuinl b/dnn/src/cuda/cumsum/kern_impl.cuinl new file mode 100644 index 00000000..0620de6d --- /dev/null +++ b/dnn/src/cuda/cumsum/kern_impl.cuinl @@ -0,0 +1,337 @@ +/** + * \file dnn/src/cuda/cumsum/kern_impl.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" +#include "./kern_helper.cuh" +#include "megdnn/dtype.h" +#include "src/cuda/cub/device/device_scan.cuh" +#include "src/cuda/cub/util_ptx.cuh" + +namespace megdnn { +namespace cuda { +namespace cumsum { +namespace detail { + +/** + * src shape is (A, B, C), performing blockwise scan over B axis. + * Each CUDA block calculates a blockwise scan result of size (BY2, BX). + * The block area corresponds to a 2-D area on (B, C) dimension of src. + * + * Per-block prefix sum is stored in dst (dst has the same shape as src). + * + * The whole scan result of each block as a single value is stored in + * block_sum (of shape (A, B/BY2, C)). + * + * block_sum can be NULL. + * + * src and dst can be inplace. + * + * We need to launch (C/BX)*ceil(B/BY2)*A blocks in total. 
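 *
 * Worked example (illustration only): for A = 4, B = 100000, C = 1,
 * get_BX_BY gives BX = 1, BY = 512, so BY2 = 1024. The first pass scans
 * blocks of 1024 elements along B and writes ceil(100000 / 1024) = 98 block
 * sums per (A, C) slice; since 98 <= BY2, one more blockwise scan of the
 * block sums plus update_kernel finishes the job. get_workspace_in_bytes
 * therefore reserves A * 98 * C = 392 elements (1568 bytes for float32) for
 * the block sums.
 *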
+ * Because in CUDA the number of launched blocks over y and z axis are + * limited (at most 65535), we launch all blocks over axis x. + * + * Param: exclusive + * This flag specifies whether the scan is inclusive or exclusive, namely + * whether src_i influences dst_i. + * + * Param: reverse: + * This flag specifies whether the scan is forward or backward. + * + * Example: + * !exclusive && !reverse: dst_i = op(src_0, src_1, ..., src_i) + * !exclusive && reverse: dst_i = op(src_i, src_{i+1}, ..., src_{n-1}) + * exclusive && !reverse: dst_i = op(src_0, src_1, ..., src{i-1}) + * exclusive && reverse: dst_i = op(src_{i+1}, src{i+2}, ..., src{n-1}) + * + * Op should have the following methods: + * static T init() + * static T apply(T lhs, T rhs) + */ +template +__global__ void scan_kernel(T *dst, T *block_sum, + uint32_t A, uint32_t B, uint32_t C, const Op op) { + constexpr size_t warp_size = 32; + const uint32_t BY2 = BY*2; + const uint32_t B_ = (B+BY2-1) / BY2; + const uint32_t C_ = (C+BX-1) / BX; + const uint32_t GX = C_; + const uint32_t GY = B_; + // src, dst: (A, B, C) + // block_sum: (A, B_, C) + // shared: (BY2+1, BX) + const uint32_t bx = blockIdx.x % GX; + const uint32_t by = blockIdx.x / GX % GY; + const uint32_t bz = blockIdx.x / GX / GY; + const uint32_t tx = threadIdx.x; + const uint32_t ty = threadIdx.y; + // TODO: shared memory bank conflict optimization +#define shared_idx(x) ((x) + ((x) >> 5)) + volatile __shared__ T cache[shared_idx((BY2+1)*BX)]; + uint32_t base_offset = (bz)*B*C + (by*BY2)*C + (bx*BX); + dst += base_offset; + // load to cache + if (reverse) { + cache[shared_idx((BY2-ty)*BX+tx)] = ty+by*BY2 < B && tx+bx*BX < C ? + op.visit(base_offset + ty*C + tx) : Op::init(); + } else { + cache[shared_idx((ty+1)*BX+tx)] = ty+by*BY2 < B && tx+bx*BX < C ? + op.visit(base_offset + ty*C + tx) : Op::init(); + } + if (reverse) { + cache[shared_idx((BY-ty)*BX+tx)] = + (ty+BY) + by*BY2 < B && tx+bx*BX < C ? + op.visit(base_offset + (ty+BY)*C + tx) : Op::init(); + } else { + cache[shared_idx((ty+BY+1)*BX+tx)] = + (ty+BY) + by*BY2 < B && tx+bx*BX < C ? + op.visit(base_offset + (ty+BY)*C + tx) : Op::init(); + } + if (ty == 0) { + cache[shared_idx(tx)] = Op::init(); + } + __syncthreads(); + uint32_t total, stride; + // first pass +#pragma unroll + for (total = BY, stride = 1; + total > 0; + total >>= 1, stride <<= 1) + { + if (ty < total) { + uint32_t ai = shared_idx(stride * (2*ty+1) * BX + tx); + uint32_t bi = shared_idx(stride * (2*ty+2) * BX + tx); + cache[bi] = Op::apply(cache[bi], cache[ai]); + } + if (total > warp_size/BX) __syncthreads(); + else cub::WARP_SYNC(0xffffffff); + } + // second pass +#pragma unroll + for (total = 1, stride = BY; + stride > 0; + total <<= 1, stride >>= 1) + { + if (total > warp_size/BX) __syncthreads(); + else cub::WARP_SYNC(0xffffffff); + if (ty < total) { + uint32_t ai = shared_idx(stride * (2*ty+0) * BX + tx); + uint32_t bi = shared_idx(stride * (2*ty+1) * BX + tx); + cache[bi] = Op::apply(cache[bi], cache[ai]); + } + } + __syncthreads(); + uint32_t ty_offset = (exclusive ? 
0 : 1); + if (ty+by*BY2 < B && tx+bx*BX < C) { + if (reverse) { + dst[ty*C + tx] = cache[shared_idx((BY2-1-ty+ty_offset)*BX + tx)]; + } else { + dst[ty*C + tx] = cache[shared_idx((ty+ty_offset)*BX + tx)]; + } + } + if (ty+BY+by*BY2 < B && tx+bx*BX < C) { + if (reverse) { + dst[(ty+BY)*C + tx] = + cache[shared_idx((BY2-1-(ty+BY)+ty_offset)*BX + tx)]; + } else { + dst[(ty+BY)*C + tx] = + cache[shared_idx((ty+BY+ty_offset)*BX + tx)]; + } + } + if (block_sum && ty == 0 && bx*BX+tx < C) { + block_sum[(bz)*B_*C + (by)*C + (bx*BX) + tx] = + cache[shared_idx(BY2*BX + tx)]; + } +} + +template +__global__ void update_kernel(T *dst, const T *delta, + uint32_t A, uint32_t B, uint32_t C) { + const uint32_t BY2 = BY*2; + const uint32_t B_ = (B+BY2-1) / BY2; + const uint32_t C_ = (C+BX-1) / BX; + const uint32_t GX = C_; + const uint32_t GY = B_; + // src: (A, B, C) + // delta: (A, B_, C) + const uint32_t bx = blockIdx.x % GX; + const uint32_t by = blockIdx.x / GX % GY; + const uint32_t bz = blockIdx.x / GX / GY; + const uint32_t tx = threadIdx.x; + const uint32_t ty = threadIdx.y; + + if (tx + bx*BX < C) { + T delta_v = delta[(bz)*B_*C + (by)*C + (bx*BX) + tx]; + if (ty+by*BY2 < B && tx+bx*BX < C) { + T &res = dst[bz*B*C + (ty+by*BY2)*C + (tx+bx*BX)]; + res = Op::apply(res, delta_v); + } + if (ty+BY+by*BY2 < B && tx+bx*BX < C) { + T &res = dst[bz*B*C + (ty+BY+by*BY2)*C + (tx+bx*BX)]; + res = Op::apply(res, delta_v); + } + } +} + +template +void run_kern_multiAC(T* dst, T* workspace, uint32_t A, uint32_t B, + uint32_t C, const Op& op, cudaStream_t stream); + +template +void do_run_kern(T *dst, T *workspace, + uint32_t A, uint32_t B, uint32_t C, const Op &op, cudaStream_t stream) { + const uint32_t BY2 = BY*2; + const uint32_t B_ = (B+BY2-1)/BY2; + const uint32_t C_ = (C+BX-1)/BX; + + dim3 blocks(C_*B_*A); + dim3 threads(BX, BY); + + scan_kernel + <<>>( + dst, B > BY2 ? workspace : NULL, A, B, C, op); + if (B <= BY2) + return; + + run_kern_multiAC( + workspace, workspace + A*B_*C, A, B_, C, + Op::make_contig(workspace), stream); + update_kernel<<>>( + dst, workspace, A, B, C); +} + +template +void run_kern_multiAC(T* dst, T* workspace, uint32_t A, uint32_t B, uint32_t C, + const Op& op, cudaStream_t stream) { +#define IF(BX, BY) \ + do { \ + if (vBX == BX && vBY == BY) { \ + return do_run_kern( \ + dst, workspace, A, B, C, op, stream); \ + } \ + } while (0) + + uint32_t vBX, vBY; + get_BX_BY(A, B, C, vBX, vBY); + IF(1, 512); + IF(2, 256); + IF(4, 128); + IF(8, 64); + IF(16, 32); + IF(32, 16); + megdnn_trap(); +#undef IF +} + +//! 
wrap cub library for 1-dim scan +namespace cubwrap { + +template +class InputIterator : public std::iterator { + int m_offset, m_len; + Op m_op; + +public: + InputIterator(Op op, int len) : m_offset(0), m_len(len), m_op(op) {} + + __device__ InputIterator(int offset, int len, Op op) + : m_offset(offset), m_len(len), m_op(op) {} + + __device__ T operator[](int idx) { + idx += m_offset; + if (reverse) { + idx = m_len - 1 - idx; + } + return m_op.visit(idx); + } + + __device__ InputIterator operator+(int offset) { + return InputIterator(m_offset + offset, m_len, m_op); + } +}; + +template +class OutputIterator + : public std::iterator { + int m_offset, m_len; + T* m_dst; + +public: + OutputIterator(T* dst, int len) : m_offset(0), m_len(len), m_dst(dst) {} + + __device__ OutputIterator(int offset, int len, T* dst) + : m_offset(offset), m_len(len), m_dst(dst) {} + + __device__ T& operator[](int idx) { + idx += m_offset; + if (reverse) { + idx = m_len - 1 - idx; + } + return m_dst[idx]; + } + + __device__ OutputIterator operator+(int offset) { + return OutputIterator(m_offset + offset, m_len, m_dst); + } +}; + +template +struct ScanOp { + __device__ __host__ T operator()(T a, T b) { + // cub requires it to be a __device__ __host__ function but MegDNN has + // no such contraint on Op::apply; so we just trap on host +#ifdef __CUDA_ARCH__ + return Op::apply(a, b); +#else + megdnn_trap(); +#endif + } +}; + +template +void invoke(T* dst, void* workspace, size_t wk_size, const Op& op, uint32_t len, + cudaStream_t stream) { + InputIterator inp_iter(op, len); + OutputIterator out_iter(dst, len); + ScanOp scan_op; + + if (exclusive) { + cuda_check(cub::DeviceScan::ExclusiveScan(workspace, wk_size, inp_iter, + out_iter, scan_op, Op::init(), + len, stream)); + } else { + cuda_check(cub::DeviceScan::InclusiveScan( + workspace, wk_size, inp_iter, out_iter, scan_op, len, stream)); + } +} +} // namespace cubwrap + +} // namespace detail + +template +void run_kern(T* dst, void* workspace, uint32_t workspace_size, uint32_t A, + uint32_t B, uint32_t C, const Op& op, cudaStream_t stream) { + if (A == 1 && C == 1) { + return detail::cubwrap::invoke( + dst, workspace, workspace_size, op, B, stream); + } + + return detail::run_kern_multiAC( + dst, static_cast(workspace), A, B, C, op, stream); +} + +} // namespace cumsum +} // namespace cuda +} // namespace megdnn + + +// vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/cumsum/opr_impl.cpp b/dnn/src/cuda/cumsum/opr_impl.cpp new file mode 100644 index 00000000..75047037 --- /dev/null +++ b/dnn/src/cuda/cumsum/opr_impl.cpp @@ -0,0 +1,75 @@ +/** + * \file dnn/src/cuda/cumsum/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./opr_impl.h" +#include "./kern.cuh" + +#include "src/common/reduce_helper.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace cumsum; + +namespace { + +/*! 
+ * \brief compute cumsum reduction on (A, B, C) tensor to (A, 1, C) + */ +template +void dispatch(T* dst, T* workspace, size_t workspace_size, size_t A, size_t B, + size_t C, bool exclusive, bool reverse, const Op& op, + cudaStream_t stream) { +#define IF(exclusive_v, reverse_v) \ + if (exclusive == exclusive_v && reverse == reverse_v) { \ + run_kern( \ + dst, workspace, workspace_size, A, B, C, op, stream); \ + return; \ + } + IF(true, true) + IF(true, false) + IF(false, true) + IF(false, false) + megdnn_assert_internal(false); +#undef IF +} + +} // anonymous namespace + +void CumsumForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, + _megdnn_workspace workspace) { + check_exec(src.layout, dst.layout, workspace.size); + size_t A, B, C; + reduce::get_ABC(src.layout, A, B, C, param().axis); + auto stream = cuda_stream(handle()); +#define cb(DType) \ + if (src.layout.dtype == DType()) { \ + using ctype = DTypeTrait::ctype; \ + dispatch>( \ + dst.ptr(), workspace.ptr(), workspace.size, A, \ + B, C, param().exclusive, param().reverse, src.ptr(), \ + stream); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + megdnn_assert_internal(false); +} + +size_t CumsumForwardImpl::get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout&) { + size_t A, B, C; + reduce::get_ABC(src, A, B, C, param().axis); + cuda_check(cudaSetDevice(concrete_handle(handle())->device_id())); + return cumsum::get_workspace_in_bytes(A, B, C, src.dtype.size()); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/cumsum/opr_impl.h b/dnn/src/cuda/cumsum/opr_impl.h new file mode 100644 index 00000000..c7114d1b --- /dev/null +++ b/dnn/src/cuda/cumsum/opr_impl.h @@ -0,0 +1,29 @@ +/** + * \file dnn/src/cuda/cumsum/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class CumsumForwardImpl: public CumsumForward { + public: + using CumsumForward::CumsumForward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &dst) override; +}; + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/cv/kernel_common.cuh b/dnn/src/cuda/cv/kernel_common.cuh new file mode 100644 index 00000000..f74fe2e5 --- /dev/null +++ b/dnn/src/cuda/cv/kernel_common.cuh @@ -0,0 +1,238 @@ +/** + * \file dnn/src/cuda/cv/kernel_common.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
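// ---------------------------------------------------------------------------
// Note on the (A, B, C) view used by CumsumForwardImpl::exec() above
// (illustration only, assuming the usual reduce::get_ABC semantics):
// the layout is collapsed around the scan axis, with A = product of the
// dimensions before the axis, B = the axis itself and C = the product of the
// dimensions after it. For example, a (2, 3, 4, 5) tensor with
// param().axis == 2 is scanned as A = 6, B = 4, C = 5, and the kernels above
// run the prefix sum along B.
// ---------------------------------------------------------------------------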
+ */ +#pragma once + +#include "src/common/cv/enums.h" + +#include "megdnn/basic_types.h" + +#include +#include +#include +#include +#include + +typedef unsigned char uchar; +typedef unsigned char byte; + +namespace megdnn { +namespace megcv { + +// FIXME the implement is not the same as in the cv/help.h +template +__host__ __device__ T saturate(const T x, const T lower, const T upper) { + if (x < lower) + return lower; + if (x > upper) + return upper; + return x; +} + +__device__ inline int saturate_cast(double val) { + return round(val); +} + +__device__ inline short saturate_cast_short(double x) { + return x < -32768 ? -32768 : (x > 32767 ? 32767 : round(x)); +} + +__device__ inline void interpolate_linear_coefs(float x, float* coeffs) { + coeffs[0] = 1 - x; + coeffs[1] = x; +} + +__host__ __device__ inline void interpolate_cubic_coefs(float x, + float* coeffs) { + const float A = -0.75f; + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +__device__ inline void interpolate_lanczos4_coefs(float x, float* coeffs) { + const float s45 = 0.70710678118654752440084436210485; + const float cs[][2] = {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, + {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; + const float MEGCV_PI = 3.1415926536; + + if (x < FLT_EPSILON) { + for (int i = 0; i < 8; i++) + coeffs[i] = 0; + coeffs[3] = 1; + return; + } + + float sum = 0; + float y0 = -(x + 3) * MEGCV_PI * 0.25, s0 = sin(y0), c0 = cos(y0); + for (int i = 0; i < 8; i++) { + float y = -(x + 3 - i) * MEGCV_PI * 0.25; + coeffs[i] = (float)((cs[i][0] * s0 + cs[i][1] * c0) / (y * y)); + sum += coeffs[i]; + } + + sum = 1.f / sum; + for (int i = 0; i < 8; i++) + coeffs[i] *= sum; +} + +template +class BModeTrait { +public: + static const BorderMode bmode1 = bmode; +}; +template <> +class BModeTrait { +public: + static const BorderMode bmode1 = BORDER_REFLECT_101; +}; + +template +class TypeTrait { +public: + typedef T WorkType; + MEGDNN_DEVICE static T min() { return std::numeric_limits::min(); } + MEGDNN_DEVICE static T max() { return std::numeric_limits::max(); } + static const bool need_saturate; +}; +template <> +class TypeTrait { +public: + typedef int WorkType; + MEGDNN_DEVICE static uchar min() { return 0; } + MEGDNN_DEVICE static uchar max() { return 255; } + static const bool need_saturate = true; +}; +template <> +class TypeTrait { +public: + typedef float WorkType; + MEGDNN_DEVICE static float min() { return 0; } + MEGDNN_DEVICE static float max() { return 1; } + static const bool need_saturate = false; +}; + +template +__device__ inline int border_interpolate(int p, int len); + +template <> +__device__ inline int border_interpolate(int p, int len) { + if ((unsigned)p >= (unsigned)len) { + p = p < 0 ? 
0 : len - 1; + } + return p; +} + +template <> +__device__ inline int border_interpolate(int p, int len) { + if (len == 1) + return 0; + + do { + if (p < 0) + p = -p - 1; + else + p = len - 1 - (p - len); + } while ((unsigned)p >= (unsigned)len); + return p; +} + +template <> +__device__ inline int border_interpolate(int p, int len) { + if (len == 1) + return 0; + + do { + if (p < 0) + p = -p; + else + p = len - 1 - (p - len) - 1; + } while ((unsigned)p >= (unsigned)len); + return p; +} + +template <> +__device__ inline int border_interpolate(int p, int len) { + if ((unsigned)p >= (unsigned)len) { + if (p < 0) + p -= ((p - len + 1) / len) * len; + + p %= len; + } + return p; +} + +template <> +__device__ inline int border_interpolate(int p, int len) { + if ((unsigned)p >= (unsigned)len) { + p = -1; + } + return p; +} + +template <> +__device__ inline int border_interpolate(int p, int len) { + // if ((unsigned)p >= (unsigned)len) { + // p = -1; + //} + return (unsigned)p >= (unsigned)len ? -1 : p; +} + +template +__device__ void interpolate_coefs(float x, float* coeffs); +template <> +__device__ inline void interpolate_coefs(float x, + float* coeffs) {} +template <> +__device__ inline void interpolate_coefs(float x, float* coeffs) { + interpolate_linear_coefs(x, coeffs); +} +template <> +__device__ inline void interpolate_coefs(float x, float* coeffs) { + interpolate_cubic_coefs(x, coeffs); +} +template <> +__device__ inline void interpolate_coefs(float x, + float* coeffs) { + interpolate_lanczos4_coefs(x, coeffs); +} + +template +class IModeTrait { +public: + static const int ksize; +}; +template <> +class IModeTrait { +public: + static const int ksize = 1; +}; +template <> +class IModeTrait { +public: + static const int ksize = 2; +}; + +template <> +class IModeTrait { +public: + static const int ksize = 4; +}; +template <> +class IModeTrait { +public: + static const int ksize = 8; +}; + +} // namespace megcv +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/cvt_color/cvt_color.cu b/dnn/src/cuda/cvt_color/cvt_color.cu new file mode 100644 index 00000000..8ec46fbd --- /dev/null +++ b/dnn/src/cuda/cvt_color/cvt_color.cu @@ -0,0 +1,767 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
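// ---------------------------------------------------------------------------
// Worked examples for the border_interpolate specializations above
// (illustration only), with len = 5 (valid indices 0..4):
//   replicate (clamp)          : p = -2 -> 0,  p = 7 -> 4
//   reflect (edge duplicated)  : p = -2 -> 1,  p = 6 -> 3
//   reflect_101 (no duplicate) : p = -2 -> 2,  p = 6 -> 2
//   wrap (periodic)            : p = -2 -> 3,  p = 7 -> 2
//   constant / transparent     : any out-of-range p -> -1 (the caller then
//   substitutes a constant value or leaves the destination pixel untouched)
// ---------------------------------------------------------------------------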
+ * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + * + * --------------------------------------------------------------------------- + * \file dnn/src/cuda/cvt_color/cvt_color.cu + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * + * --------------------------------------------------------------------------- + */ + +#include "src/common/opr_param_defs_enumv.cuh" +#include "src/cuda/cv/kernel_common.cuh" +#include "src/cuda/cvt_color/cvt_color.cuh" +#include "src/cuda/utils.cuh" + +#include +#include +#include + +namespace megdnn { +namespace cuda { +namespace cvt_color { + +using namespace megcv; + +#define THREADS_X 256 +#define THREADS_Y 1 + +#define U8_PROCESS_PER_THREADS_X 4 +#define F32_PROCESS_PER_THREADS_X 1 + +__global__ void cvt_rgb2gray_8u_kernel(const uchar* src, uchar* dst, + const size_t rows, const size_t cols, + const size_t src_step, + const size_t dst_step) { + size_t t = blockIdx.x * blockDim.x + threadIdx.x; + + if (t < (rows * cols) / U8_PROCESS_PER_THREADS_X) { + size_t offset = t * U8_PROCESS_PER_THREADS_X; + src += 3 * offset; + dst += 1 * offset; + + uchar temp_des[4]; + uchar temp_src[12]; + *((uint3*)temp_src) = *((uint3*)src); + + temp_des[0] = (temp_src[0] * 4899 + temp_src[1] * 9617 + + temp_src[2] * 1868 + (1 << 13)) >> + 14; + temp_des[1] = (temp_src[3] * 4899 + temp_src[4] * 9617 + + temp_src[5] * 1868 + (1 << 13)) >> + 14; + temp_des[2] = (temp_src[6] * 4899 + temp_src[7] * 9617 + + temp_src[8] * 1868 + (1 << 13)) >> + 14; + temp_des[3] = (temp_src[9] * 4899 + temp_src[10] * 9617 + + temp_src[11] * 1868 + (1 << 13)) >> + 14; + + *((uint32_t*)dst) = *((uint32_t*)temp_des); + } else if (t == (rows * cols) / U8_PROCESS_PER_THREADS_X) { + size_t rest = (rows * cols) % U8_PROCESS_PER_THREADS_X; + if (rest != 0) { + size_t offset = t * U8_PROCESS_PER_THREADS_X; + src += 3 * offset; + dst += 1 * offset; + + for (int i = 0; i < rest; i++, src += 3, dst += 1) + dst[0] = (src[0] * 4899 + src[1] * 9617 + src[2] * 1868 + + (1 << 13)) >> + 14; + } + } +} + +__global__ void cvt_rgb2gray_32f_kernel(const float* src, float* dst, + const size_t rows, const size_t cols, + const size_t src_step, + const size_t dst_step) { + size_t t = blockIdx.x * blockDim.x + threadIdx.x; + + if (t < rows * cols) { + size_t offset = t; + src += offset * 3; + dst += offset * 1; + + float temp_src[3], temp_dst; + *((float3*)temp_src) = *((float3*)src); + + temp_dst = temp_src[0] * 0.299f + temp_src[1] * 0.587f + + temp_src[2] * 0.114f; + + dst[0] = temp_dst; + } +} + +__global__ void cvt_gray2rgb_8u_kernel(const uchar* src, uchar* dst, + const size_t rows, const size_t cols, + const size_t src_step, + const size_t dst_step) { + size_t t = blockIdx.x * blockDim.x + threadIdx.x; + + if (t < (rows * cols) / U8_PROCESS_PER_THREADS_X) { + size_t offset = t * U8_PROCESS_PER_THREADS_X; + src += 1 * offset; + dst += 3 * offset; + + uchar temp_src[4], temp_des[12]; + *((uint32_t*)temp_src) = *((uint32_t*)src); + + temp_des[0] = temp_src[0]; + temp_des[1] = temp_src[0]; + temp_des[2] = temp_src[0]; + temp_des[3] = temp_src[1]; + temp_des[4] = temp_src[1]; + temp_des[5] = temp_src[1]; + temp_des[6] = temp_src[2]; + temp_des[7] = temp_src[2]; + temp_des[8] = temp_src[2]; + temp_des[9] = temp_src[3]; + temp_des[10] = temp_src[3]; + temp_des[11] = temp_src[3]; + + *((uint3*)dst) = *((uint3*)temp_des); + } else if (t == (rows * cols) / U8_PROCESS_PER_THREADS_X) { + size_t rest = (rows * cols) % U8_PROCESS_PER_THREADS_X; + if (rest != 0) { + size_t offset = t * U8_PROCESS_PER_THREADS_X; + src += 1 * offset; + dst += 3 * offset; + + for (int i = 0; i < rest; i++, src += 1, dst += 3) { + uchar temp_src = src[0]; + + dst[0] = temp_src; + dst[1] = temp_src; + dst[2] = temp_src; + } + } + } +} + +__global__ void 
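// ---------------------------------------------------------------------------
// Note on the fixed-point constants in cvt_rgb2gray_8u_kernel above
// (illustration only): 4899 / 2^14 ~= 0.299, 9617 / 2^14 ~= 0.587 and
// 1868 / 2^14 ~= 0.114, so the uint8 path computes the same
// gray = 0.299 * R + 0.587 * G + 0.114 * B as the float kernel, with (1 << 13)
// added for rounding before the >> 14. E.g. (R, G, B) = (255, 0, 0) gives
// (255 * 4899 + 8192) >> 14 = 76, matching 0.299 * 255 = 76.2.
// ---------------------------------------------------------------------------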
cvt_gray2rgb_32f_kernel(const float* src, float* dst, + const size_t rows, const size_t cols, + const size_t src_step, + const size_t dst_step) { + size_t t = blockIdx.x * blockDim.x + threadIdx.x; + + if (t < rows * cols) { + src += t * 1; + dst += t * 3; + + float temp_src, temp_dst[3]; + temp_src = src[0]; + + temp_dst[0] = temp_src; + temp_dst[1] = temp_src; + temp_dst[2] = temp_src; + + *((float3*)dst) = *((float3*)temp_dst); + } +} + +#define descale(x, n) (((x) + (1 << ((n)-1))) >> (n)) + +__global__ void cvt_rgb2yuv_8u_kernel(const uchar* src, uchar* dst, + const size_t rows, const size_t cols, + const size_t src_step, + const size_t dst_step) { + size_t t = blockIdx.x * blockDim.x + threadIdx.x; + + const int yuv_shift = 14; + const int coef[] = {1868, 9617, 4899, 8061, 14369}; + const int delta = 128 << yuv_shift; + + if (t < (rows * cols) / U8_PROCESS_PER_THREADS_X) { + size_t offset_uchar = 3 * t * U8_PROCESS_PER_THREADS_X; + src += offset_uchar; + dst += offset_uchar; + + uchar temp_src[12], temp_dst[12]; + *((uint3*)temp_src) = *((uint3*)src); + + int p = 0; + int y = descale(temp_src[0 + p] * coef[0] + temp_src[1 + p] * coef[1] + + temp_src[2 + p] * coef[2], + yuv_shift); + int cr = descale((temp_src[0 + p] - y) * coef[3] + delta, yuv_shift); + int cb = descale((temp_src[2 + p] - y) * coef[4] + delta, yuv_shift); + temp_dst[0 + p] = saturate(y, 0, 255); + temp_dst[1 + p] = saturate(cr, 0, 255); + temp_dst[2 + p] = saturate(cb, 0, 255); + + p += 3; + y = descale(temp_src[0 + p] * coef[0] + temp_src[1 + p] * coef[1] + + temp_src[2 + p] * coef[2], + yuv_shift); + cr = descale((temp_src[0 + p] - y) * coef[3] + delta, yuv_shift); + cb = descale((temp_src[2 + p] - y) * coef[4] + delta, yuv_shift); + temp_dst[0 + p] = saturate(y, 0, 255); + temp_dst[1 + p] = saturate(cr, 0, 255); + temp_dst[2 + p] = saturate(cb, 0, 255); + + p += 3; + y = descale(temp_src[0 + p] * coef[0] + temp_src[1 + p] * coef[1] + + temp_src[2 + p] * coef[2], + yuv_shift); + cr = descale((temp_src[0 + p] - y) * coef[3] + delta, yuv_shift); + cb = descale((temp_src[2 + p] - y) * coef[4] + delta, yuv_shift); + temp_dst[0 + p] = saturate(y, 0, 255); + temp_dst[1 + p] = saturate(cr, 0, 255); + temp_dst[2 + p] = saturate(cb, 0, 255); + + p += 3; + y = descale(temp_src[0 + p] * coef[0] + temp_src[1 + p] * coef[1] + + temp_src[2 + p] * coef[2], + yuv_shift); + cr = descale((temp_src[0 + p] - y) * coef[3] + delta, yuv_shift); + cb = descale((temp_src[2 + p] - y) * coef[4] + delta, yuv_shift); + temp_dst[0 + p] = saturate(y, 0, 255); + temp_dst[1 + p] = saturate(cr, 0, 255); + temp_dst[2 + p] = saturate(cb, 0, 255); + + *((uint3*)dst) = *((uint3*)temp_dst); + } else if (t == (rows * cols) / U8_PROCESS_PER_THREADS_X) { + size_t rest = (rows * cols) % U8_PROCESS_PER_THREADS_X; + if (rest != 0) { + size_t offset_uchar = 3 * t * U8_PROCESS_PER_THREADS_X; + src += offset_uchar; + dst += offset_uchar; + + for (int i = 0; i < rest; i++, src += 3, dst += 3) { + uchar temp_src[3], temp_dst[3]; + *((uchar3*)temp_src) = *((uchar3*)src); + + int Y = descale(temp_src[0] * coef[0] + temp_src[1] * coef[1] + + temp_src[2] * coef[2], + yuv_shift); + int Cr = + descale((temp_src[0] - Y) * coef[3] + delta, yuv_shift); + int Cb = + descale((temp_src[2] - Y) * coef[4] + delta, yuv_shift); + + temp_dst[0] = saturate(Y, 0, 255); + temp_dst[1] = saturate(Cr, 0, 255); + temp_dst[2] = saturate(Cb, 0, 255); + + *((uchar3*)dst) = *((uchar3*)temp_dst); + } + } + } +} + +__global__ void cvt_rgb2yuv_32f_kernel(const float* src, float* dst, + const 
size_t rows, const size_t cols, + const size_t src_step, + const size_t dst_step) { + size_t t = blockIdx.x * blockDim.x + threadIdx.x; + + const float coef[] = {0.114f, 0.587f, 0.299f, 0.492f, 0.877f}; + const float delta = 0.5f; + + if (t < rows * cols) { + size_t offset_float = t * 3; + src += offset_float; + dst += offset_float; + + float temp_src[3], temp_dst[3]; + *((float3*)temp_src) = *((float3*)src); + + float Y = temp_src[0] * coef[0] + temp_src[1] * coef[1] + + temp_src[2] * coef[2]; + temp_dst[0] = Y; + temp_dst[1] = (temp_src[0] - Y) * coef[3] + delta; + temp_dst[2] = (temp_src[2] - Y) * coef[4] + delta; + + *((float3*)dst) = *((float3*)temp_dst); + } +} + +__global__ void cvt_yuv2rgb_8u_kernel(const uchar* src, uchar* dst, + const size_t rows, const size_t cols, + const size_t src_step, + const size_t dst_step) { + size_t t = blockIdx.x * blockDim.x + threadIdx.x; + + const int yuv_shift = 14; + const int coef[] = {33292, -6472, -9519, 18678}; + const int delta = 128; + + if (t < (rows * cols) / U8_PROCESS_PER_THREADS_X) { + size_t offset_uchar = 3 * t * U8_PROCESS_PER_THREADS_X; + src += offset_uchar; + dst += offset_uchar; + + uchar temp_src[12], temp_dst[12]; + *((uint3*)temp_src) = *((uint3*)src); + + int p = 0; + int R = temp_src[0 + p] + + descale((temp_src[1 + p] - delta) * coef[0], yuv_shift); + int G = temp_src[0 + p] + + descale((temp_src[2 + p] - delta) * coef[2] + + (temp_src[1 + p] - delta) * coef[1], + yuv_shift); + int B = temp_src[0 + p] + + descale((temp_src[2 + p] - delta) * coef[3], yuv_shift); + + temp_dst[0 + p] = saturate(R, 0, 255); + temp_dst[1 + p] = saturate(G, 0, 255); + temp_dst[2 + p] = saturate(B, 0, 255); + + p += 3; + R = temp_src[0 + p] + + descale((temp_src[1 + p] - delta) * coef[0], yuv_shift); + G = temp_src[0 + p] + + descale((temp_src[2 + p] - delta) * coef[2] + + (temp_src[1 + p] - delta) * coef[1], + yuv_shift); + B = temp_src[0 + p] + + descale((temp_src[2 + p] - delta) * coef[3], yuv_shift); + + temp_dst[0 + p] = saturate(R, 0, 255); + temp_dst[1 + p] = saturate(G, 0, 255); + temp_dst[2 + p] = saturate(B, 0, 255); + + p += 3; + R = temp_src[0 + p] + + descale((temp_src[1 + p] - delta) * coef[0], yuv_shift); + G = temp_src[0 + p] + + descale((temp_src[2 + p] - delta) * coef[2] + + (temp_src[1 + p] - delta) * coef[1], + yuv_shift); + B = temp_src[0 + p] + + descale((temp_src[2 + p] - delta) * coef[3], yuv_shift); + + temp_dst[0 + p] = saturate(R, 0, 255); + temp_dst[1 + p] = saturate(G, 0, 255); + temp_dst[2 + p] = saturate(B, 0, 255); + + p += 3; + R = temp_src[0 + p] + + descale((temp_src[1 + p] - delta) * coef[0], yuv_shift); + G = temp_src[0 + p] + + descale((temp_src[2 + p] - delta) * coef[2] + + (temp_src[1 + p] - delta) * coef[1], + yuv_shift); + B = temp_src[0 + p] + + descale((temp_src[2 + p] - delta) * coef[3], yuv_shift); + + temp_dst[0 + p] = saturate(R, 0, 255); + temp_dst[1 + p] = saturate(G, 0, 255); + temp_dst[2 + p] = saturate(B, 0, 255); + + *((uint3*)dst) = *((uint3*)temp_dst); + } else if (t == (rows * cols) / U8_PROCESS_PER_THREADS_X) { + size_t rest = (rows * cols) % U8_PROCESS_PER_THREADS_X; + if (rest != 0) { + size_t offset_uchar = 3 * t * U8_PROCESS_PER_THREADS_X; + src += offset_uchar; + dst += offset_uchar; + + for (int i = 0; i < rest; i++, src += 3, dst += 3) { + uchar Y = src[0], Cr = src[1], Cb = src[2]; + + int R = Y + descale((Cr - delta) * coef[0], yuv_shift); + int G = Y + + descale((Cb - delta) * coef[2] + (Cr - delta) * coef[1], + yuv_shift); + int B = Y + descale((Cb - delta) * coef[3], yuv_shift); 
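// ---------------------------------------------------------------------------
// Note on the fixed-point constants in cvt_yuv2rgb_8u_kernel above
// (illustration only): they are the Q14 equivalents of the float kernel's
// coefficients: 33292 / 2^14 ~= 2.032, -6472 / 2^14 ~= -0.395,
// -9519 / 2^14 ~= -0.581 and 18678 / 2^14 ~= 1.140, applied to (Cr - 128) and
// (Cb - 128) with descale() providing the rounding right shift.
// ---------------------------------------------------------------------------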
+ + dst[0] = saturate(R, 0, 255); + dst[1] = saturate(G, 0, 255); + dst[2] = saturate(B, 0, 255); + } + } + } +} + +__global__ void cvt_yuv2rgb_32f_kernel(const float* src, float* dst, + const size_t rows, const size_t cols, + const size_t src_step, + const size_t dst_step) { + size_t t = blockIdx.x * blockDim.x + threadIdx.x; + + const float coef[] = {2.032f, -0.395f, -0.581f, 1.140f}; + const float delta = 0.5f; + + if (t < rows * cols) { + size_t offset_float = t * 3; + src += offset_float; + dst += offset_float; + + float Y = src[0]; + float Cr = src[1]; + float Cb = src[2]; + + float R = Y + (Cr - delta) * coef[0]; + float G = Y + (Cb - delta) * coef[2] + (Cr - delta) * coef[1]; + float B = Y + (Cb - delta) * coef[3]; + + dst[0] = R; + dst[1] = G; + dst[2] = B; + } +} + +// convert planar or semi-planar YUV to gray. data type: uint8 +__global__ void cvt_yuv2gray_psp_8u_kernel(const uchar* src, uchar* dst, + const size_t dst_rows, + const size_t dst_cols, + const size_t src_step, + const size_t dst_step) { + int c = (blockIdx.x * blockDim.x + threadIdx.x) * U8_PROCESS_PER_THREADS_X; + int r = blockIdx.y * blockDim.y + threadIdx.y; + src += r * src_step + c; + dst += r * dst_step + c; + int remain = dst_cols - c; + if (remain > U8_PROCESS_PER_THREADS_X) + remain = U8_PROCESS_PER_THREADS_X; + for (int i = 0; i < remain; ++i) + *(dst++) = *(src++); +} + +// convert semi-planar YUV to RGB or BGR. data type: uint8 +// is_rgb: convert to RGB if true, otherwise convert to BGR +// is_nv12: decode src as YUV_NV12 if true, YUV_NV21 otherwise +template +__global__ void cvt_yuv2rgbbgr_sp_8u_kernel(const uchar* src, uchar* dst, + const size_t dst_rows, + const size_t dst_cols, + const size_t src_step, + const size_t dst_step) { + int c = (blockIdx.x * blockDim.x + threadIdx.x) * 2; + int r = (blockIdx.y * blockDim.y + threadIdx.y) * 2; + if (c >= dst_cols || r >= dst_rows) + return; + + dst += r * dst_step + c * 3; + + const uchar* pY = src + r * src_step + c; + int Y00 = *pY; + int Y01 = *(pY + 1); + int Y10 = *(pY + src_step); + int Y11 = *(pY + src_step + 1); + + const uchar* pUV = src + (dst_rows + r / 2) * src_step + c; + int U, V; + if (is_nv12) { + U = *pUV; + V = *(pUV + 1); + } else { + V = *pUV; + U = *(pUV + 1); + } + + int ruv = ((359 * (V - 128)) >> 8); + int guv = -1 * ((88 * (U - 128) + 183 * (V - 128)) >> 8); + int buv = ((454 * (U - 128)) >> 8); + +#define SET_COLOR \ + if (is_rgb) { \ + dst[0] = saturate(R, 0, 255); \ + dst[1] = saturate(G, 0, 255); \ + dst[2] = saturate(B, 0, 255); \ + } else { \ + dst[0] = saturate(B, 0, 255); \ + dst[1] = saturate(G, 0, 255); \ + dst[2] = saturate(R, 0, 255); \ + } + + int R = Y00 + ruv; + int G = Y00 + guv; + int B = Y00 + buv; + SET_COLOR + dst += 3; + + R = Y01 + ruv; + G = Y01 + guv; + B = Y01 + buv; + SET_COLOR + dst += dst_step - 3; + + R = Y10 + ruv; + G = Y10 + guv; + B = Y10 + buv; + SET_COLOR + dst += 3; + + R = Y11 + ruv; + G = Y11 + guv; + B = Y11 + buv; + SET_COLOR + +#undef SET_COLOR +} + +// convert planar YUV to RGB or BGR. 
data type: uint8 +// is_rgb: convert to RGB if true, otherwise convert to BGR +// is_nv12: decode src as YUV_NV12 if true, YUV_NV21 otherwise +template +__global__ void cvt_yuv2rgbbgr_p_8u_kernel(const uchar* src, uchar* dst, + const size_t dst_rows, + const size_t dst_cols, + const size_t src_step, + const size_t dst_step) { + int c = (blockIdx.x * blockDim.x + threadIdx.x) * 2; + int r = (blockIdx.y * blockDim.y + threadIdx.y) * 2; + if (c >= dst_cols || r >= dst_rows) + return; + + dst += r * dst_step + c * 3; + + const uchar* pY = src + r * src_step + c; + int Y00 = *pY; + int Y01 = *(pY + 1); + int Y10 = *(pY + src_step); + int Y11 = *(pY + src_step + 1); + + size_t u_offset, v_offset; + if (is_yu12) { + u_offset = dst_rows * src_step + (r / 2) * (src_step / 2) + c / 2; + v_offset = u_offset + (dst_rows / 4) * src_step; + } else { + v_offset = dst_rows * src_step + (r / 2) * (src_step / 2) + c / 2; + u_offset = v_offset + (dst_rows / 4) * src_step; + } + int U = src[u_offset], V = src[v_offset]; + + int ruv = ((359 * (V - 128)) >> 8); + int guv = -1 * ((88 * (U - 128) + 183 * (V - 128)) >> 8); + int buv = ((454 * (U - 128)) >> 8); + +#define SET_COLOR \ + if (is_rgb) { \ + dst[0] = saturate(R, 0, 255); \ + dst[1] = saturate(G, 0, 255); \ + dst[2] = saturate(B, 0, 255); \ + } else { \ + dst[0] = saturate(B, 0, 255); \ + dst[1] = saturate(G, 0, 255); \ + dst[2] = saturate(R, 0, 255); \ + } + + int R = Y00 + ruv; + int G = Y00 + guv; + int B = Y00 + buv; + SET_COLOR + dst += 3; + + R = Y01 + ruv; + G = Y01 + guv; + B = Y01 + buv; + SET_COLOR + dst += dst_step - 3; + + R = Y10 + ruv; + G = Y10 + guv; + B = Y10 + buv; + SET_COLOR + dst += 3; + + R = Y11 + ruv; + G = Y11 + guv; + B = Y11 + buv; + SET_COLOR + +#undef SET_COLOR +} + +#define CALL_CVT_OPR_8U_KERNEL(_func) \ + { \ + dim3 THREADS(THREADS_X); \ + dim3 BLOCKS(DIVUP(src_cols* src_rows, \ + THREADS_X* U8_PROCESS_PER_THREADS_X)); \ + cvt_##_func##_8u_kernel<<>>( \ + src, dst, src_rows, src_cols, src_step, dst_step); \ + } + +#define CALL_CVT_OPR_32F_KERNEL(_func) \ + { \ + dim3 THREADS(THREADS_X); \ + dim3 BLOCKS(DIVUP(src_cols* src_rows, THREADS_X)); \ + cvt_##_func##_32f_kernel<<>>( \ + src, dst, src_rows, src_cols, src_step, dst_step); \ + } + +// convert planar or semi-planar YUV to gray, data tyoe: uint8 +#define CALL_CVT_YUV2GRAY_PSP_OPR_8U_KERNEL \ + { \ + dim3 THREADS(THREADS_X, 1); \ + dim3 BLOCKS(DIVUP(dst_cols, THREADS_X* U8_PROCESS_PER_THREADS_X), \ + dst_rows); \ + cvt_yuv2gray_psp_8u_kernel<<>>( \ + src, dst, dst_rows, dst_cols, src_step, dst_step); \ + } + +// convert semi-planar YUV to RGB or BGR. data type: uint8 +// is_rgb: convert to RGB if true, otherwise convert to BGR +// is_nv12: decode src as YUV_NV12 if true, YUV_NV21 otherwise +#define CALL_CVT_YUV2RGBBGR_SP_OPR_8U_KERNEL(is_rgb, is_nv12) \ + { \ + dim3 THREADS(THREADS_X, THREADS_Y); \ + dim3 BLOCKS(DIVUP(dst_cols / 2, THREADS_X), \ + DIVUP(dst_rows / 2, THREADS_Y)); \ + cvt_yuv2rgbbgr_sp_8u_kernel \ + <<>>(src, dst, dst_rows, dst_cols, \ + src_step, dst_step); \ + } + +// convert planar YUV to RGB or BGR. 
data type: uint8 +// is_rgb: convert to RGB if true, otherwise convert to BGR +// is_yu12: decode src as YUV_YU12 if true, YUV_YV12 otherwise +#define CALL_CVT_YUV2RGBBGR_P_OPR_8U_KERNEL(is_rgb, is_yu12) \ + { \ + dim3 THREADS(THREADS_X, THREADS_Y); \ + dim3 BLOCKS(DIVUP(dst_cols / 2, THREADS_X), \ + DIVUP(dst_rows / 2, THREADS_Y)); \ + cvt_yuv2rgbbgr_p_8u_kernel \ + <<>>(src, dst, dst_rows, dst_cols, \ + src_step, dst_step); \ + } + +using namespace param_enumv; + +void cvt_color_8u_proxy(const uchar* src, uchar* dst, const size_t src_rows, + const size_t src_cols, const size_t src_step, + const size_t dst_rows, const size_t dst_cols, + const size_t dst_step, const uint32_t mode, + cudaStream_t stream) { + switch (mode) { + case CvtColor::Mode::RGB2GRAY: + CALL_CVT_OPR_8U_KERNEL(rgb2gray) + break; + case CvtColor::Mode::RGB2YUV: + CALL_CVT_OPR_8U_KERNEL(rgb2yuv) + break; + case CvtColor::Mode::YUV2RGB: + CALL_CVT_OPR_8U_KERNEL(yuv2rgb) + break; + case CvtColor::Mode::GRAY2RGB: + CALL_CVT_OPR_8U_KERNEL(gray2rgb) + break; + case CvtColor::Mode::YUV2GRAY_NV12: + case CvtColor::Mode::YUV2GRAY_NV21: + case CvtColor::Mode::YUV2GRAY_YU12: + case CvtColor::Mode::YUV2GRAY_YV12: + CALL_CVT_YUV2GRAY_PSP_OPR_8U_KERNEL + break; + case CvtColor::Mode::YUV2RGB_NV12: + CALL_CVT_YUV2RGBBGR_SP_OPR_8U_KERNEL(true, true) + break; + case CvtColor::Mode::YUV2RGB_NV21: + CALL_CVT_YUV2RGBBGR_SP_OPR_8U_KERNEL(true, false) + break; + case CvtColor::Mode::YUV2BGR_NV12: + CALL_CVT_YUV2RGBBGR_SP_OPR_8U_KERNEL(false, true) + break; + case CvtColor::Mode::YUV2BGR_NV21: + CALL_CVT_YUV2RGBBGR_SP_OPR_8U_KERNEL(false, false) + break; + case CvtColor::Mode::YUV2RGB_YU12: + CALL_CVT_YUV2RGBBGR_P_OPR_8U_KERNEL(true, true); + break; + case CvtColor::Mode::YUV2RGB_YV12: + CALL_CVT_YUV2RGBBGR_P_OPR_8U_KERNEL(true, false); + break; + case CvtColor::Mode::YUV2BGR_YU12: + CALL_CVT_YUV2RGBBGR_P_OPR_8U_KERNEL(false, true); + break; + case CvtColor::Mode::YUV2BGR_YV12: + CALL_CVT_YUV2RGBBGR_P_OPR_8U_KERNEL(false, false); + break; + default: + megdnn_throw("unsupported cvt_color mode for cuda"); + break; + } +} + +void cvt_color_32f_proxy(const float* src, float* dst, const size_t src_rows, + const size_t src_cols, const size_t src_step, + const size_t dst_rows, const size_t dst_cols, + const size_t dst_step, const uint32_t mode, + cudaStream_t stream) { + MEGDNN_MARK_USED_VAR(dst_rows); + MEGDNN_MARK_USED_VAR(dst_cols); + switch (mode) { + case CvtColor::Mode::RGB2GRAY: + CALL_CVT_OPR_32F_KERNEL(rgb2gray) + break; + case CvtColor::Mode::RGB2YUV: + CALL_CVT_OPR_32F_KERNEL(rgb2yuv) + break; + case CvtColor::Mode::YUV2RGB: + CALL_CVT_OPR_32F_KERNEL(yuv2rgb) + break; + case CvtColor::Mode::GRAY2RGB: + CALL_CVT_OPR_32F_KERNEL(gray2rgb) + break; + default: + megdnn_throw("unsupported cvt_color mode for cuda"); + break; + } +} + +#undef CALL_CVT_OPR_8U_KERNEL +#undef CALL_CVT_OPR_32F_KERNEL +#undef CALL_CVT_YUV2GRAY_PSP_OPR_8U_KERNEL +#undef CALL_CVT_YUV2RGBBGR_SP_OPR_8U_KERNEL +#undef CALL_CVT_YUV2RGBBGR_P_OPR_8U_KERNEL + +} // namespace cvt_color +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/cvt_color/cvt_color.cuh b/dnn/src/cuda/cvt_color/cvt_color.cuh new file mode 100644 index 00000000..741a80fe --- /dev/null +++ b/dnn/src/cuda/cvt_color/cvt_color.cuh @@ -0,0 +1,87 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. 
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + * + * --------------------------------------------------------------------------- + * \file dnn/src/cuda/cvt_color/cvt_color.cuh + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
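+ *
+ * Note on the 8-bit YUV kernels implemented in cvt_color.cu: the semi-planar
+ * (NV12/NV21) and planar (YU12/YV12) paths use 8.8 fixed-point BT.601
+ * coefficients (359/256 ~= 1.403, 88/256 ~= 0.344, 183/256 ~= 0.715,
+ * 454/256 ~= 1.773). A per-pixel sketch of the same arithmetic, for
+ * illustration only (saturate() as used by the kernels):
+ *
+ *   int R = Y + ((359 * (V - 128)) >> 8);
+ *   int G = Y - ((88 * (U - 128) + 183 * (V - 128)) >> 8);
+ *   int B = Y + ((454 * (U - 128)) >> 8);
+ *   dst[0] = saturate(R, 0, 255);   // dst[1] = G, dst[2] = B; swapped for BGR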
+ * + * --------------------------------------------------------------------------- + */ +#pragma once + +#include +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace cvt_color { + +typedef unsigned char uchar; + +void cvt_color_8u_proxy(const uchar* src, uchar* dst, const size_t src_rows, + const size_t src_cols, const size_t src_step, + const size_t dst_rows, const size_t dst_cols, + const size_t dst_step, const uint32_t mode, + cudaStream_t stream); + +void cvt_color_32f_proxy(const float* src, float* dst, const size_t src_rows, + const size_t src_cols, const size_t src_step, + const size_t dst_rows, const size_t dst_cols, + const size_t dst_step, const uint32_t mode, + cudaStream_t stream); + +} // namespace cvt_color +} // namespace cuda +} // namespace megdnn + // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/cvt_color/opr_impl.cpp b/dnn/src/cuda/cvt_color/opr_impl.cpp new file mode 100644 index 00000000..3de2d8aa --- /dev/null +++ b/dnn/src/cuda/cvt_color/opr_impl.cpp @@ -0,0 +1,72 @@ +/** + * \file dnn/src/cuda/cvt_color/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/cvt_color/opr_impl.h" +#include "src/cuda/cvt_color/cvt_color.cuh" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +#include "src/common/cv/common.h" +#include "src/common/cv/helper.h" +#include "src/common/cv/cvt_color.h" + +#include + +namespace megdnn { +namespace cuda { + +using namespace megcv; +using namespace cvt_color; + + +void CvtColorImpl::cvt_color_exec_8u(_megdnn_tensor_in src_tensor, + _megdnn_tensor_in dst_tensor) { + auto stream = cuda_stream(this->handle()); + for (size_t i = 0; i < src_tensor.layout.shape[0]; ++i) { + Mat src = TensorND2Mat(src_tensor, i); + Mat dst = TensorND2Mat(dst_tensor, i); + + cvt_color_8u_proxy(src.ptr(), dst.ptr(), src.rows(), src.cols(), + src.step(), dst.rows(), dst.cols(), dst.step(), + static_cast(param().mode), stream); + } +} + +void CvtColorImpl::cvt_color_exec_32f(_megdnn_tensor_in src_tensor, + _megdnn_tensor_in dst_tensor) { + auto stream = cuda_stream(this->handle()); + for (size_t i = 0; i < src_tensor.layout.shape[0]; ++i) { + Mat src = TensorND2Mat(src_tensor, i); + Mat dst = TensorND2Mat(dst_tensor, i); + + cvt_color_32f_proxy(src.ptr(), dst.ptr(), src.rows(), src.cols(), + src.step(), dst.rows(), dst.cols(), dst.step(), + static_cast(param().mode), stream); + } +} + +void CvtColorImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, + _megdnn_workspace workspace) { + using namespace megcv; + check_exec(src.layout, dst.layout, workspace.size); + + if (dst.layout.dtype == dtype::Float32()) { + cvt_color_exec_32f(src, dst); + } else if (dst.layout.dtype == dtype::Uint8()) { + cvt_color_exec_8u(src, dst); + } else { + megdnn_throw("Unsupported datatype of Resize optr."); + } +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/cuda/cvt_color/opr_impl.h b/dnn/src/cuda/cvt_color/opr_impl.h new file mode 100644 index 00000000..ce60e3fe --- /dev/null +++ b/dnn/src/cuda/cvt_color/opr_impl.h @@ -0,0 +1,37 @@ +/** + * \file dnn/src/cuda/cvt_color/opr_impl.h + * MegEngine is Licensed under the 
Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class CvtColorImpl : public CvtColor { +private: + void cvt_color_exec_8u(_megdnn_tensor_in src, _megdnn_tensor_in dst); + void cvt_color_exec_32f(_megdnn_tensor_in src, _megdnn_tensor_in dst); + +public: + using CvtColor::CvtColor; + + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes(const TensorLayout&, + const TensorLayout&) override { + return 0; + } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/bwd_data/algo.cpp b/dnn/src/cuda/deformable_conv/bwd_data/algo.cpp new file mode 100644 index 00000000..ce3fefed --- /dev/null +++ b/dnn/src/cuda/deformable_conv/bwd_data/algo.cpp @@ -0,0 +1,88 @@ +/** + * \file dnn/src/cuda/deformable_conv/bwd_data/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/deformable_conv/bwd_data/algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +using OprImpl = DeformableConvBackwardDataImpl; + +OprImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&algo_matmul); +} + +OprImpl::AlgoPack OprImpl::sm_algo_pack; + +OprImpl::AlgoBase::SizeArgs::SizeArgs( + OprImpl* o, const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad) + : SizeArgs(o, im, + o->make_canonized_filter_meta(im.ndim, filter, offset), + offset, mask, out_grad, im_grad, offset_grad, mask_grad) {} + +OprImpl::AlgoBase::SizeArgs::SizeArgs( + OprImpl* o, const TensorLayout& im, const CanonizedFilterMeta& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad) + : opr(o), + handle(concrete_handle(o->handle())), + im_layout(im), + filter_meta(filter), + offset_layout(offset), + mask_layout(mask), + out_grad_layout(out_grad), + im_grad_layout(im_grad), + offset_grad_layout(offset_grad), + mask_grad_layout(mask_grad) {} + +OprImpl::AlgoBase::ExecArgs::ExecArgs( + OprImpl* opr, _megdnn_tensor_in im, _megdnn_tensor_in filter, + _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_in out_grad, _megdnn_tensor_out im_grad, + _megdnn_tensor_out offset_grad, _megdnn_tensor_out mask_grad, + _megdnn_workspace ws) + : SizeArgs(opr, im.layout, filter.layout, offset.layout, mask.layout, + out_grad.layout, im_grad.layout, offset_grad.layout, + mask_grad.layout), + im_tensor(im), + filter_tensor(filter), + offset_tensor(offset), + mask_tensor(mask), + out_grad_tensor(out_grad), + im_grad_tensor(im_grad), + offset_grad_tensor(offset_grad), + 
mask_grad_tensor(mask_grad), + workspace(ws) {} + +std::string OprImpl::AlgoBase::SizeArgs::to_string() const { + auto&& fm = filter_meta; + MEGDNN_MARK_USED_VAR(fm); + return ssprintf( + "im=%s, filter=%u{%u,%u,%u,%u}, offset=%s, mask=%s, " + "dst_grad=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, " + "dtype=%s,%s", + megdnn_layout_msg(im_layout).c_str(), fm.group, fm.ocpg, fm.icpg, + fm.spatial[0], fm.spatial[1], + megdnn_layout_msg(offset_layout).c_str(), + megdnn_layout_msg(mask_layout).c_str(), + megdnn_layout_msg(out_grad_layout).c_str(), fm.padding[0], + fm.padding[1], fm.stride[0], fm.stride[1], fm.dilation[0], + fm.dilation[1], !fm.should_flip, im_layout.dtype.name(), + out_grad_layout.dtype.name()); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/bwd_data/algo.h b/dnn/src/cuda/deformable_conv/bwd_data/algo.h new file mode 100644 index 00000000..d16a66eb --- /dev/null +++ b/dnn/src/cuda/deformable_conv/bwd_data/algo.h @@ -0,0 +1,125 @@ +/** + * \file dnn/src/cuda/deformable_conv/bwd_data/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +#include "src/cuda/handle.h" + +#include "src/cuda/deformable_conv/opr_impl.h" + +namespace megdnn { +namespace cuda { + +class DeformableConvBackwardDataImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + DeformableConvBackwardDataImpl* opr; + HandleImpl* handle; + const TensorLayout& im_layout; + CanonizedFilterMeta filter_meta; + const TensorLayout& offset_layout; + const TensorLayout& mask_layout; + const TensorLayout& out_grad_layout; + const TensorLayout& im_grad_layout; + const TensorLayout& offset_grad_layout; + const TensorLayout& mask_grad_layout; + + std::string to_string() const; + + SizeArgs(DeformableConvBackwardDataImpl* opr, const TensorLayout& im, + const TensorLayout& filter, const TensorLayout& offset, + const TensorLayout& mask, const TensorLayout& out_grad, + const TensorLayout& im_grad, const TensorLayout& offset_grad, + const TensorLayout& mask_grad); + + SizeArgs(DeformableConvBackwardDataImpl* opr, const TensorLayout& im, + const CanonizedFilterMeta& filter, const TensorLayout& offset, + const TensorLayout& mask, const TensorLayout& out_grad, + const TensorLayout& im_grad, const TensorLayout& offset_grad, + const TensorLayout& mask_grad); + }; + struct ExecArgs : public SizeArgs { + const TensorND im_tensor, filter_tensor, offset_tensor, mask_tensor, + out_grad_tensor; + TensorND im_grad_tensor, offset_grad_tensor, mask_grad_tensor; + Workspace workspace; + + ExecArgs(DeformableConvBackwardDataImpl* opr, _megdnn_tensor_in im, + _megdnn_tensor_in filter, _megdnn_tensor_in offset, + _megdnn_tensor_in mask, _megdnn_tensor_in out_grad, + _megdnn_tensor_out im_grad, _megdnn_tensor_out offset_grad, + _megdnn_tensor_out mask_grad, _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && 
get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert( + req <= workspace.size, + "deformable_conv bwd_data algo %s: required workspace %zu " + "bytes, got %zu", + name(), req, workspace.size); + return *this; + } +}; + +class DeformableConvBackwardDataImpl::AlgoMatmul final : public AlgoBase { +private: + static WorkspaceBundle get_bundle(const SizeArgs& args); + + static void get_matmul_layout(const SizeArgs& args, TensorLayout& al, + TensorLayout& bl, TensorLayout& cl); + +public: + AlgoMatmul() {} + + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { return "AlgoMatmul"; } +}; + +class DeformableConvBackwardDataImpl::AlgoPack { + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + AlgoMatmul algo_matmul; + //! all algorithms + std::vector all_algos; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/bwd_data/algo_matmul.cpp b/dnn/src/cuda/deformable_conv/bwd_data/algo_matmul.cpp new file mode 100644 index 00000000..5083f1bd --- /dev/null +++ b/dnn/src/cuda/deformable_conv/bwd_data/algo_matmul.cpp @@ -0,0 +1,181 @@ +/** + * \file dnn/src/cuda/deformable_conv/bwd_data/algo_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
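+ *
+ * Data-flow sketch of the matmul-based backward-data algorithm implemented in
+ * this file (shapes as produced by get_matmul_layout(); illustration only):
+ *
+ *   out_grad [N, OC, OH*OW]  --relayout-->  [g, ocpg, N*OH*OW]        (B)
+ *   filter                    viewed as     [g, ocpg, icpg*FH*FW]     (A)
+ *   batched GEMM with transposeA:  A^T * B -> col [g, icpg*FH*FW, N*OH*OW]
+ *   col2im / col2im_coord then scatter col back into im_grad, offset_grad
+ *   and mask_grad.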
+ */ + +#include "src/cuda/utils.h" + +#include "src/cuda/deformable_conv/bwd_data/algo.h" +#include "src/cuda/deformable_conv/kimpl/deformable_conv.cuh" +#include "src/cuda/deformable_conv/opr_impl.h" + +using namespace megdnn; +using namespace cuda; + +using Algo = DeformableConvBackwardDataImpl::AlgoMatmul; +using OprParam = DeformableConvBase::Param; + +namespace { +deformable_conv::Param create_param(const Algo::SizeArgs& args, + const OprParam& opr_param, + cublasHandle_t handle, + cudaStream_t stream) { + deformable_conv::Param p; + auto&& fm = args.filter_meta; + + p.handle = handle; + p.stream = stream; + p.group = fm.group; + p.deformable_group = fm.deformable_group; + p.batch_sz = args.im_layout[0]; + + p.IC = args.im_layout[1]; + p.IH = args.im_layout[2]; + p.IW = args.im_layout[3]; + p.OC = args.out_grad_layout[1]; + p.OH = args.out_grad_layout[2]; + p.OW = args.out_grad_layout[3]; + p.FH = fm.spatial[0]; + p.FW = fm.spatial[1]; + p.PH = opr_param.pad_h; + p.PW = opr_param.pad_w; + p.SH = opr_param.stride_h; + p.SW = opr_param.stride_w; + p.DH = opr_param.dilate_h; + p.DW = opr_param.dilate_w; + + p.icpg = p.IC / p.group; + p.icpdg = p.IC / p.deformable_group; + p.ocpg = p.OC / p.group; + p.ocpdg = p.OC / p.deformable_group; + + return p; +} +}; // anonymous namespace + +bool Algo::is_available(const SizeArgs&) const { + return true; +} + +void Algo::get_matmul_layout(const SizeArgs& args, TensorLayout& al, + TensorLayout& bl, TensorLayout& cl) { + auto&& dt = args.im_layout.dtype; + auto&& fm = args.filter_meta; + size_t batch_sz = args.im_layout[0], OH = args.out_grad_layout[2], + OW = args.out_grad_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; + + size_t M = fm.icpg * FH * FW, K = fm.ocpg, N = batch_sz * OH * OW, + batch = fm.group; + al = {{batch, K, M}, dt}; + bl = {{batch, K, N}, dt}; + cl = {{batch, M, N}, dt}; +} + +WorkspaceBundle Algo::get_bundle(const SizeArgs& args) { + auto&& fm = args.filter_meta; + size_t batch_sz = args.im_layout[0], IC = fm.group * fm.icpg, + OC = args.out_grad_layout[1], OH = args.out_grad_layout[2], + OW = args.out_grad_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; + + auto&& bmm_opr = args.handle->create_operator(); + TensorLayout al, bl, cl; + + get_matmul_layout(args, al, bl, cl); + bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + bmm_opr->param().transposeA = true; + + size_t bmm_ws = bmm_opr->get_workspace_in_bytes(al, bl, cl); + size_t result_ws = batch_sz * IC * FH * FW * OH * OW * sizeof(float); + size_t relayout_ws1 = batch_sz * OC * OH * OW * sizeof(float); + size_t relayout_ws2 = batch_sz * IC * FH * FW * OH * OW * sizeof(float); + + return {nullptr, {bmm_ws, result_ws, relayout_ws1, relayout_ws2}}; +} + +size_t Algo::get_workspace_in_bytes(const SizeArgs& args) const { + return get_bundle(args).total_size_in_bytes(); +} + +void Algo::exec(const ExecArgs& args) const { + auto&& opr = args.opr; + auto&& handle = concrete_handle(opr->handle()); + auto&& param = opr->param(); + auto p = create_param(args, param, handle->cublas_handle(), + handle->stream()); + auto bundle = get_bundle(args); + bundle.set(args.workspace.raw_ptr); + + float* dev_im = args.im_tensor.ptr(); + float* dev_filter = args.filter_tensor.ptr(); + float* dev_offset = args.offset_tensor.ptr(); + float* dev_mask = args.mask_tensor.ptr(); + float* dev_out_grad = args.out_grad_tensor.ptr(); + + float* dev_im_grad = args.im_grad_tensor.ptr(); + float* dev_offset_grad = args.offset_grad_tensor.ptr(); + float* dev_mask_grad = 
args.mask_grad_tensor.ptr(); + + void* bmm_ws = bundle.get(0); + float* result_ws = static_cast(bundle.get(1)); + float* relayout_ws1 = static_cast(bundle.get(2)); + + // clear out grad + { + size_t im_sz = p.batch_sz * p.IC * p.IH * p.IW * sizeof(float); + size_t offset_sz = p.batch_sz * 2 * p.deformable_group * p.FH * p.FW * + p.OH * p.OW * sizeof(float); + size_t mask_sz = p.batch_sz * p.deformable_group * p.FH * p.FW * p.OH * + p.OW * sizeof(float); + + cudaMemsetAsync(dev_im_grad, 0, im_sz, p.stream); + cudaMemsetAsync(dev_offset_grad, 0, offset_sz, p.stream); + cudaMemsetAsync(dev_mask_grad, 0, mask_sz, p.stream); + } + + // relayout out_grad to [oc, N, OH, OW] + { + auto&& dt = args.im_layout.dtype; + size_t dim0 = p.batch_sz, dim1 = p.OC, dim2 = p.OH * p.OW; + TensorLayout C2l({dim0, dim1, dim2}, dt), C3l = C2l; + C3l.stride[0] = dim2; + C3l.stride[1] = dim0 * dim2; + C3l.stride[2] = 1; + TensorND C2(dev_out_grad, C2l); + TensorND C3(relayout_ws1, C3l); + + args.handle->relayout_opr()->exec(C2, C3); + } + // matmul [g, icpg, FH, FW, ocpg] * [g, ocpg, N, OH, OW] => + // => [g, icpg, FH, FW, N, OH, OW] + { + TensorLayout al, bl, cl; + get_matmul_layout(args, al, bl, cl); + + TensorND A(static_cast(dev_filter), al), + B(static_cast(relayout_ws1), bl), + C(static_cast(result_ws), cl); + + size_t bmm_ws_size = bundle.get_size(0); + auto&& bmm_opr = + args.handle->create_operator(); + + bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + bmm_opr->param().transposeA = true; + + bmm_opr->exec( + A, B, C, + Workspace(static_cast(bmm_ws), bmm_ws_size)); + } + col2im(result_ws, dev_offset, dev_mask, dev_im_grad, p); + // col [IC, FH * FW, N, OH * OW] + col2im_coord(dev_im, result_ws, dev_offset, dev_mask, dev_offset_grad, + dev_mask_grad, p); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/bwd_flt/algo.cpp b/dnn/src/cuda/deformable_conv/bwd_flt/algo.cpp new file mode 100644 index 00000000..a7c37236 --- /dev/null +++ b/dnn/src/cuda/deformable_conv/bwd_flt/algo.cpp @@ -0,0 +1,81 @@ +/** + * \file dnn/src/cuda/deformable_conv/bwd_flt/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/cuda/utils.h" + +#include "src/cuda/deformable_conv/bwd_flt/algo.h" + +using namespace megdnn; +using namespace cuda; + +using OprImpl = DeformableConvBackwardFilterImpl; + +OprImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&algo_matmul); +} + +OprImpl::AlgoPack OprImpl::sm_algo_pack; + +OprImpl::AlgoBase::SizeArgs::SizeArgs(OprImpl* o, const TensorLayout& im, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& out_grad, + const TensorLayout& filter_grad) + : SizeArgs( + o, im, offset, mask, out_grad, + o->make_canonized_filter_meta(im.ndim, filter_grad, offset)) { +} + +OprImpl::AlgoBase::SizeArgs::SizeArgs( + OprImpl* o, const TensorLayout& im, const TensorLayout& offset, + const TensorLayout& mask, const TensorLayout& out_grad, + const CanonizedFilterMeta& filter_grad_meta) + : opr(o), + handle(concrete_handle(o->handle())), + im_layout(im), + offset_layout(offset), + mask_layout(mask), + out_grad_layout(out_grad), + filter_grad_meta(filter_grad_meta) {} + +OprImpl::AlgoBase::ExecArgs::ExecArgs(OprImpl* opr, _megdnn_tensor_in im, + _megdnn_tensor_in offset, + _megdnn_tensor_in mask, + _megdnn_tensor_in out_grad, + _megdnn_tensor_out filter_grad, + _megdnn_workspace ws) + : SizeArgs(opr, im.layout, offset.layout, mask.layout, out_grad.layout, + filter_grad.layout), + im_tensor(im), + offset_tensor(offset), + mask_tensor(mask), + out_grad_tensor(out_grad), + filter_grad_tensor(filter_grad), + workspace(ws) {} + +std::string OprImpl::AlgoBase::SizeArgs::to_string() const { + auto&& fm = filter_grad_meta; + MEGDNN_MARK_USED_VAR(fm); + return ssprintf("im=%s, offset=%s, mask=%s, dst_grad=%s, " + "filter_grad=%u{%u,%u,%u,%u}," + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, " + "dtype=%s,%s", + megdnn_layout_msg(im_layout).c_str(), + megdnn_layout_msg(offset_layout).c_str(), + megdnn_layout_msg(mask_layout).c_str(), + megdnn_layout_msg(out_grad_layout).c_str(), fm.group, + fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + fm.dilation[0], fm.dilation[1], !fm.should_flip, + im_layout.dtype.name(), out_grad_layout.dtype.name()); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/bwd_flt/algo.h b/dnn/src/cuda/deformable_conv/bwd_flt/algo.h new file mode 100644 index 00000000..ad9a9329 --- /dev/null +++ b/dnn/src/cuda/deformable_conv/bwd_flt/algo.h @@ -0,0 +1,116 @@ +/** + * \file dnn/src/cuda/deformable_conv/bwd_flt/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
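+ *
+ * AlgoMatmul (declared below, implemented in bwd_flt/algo_matmul.cpp) computes
+ * the filter gradient roughly as follows (shapes as produced by
+ * get_matmul_layout(); illustration only):
+ *
+ *   im2col with offsets/mask  ->  col  [g, icpg*FH*FW, N*OH*OW]       (B)
+ *   out_grad [N, OC, OH*OW]   --relayout-->  [g, ocpg, N*OH*OW]       (A)
+ *   batched GEMM with transposeB:  A * B^T -> filter_grad [g, ocpg, icpg*FH*FW]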
+ */ + +#pragma once + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +#include "src/cuda/handle.h" + +#include "src/cuda/deformable_conv/opr_impl.h" + +namespace megdnn { +namespace cuda { + +class DeformableConvBackwardFilterImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + DeformableConvBackwardFilterImpl* opr; + HandleImpl* handle; + const TensorLayout& im_layout; + const TensorLayout& offset_layout; + const TensorLayout& mask_layout; + const TensorLayout& out_grad_layout; + CanonizedFilterMeta filter_grad_meta; + + std::string to_string() const; + + SizeArgs(DeformableConvBackwardFilterImpl* opr, const TensorLayout& im, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& filter_grad); + + SizeArgs(DeformableConvBackwardFilterImpl* opr, const TensorLayout& im, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, + const CanonizedFilterMeta& filter_grad_meta); + }; + struct ExecArgs : public SizeArgs { + const TensorND im_tensor, offset_tensor, mask_tensor, out_grad_tensor; + TensorND filter_grad_tensor; + Workspace workspace; + + ExecArgs(DeformableConvBackwardFilterImpl* opr, _megdnn_tensor_in im, + _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_in out_grad, _megdnn_tensor_out filter_grad, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "deformable_conv bwd_flt algo %s: required workspace %zu " + "bytes, got %zu", + name(), req, workspace.size); + return *this; + } +}; + +class DeformableConvBackwardFilterImpl::AlgoMatmul final : public AlgoBase { +private: + static void get_matmul_layout(const SizeArgs& args, TensorLayout& al, + TensorLayout& bl, TensorLayout& cl); + static WorkspaceBundle get_bundle(const SizeArgs& args); + +public: + AlgoMatmul() {} + + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { return "AlgoMatmul"; } +}; + +class DeformableConvBackwardFilterImpl::AlgoPack { + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + + AlgoMatmul algo_matmul; + //! 
all algorithms + std::vector all_algos; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/bwd_flt/algo_matmul.cpp b/dnn/src/cuda/deformable_conv/bwd_flt/algo_matmul.cpp new file mode 100644 index 00000000..4efd4204 --- /dev/null +++ b/dnn/src/cuda/deformable_conv/bwd_flt/algo_matmul.cpp @@ -0,0 +1,158 @@ +/** + * \file dnn/src/cuda/deformable_conv/bwd_flt/algo_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/utils.h" + +#include "src/cuda/deformable_conv/bwd_flt/algo.h" +#include "src/cuda/deformable_conv/kimpl/deformable_conv.cuh" +#include "src/cuda/deformable_conv/opr_impl.h" + +using namespace megdnn; +using namespace cuda; + +using Algo = DeformableConvBackwardFilterImpl::AlgoMatmul; +using OprParam = DeformableConvBase::Param; + +namespace { +deformable_conv::Param create_param(const Algo::SizeArgs& args, + const OprParam& opr_param, + cublasHandle_t handle, + cudaStream_t stream) { + deformable_conv::Param p; + auto&& fm = args.filter_grad_meta; + + p.handle = handle; + p.stream = stream; + p.group = fm.group; + p.deformable_group = fm.deformable_group; + p.batch_sz = args.im_layout[0]; + + p.IC = args.im_layout[1]; + p.IH = args.im_layout[2]; + p.IW = args.im_layout[3]; + p.OC = args.out_grad_layout[1]; + p.OH = args.out_grad_layout[2]; + p.OW = args.out_grad_layout[3]; + p.FH = fm.spatial[0]; + p.FW = fm.spatial[1]; + p.PH = opr_param.pad_h; + p.PW = opr_param.pad_w; + p.SH = opr_param.stride_h; + p.SW = opr_param.stride_w; + p.DH = opr_param.dilate_h; + p.DW = opr_param.dilate_w; + + p.icpg = p.IC / p.group; + p.icpdg = p.IC / p.deformable_group; + p.ocpg = p.OC / p.group; + p.ocpdg = p.OC / p.deformable_group; + + return p; +} +}; // anonymous namespace + +bool Algo::is_available(const SizeArgs&) const { + return true; +} + +void Algo::get_matmul_layout(const SizeArgs& args, TensorLayout& al, + TensorLayout& bl, TensorLayout& cl) { + auto&& dt = args.im_layout.dtype; + auto&& fm = args.filter_grad_meta; + size_t batch_sz = args.im_layout[0], OH = args.out_grad_layout[2], + OW = args.out_grad_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; + + size_t M = fm.ocpg, K = OH * OW * batch_sz, N = fm.icpg * FH * FW, + batch = fm.group; + + al = {{batch, M, K}, dt}; + bl = {{batch, N, K}, dt}; + cl = {{batch, M, N}, dt}; +} + +WorkspaceBundle Algo::get_bundle(const SizeArgs& args) { + auto&& fm = args.filter_grad_meta; + auto OH = args.out_grad_layout[2], OW = args.out_grad_layout[3]; + auto FH = fm.spatial[0], FW = fm.spatial[1]; + size_t IC = fm.group * fm.icpg, OC = args.out_grad_layout[1]; + auto batch_sz = args.im_layout[0]; + + auto&& bmm_opr = args.handle->create_operator(); + TensorLayout al, bl, cl; + + get_matmul_layout(args, al, bl, cl); + bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + bmm_opr->param().transposeB = true; + + size_t col_ws = batch_sz * IC * FH * FW * OH * OW * sizeof(float); + size_t out_grad_ws = batch_sz * OC * OH * OW * sizeof(float); + size_t bmm_ws = bmm_opr->get_workspace_in_bytes(al, bl, cl); + + return {nullptr, {col_ws, out_grad_ws, bmm_ws}}; +} + +size_t Algo::get_workspace_in_bytes(const SizeArgs& 
args) const { + return get_bundle(args).total_size_in_bytes(); +} + +void Algo::exec(const ExecArgs& args) const { + auto&& opr = args.opr; + auto&& param = opr->param(); + auto&& handle = concrete_handle(opr->handle()); + + auto p = create_param(args, param, handle->cublas_handle(), + handle->stream()); + + auto bundle = get_bundle(args); + bundle.set(args.workspace.raw_ptr); + + const float* dev_im = args.im_tensor.ptr(); + const float* dev_offset = args.offset_tensor.ptr(); + const float* dev_mask = args.mask_tensor.ptr(); + float* dev_out_grad = args.out_grad_tensor.ptr(); + float* dev_filter_grad = args.filter_grad_tensor.ptr(); + + float* col_ws = static_cast(bundle.get(0)); + float* out_grad_ws = static_cast(bundle.get(1)); + void* bmm_ws = bundle.get(2); + + // im2col + deformable_conv::im2col(dev_im, dev_offset, dev_mask, col_ws, p); + // relayout + auto&& dt = args.im_layout.dtype; + size_t dim0 = p.batch_sz, dim1 = p.OC, dim2 = p.OH * p.OW; + TensorLayout C2l({dim0, dim1, dim2}, dt), C3l = C2l; + C3l.stride[0] = dim2; + C3l.stride[1] = dim0 * dim2; + C3l.stride[2] = 1; + TensorND C2(dev_out_grad, C2l); + TensorND C3(out_grad_ws, C3l); + + args.handle->relayout_opr()->exec(C2, C3); + // matmul + TensorLayout al, bl, cl; + get_matmul_layout(args, al, bl, cl); + + TensorND A(static_cast(out_grad_ws), al), + B(static_cast(col_ws), bl), + C(static_cast(dev_filter_grad), cl); + + size_t bmm_ws_size = bundle.get_size(2); + auto&& bmm_opr = args.handle->create_operator(); + + bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + bmm_opr->param().transposeB = true; + + bmm_opr->exec( + A, B, C, + Workspace(static_cast(bmm_ws), bmm_ws_size)); +} +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/fwd/algo.cpp b/dnn/src/cuda/deformable_conv/fwd/algo.cpp new file mode 100644 index 00000000..f26c80e3 --- /dev/null +++ b/dnn/src/cuda/deformable_conv/fwd/algo.cpp @@ -0,0 +1,80 @@ +/** + * \file dnn/src/cuda/deformable_conv/fwd/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/common/utils.h" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +#include "src/cuda/deformable_conv/fwd/algo.h" + +using namespace megdnn; +using namespace cuda; + +using OprImpl = DeformableConvForwardImpl; + +OprImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&algo_matmul); +} + +OprImpl::AlgoPack OprImpl::sm_algo_pack; + +OprImpl::AlgoBase::SizeArgs::SizeArgs(OprImpl* o, const TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst) + : SizeArgs(o, im, + o->make_canonized_filter_meta(im.ndim, filter, offset), + offset, mask, dst) {} + +OprImpl::AlgoBase::SizeArgs::SizeArgs(OprImpl* o, const TensorLayout& im, + const CanonizedFilterMeta& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst) + : opr(o), + handle(concrete_handle(o->handle())), + im_layout(im), + filter_meta(filter), + offset_layout(offset), + mask_layout(mask), + dst_layout(dst) {} + +OprImpl::AlgoBase::ExecArgs::ExecArgs(OprImpl* opr, _megdnn_tensor_in im, + _megdnn_tensor_in filter, + _megdnn_tensor_in offset, + _megdnn_tensor_in mask, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) + : SizeArgs(opr, im.layout, filter.layout, offset.layout, mask.layout, + dst.layout), + im_tensor(im), + filter_tensor(filter), + offset_tensor(offset), + mask_tensor(mask), + dst_tensor(dst), + workspace(workspace) {} + +std::string OprImpl::AlgoBase::SizeArgs::to_string() const { + auto&& fm = filter_meta; + MEGDNN_MARK_USED_VAR(fm); + return ssprintf( + "im=%s, filter=%u{%u,%u,%u,%u}, offset=%s, mask=%s, dst=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", + im_layout.to_string().c_str(), fm.group, fm.ocpg, fm.icpg, + fm.spatial[0], fm.spatial[1], offset_layout.to_string().c_str(), + mask_layout.to_string().c_str(), dst_layout.to_string().c_str(), + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + fm.dilation[0], fm.dilation[1], !fm.should_flip, + im_layout.dtype.name(), dst_layout.dtype.name()); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/fwd/algo.h b/dnn/src/cuda/deformable_conv/fwd/algo.h new file mode 100644 index 00000000..768b49e5 --- /dev/null +++ b/dnn/src/cuda/deformable_conv/fwd/algo.h @@ -0,0 +1,110 @@ +/** + * \file dnn/src/cuda/deformable_conv/fwd/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
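+ *
+ * AlgoMatmul (declared below, implemented in fwd/algo_matmul.cpp) computes the
+ * forward result roughly as follows (shapes as produced by get_matmul_layout();
+ * illustration only):
+ *
+ *   im2col with offsets/mask  ->  col  [g, icpg*FH*FW, N*OH*OW]       (B)
+ *   filter                     viewed as  [g, ocpg, icpg*FH*FW]       (A)
+ *   batched GEMM:  A * B  ->  [g, ocpg, N*OH*OW],
+ *   which is then relayouted from [OC, N, OH*OW] into the [N, OC, OH, OW] dst.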
+ */ + +#pragma once + +#include "megdnn/oprs.h" + +#include "src/cuda/deformable_conv/opr_impl.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +class DeformableConvForwardImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + DeformableConvForwardImpl* opr; + HandleImpl* handle; + const TensorLayout& im_layout; + CanonizedFilterMeta filter_meta; + const TensorLayout& offset_layout; + const TensorLayout& mask_layout; + const TensorLayout& dst_layout; + + std::string to_string() const; + SizeArgs(DeformableConvForwardImpl* opr, const TensorLayout& im, + const TensorLayout& filter, const TensorLayout& offset, + const TensorLayout& mask, const TensorLayout& dst); + SizeArgs(DeformableConvForwardImpl* opr, const TensorLayout& im, + const CanonizedFilterMeta& filter, const TensorLayout& offset, + const TensorLayout& mask, const TensorLayout& dst); + }; + struct ExecArgs : public SizeArgs { + const TensorND &im_tensor, filter_tensor, offset_tensor, mask_tensor, + dst_tensor; + Workspace workspace; + + ExecArgs(DeformableConvForwardImpl* opr, _megdnn_tensor_in im, + _megdnn_tensor_in filter, _megdnn_tensor_in offset, + _megdnn_tensor_in mask, _megdnn_tensor_out dst, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "deformable_conv fwd algo %s: required workspace %zu " + "bytes, got %zu", + name(), req, workspace.size); + return *this; + } +}; + +class DeformableConvForwardImpl::AlgoMatmul final : public AlgoBase { +private: + static void get_matmul_layout(const SizeArgs& args, TensorLayout& al, + TensorLayout& bl, TensorLayout& cl); + static WorkspaceBundle get_bundle(const SizeArgs& args); + +public: + AlgoMatmul(){}; + + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { return "AlgoMatmul"; } +}; + +class DeformableConvForwardImpl::AlgoPack { + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + AlgoMatmul algo_matmul; + //! all algorithms + std::vector all_algos; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/fwd/algo_matmul.cpp b/dnn/src/cuda/deformable_conv/fwd/algo_matmul.cpp new file mode 100644 index 00000000..b3b49e16 --- /dev/null +++ b/dnn/src/cuda/deformable_conv/fwd/algo_matmul.cpp @@ -0,0 +1,153 @@ +/** + * \file dnn/src/cuda/deformable_conv/fwd/algo_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/handle.h" + +#include "src/cuda/batched_matrix_mul/algo.h" +#include "src/cuda/deformable_conv/fwd/algo.h" +#include "src/cuda/deformable_conv/kimpl/deformable_conv.cuh" + +using namespace megdnn; +using namespace cuda; + +using Algo = DeformableConvForwardImpl::AlgoMatmul; +using OprParam = DeformableConvBase::Param; + +namespace { +deformable_conv::Param create_param(const Algo::SizeArgs& args, + const OprParam& opr_param, + cublasHandle_t handle, + cudaStream_t stream) { + deformable_conv::Param p; + auto&& fm = args.filter_meta; + + p.handle = handle; + p.stream = stream; + p.group = fm.group; + p.deformable_group = fm.deformable_group; + p.batch_sz = args.im_layout[0]; + + p.IC = args.im_layout[1]; + p.IH = args.im_layout[2]; + p.IW = args.im_layout[3]; + p.OC = args.dst_layout[1]; + p.OH = args.dst_layout[2]; + p.OW = args.dst_layout[3]; + p.FH = fm.spatial[0]; + p.FW = fm.spatial[1]; + p.PH = opr_param.pad_h; + p.PW = opr_param.pad_w; + p.SH = opr_param.stride_h; + p.SW = opr_param.stride_w; + p.DH = opr_param.dilate_h; + p.DW = opr_param.dilate_w; + + p.icpg = p.IC / p.group; + p.icpdg = p.IC / p.deformable_group; + p.ocpg = p.OC / p.group; + p.ocpdg = p.OC / p.deformable_group; + + return p; +} +}; // anonymous namespace + +bool Algo::is_available(const SizeArgs&) const { + return true; +} + +void Algo::get_matmul_layout(const SizeArgs& args, TensorLayout& al, + TensorLayout& bl, TensorLayout& cl) { + auto&& dt = args.im_layout.dtype; + auto&& fm = args.filter_meta; + size_t batch_sz = args.im_layout[0], OH = args.dst_layout[2], + OW = args.dst_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; + + size_t M = fm.ocpg, N = OH * OW * batch_sz, K = fm.icpg * FH * FW, + batch = fm.group; + al = {{batch, M, K}, dt}; + bl = {{batch, K, N}, dt}; + cl = {{batch, M, N}, dt}; +} + +WorkspaceBundle Algo::get_bundle(const SizeArgs& args) { + auto&& fm = args.filter_meta; + size_t batch_sz = args.im_layout[0], IC = fm.group * fm.icpg, + OC = args.dst_layout[1], OH = args.dst_layout[2], + OW = args.dst_layout[3], FH = fm.spatial[0], FW = fm.spatial[1]; + + auto&& bmm_opr = args.handle->create_operator(); + TensorLayout al, bl, cl; + + get_matmul_layout(args, al, bl, cl); + bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + + size_t col_ws = batch_sz * IC * FH * FW * OH * OW * sizeof(float); + size_t bmm_ws = bmm_opr->get_workspace_in_bytes(al, bl, cl); + size_t result_ws = batch_sz * OC * OH * OW * sizeof(float); + + return {nullptr, {col_ws, bmm_ws, result_ws}}; +} + +size_t Algo::get_workspace_in_bytes(const SizeArgs& args) const { + return get_bundle(args).total_size_in_bytes(); +} + +void Algo::exec(const ExecArgs& args) const { + auto&& opr = args.opr; + auto&& param = opr->param(); + auto&& handle = concrete_handle(opr->handle()); + + auto p = create_param(args, param, handle->cublas_handle(), + handle->stream()); + + const float* dev_im = args.im_tensor.ptr(); + float* dev_filter = args.filter_tensor.ptr(); + const float* dev_offset = args.offset_tensor.ptr(); + const float* dev_mask = args.mask_tensor.ptr(); + float* dev_out = args.dst_tensor.ptr(); + void* dev_ws = args.workspace.raw_ptr; + + auto bundle = get_bundle(args); + bundle.set(dev_ws); + void* col_ws = bundle.get(0); + void* bmm_ws = bundle.get(1); + void* 
result_ws = bundle.get(2); + // im2col + deformable_conv::im2col(dev_im, dev_offset, dev_mask, + static_cast(col_ws), p); + // matmul + TensorLayout al, bl, cl; + get_matmul_layout(args, al, bl, cl); + + TensorND A(static_cast(dev_filter), al), + B(static_cast(col_ws), bl), + C(static_cast(result_ws), cl); + + size_t bmm_ws_size = bundle.get_size(1); + auto&& bmm_opr = args.handle->create_operator(); + bmm_opr->param().compute_mode = param::MatrixMul::ComputeMode::DEFAULT; + bmm_opr->exec( + A, B, C, + Workspace(static_cast(bmm_ws), bmm_ws_size)); + // relayout + auto&& dt = args.im_layout.dtype; + size_t dim0 = p.OC, dim1 = p.batch_sz, dim2 = p.OH * p.OW; + TensorLayout C2l({dim0, dim1, dim2}, dt), C3l = C2l; + C3l.stride[0] = dim2; + C3l.stride[1] = dim0 * dim2; + C3l.stride[2] = 1; + TensorND C2(result_ws, C2l); + TensorND C3(dev_out, C3l); + + args.handle->relayout_opr()->exec(C2, C3); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/kimpl/deformable_conv.cu b/dnn/src/cuda/deformable_conv/kimpl/deformable_conv.cu new file mode 100644 index 00000000..6ac9ed3c --- /dev/null +++ b/dnn/src/cuda/deformable_conv/kimpl/deformable_conv.cu @@ -0,0 +1,375 @@ +/** + * \file dnn/src/cuda/deformable_conv/kimpl/deformable_conv.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/query_blocksize.cuh" +#include "src/cuda/deformable_conv/kimpl/deformable_conv.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace deformable_conv; + +namespace { + +__device__ float dmcn_im2col_bilinear(const float* bottom_data, + const int data_width, const int height, + const int width, float h, float w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +__device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w, + const int h, const int w, + const int height, const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * 
(argmax_w + 1 - w); + return weight; +} + +__device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w, + const int height, const int width, + const float* im_data, + const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +__global__ void deformable_im2col(Param p, const float* im, const float* offset, + const float* mask, float* col) { + size_t n = blockIdx.y; + const size_t N = p.batch_sz; + const size_t loops = p.IC * p.OH * p.OW; + const size_t im_bs = p.IC * p.IH * p.IW; + const size_t offset_bs = 2 * p.deformable_group * p.FH * p.FW * p.OH * p.OW; + const size_t mask_bs = p.deformable_group * p.FH * p.FW * p.OH * p.OW; + + im = &im[n * im_bs]; + offset = &offset[n * offset_bs]; + mask = &mask[n * mask_bs]; + + KERN_FOR(idx, loops) { + const int ow = idx % p.OW; + const int oh = (idx / p.OW) % p.OH; + const int ic = (idx / p.OW / p.OH); + const int dg = ic / p.icpdg; + const int ih = oh * p.SH - p.PH; + const int iw = ow * p.SW - p.PW; + + const float* im_ptr = &im[ic * p.IH * p.IW]; + const float* offset_ptr = + &offset[(dg * 2 * p.FH * p.FW * p.OH + oh) * p.OW + ow]; + const float* mask_ptr = + &mask[(dg * p.FH * p.FW * p.OH + oh) * p.OW + ow]; + float* col_ptr = + &col[((((ic * p.FH * p.FW) * N + n) * p.OH + oh) * p.OW + ow)]; + + for (int i = 0; i < p.FH; ++i) + for (int j = 0; j < p.FW; ++j) { + const float off_h = + offset_ptr[(2 * (i * p.FW + j)) * p.OH * p.OW]; + const float off_w = + offset_ptr[(2 * (i * p.FW + j) + 1) * p.OH * p.OW]; + const float m = mask_ptr[(i * p.FW + j) * p.OH * p.OW]; + + float val = 0.f; + const float h = ih + i * p.DH + off_h; + const float w = iw + j * p.DW + off_w; + if (h > -1 && h < p.IH && w > -1 && w < p.IW) + val = dmcn_im2col_bilinear(im_ptr, p.IW, p.IH, p.IW, h, w); + col_ptr[(i * p.FW + j) * N * p.OH * p.OW] = val * m; + } + } +} + +__global__ void deformable_col2im(Param p, const float* col, + const float* offset, const float* mask, + float* im) { + size_t dg = 
blockIdx.y % p.deformable_group; + size_t n = blockIdx.y / p.deformable_group; + const size_t loops = p.FH * p.FW * p.OH * p.OW; + const size_t N = p.batch_sz; + const size_t im_bs = p.IC * p.IH * p.IW; + const size_t offset_bs = 2 * p.deformable_group * p.FH * p.FW * p.OH * p.OW; + const size_t mask_bs = p.deformable_group * p.FH * p.FW * p.OH * p.OW; + + offset = &offset[n * offset_bs]; + mask = &mask[n * mask_bs]; + im = &im[n * im_bs]; + + KERN_FOR(idx, loops) { + const int ow = (idx) % p.OW; + const int oh = (idx / p.OW) % p.OH; + const int fw = (idx / p.OW / p.OH) % p.FW; + const int fh = (idx / p.OW / p.OH / p.FW) % p.FH; + + const float* offset_ptr = &offset[dg * 2 * p.FH * p.FW * p.OH * p.OW]; + const float* mask_ptr = &mask[dg * p.FH * p.FW * p.OH * p.OW]; + + const int off_h_idx = ((2 * (fh * p.FW + fw)) * p.OH + oh) * p.OW + ow; + const int off_w_idx = + ((2 * (fh * p.FW + fw) + 1) * p.OH + oh) * p.OW + ow; + const int mask_idx = ((fh * p.FW + fw) * p.OH + oh) * p.OW + ow; + + const float off_h = offset_ptr[off_h_idx]; + const float off_w = offset_ptr[off_w_idx]; + const float m = mask_ptr[mask_idx]; + + const size_t ic_l = dg * p.icpdg, ic_r = (dg + 1) * p.icpdg; + + for (int ic = ic_l; ic < ic_r; ++ic) { + const int ih = oh * p.SH - p.PH; + const int iw = ow * p.SW - p.PW; + + const int col_idx = + (((((ic * p.FH) + fh) * p.FW + fw) * N + n) * p.OH + oh) * + p.OW + + ow; + const float top_grad = col[col_idx] * m; + + const float h = ih + fh * p.DH + off_h; + const float w = iw + fw * p.DW + off_w; + + const int h_hat = (int)h, w_hat = (int)w; +#pragma unroll + for (int dy = -2; dy <= 2; + dy++) { // use 0-1 is better, same for dx +#pragma unroll + for (int dx = -2; dx <= 2; dx++) { + if (h_hat + dy >= 0 && h_hat + dy < p.IH && + w_hat + dx >= 0 && w_hat + dx < p.IW && + abs(h - (h_hat + dy)) < 1 && + abs(w - (w_hat + dx)) < 1) { + int bottom_pos = + (ic * p.IH + h_hat + dy) * p.IW + w_hat + dx; + float weight = dmcn_get_gradient_weight( + h, w, h_hat + dy, w_hat + dx, p.IH, p.IW); + atomicAdd(&im[bottom_pos], weight * top_grad); + } + } + } + } + } +} + +__global__ void deformable_col2coord(Param p, const float* im, const float* col, + const float* offset, const float* mask, + float* offset_grad, float* mask_grad) { + size_t n = blockIdx.y; + const size_t N = p.batch_sz; + const size_t loops = p.deformable_group * p.FH * p.FW * 2 * p.OH * p.OW; + const size_t im_bs = p.IC * p.IH * p.IW; + const size_t offset_bs = p.deformable_group * p.FH * p.FW * 2 * p.OH * p.OW; + const size_t mask_bs = p.deformable_group * p.FH * p.FW * p.OH * p.OW; + + im = &im[n * im_bs]; + offset = &offset[n * offset_bs]; + mask = &mask[n * mask_bs]; + + offset_grad = &offset_grad[n * offset_bs]; + mask_grad = &mask_grad[n * mask_bs]; + + KERN_FOR(idx, loops) { + float val = 0, mval = 0; + const int hw = idx % 2; + const int ow = (idx / 2) % p.OW; + const int oh = (idx / 2 / p.OW) % p.OH; + const int fw = (idx / 2 / p.OW / p.OH) % p.FW; + const int fh = (idx / 2 / p.OW / p.OH / p.FW) % p.FH; + const int dg = + (idx / 2 / p.OW / p.OH / p.FW / p.FH) % p.deformable_group; + + const int ih = oh * p.SH - p.PH; + const int iw = ow * p.SW - p.PW; + + const float* offset_ptr = &offset[dg * 2 * p.FH * p.FW * p.OH * p.OW]; + const float* mask_ptr = &mask[dg * p.FH * p.FW * p.OH * p.OW]; + + float* offset_grad_ptr = + &offset_grad[dg * 2 * p.FH * p.FW * p.OH * p.OW]; + float* mask_grad_ptr = &mask_grad[dg * p.FH * p.FW * p.OH * p.OW]; + + const int offset_h_idx = + ((2 * (fh * p.FW + fw)) * p.OH + oh) * p.OW + ow; 
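+        // Within one deformable group the offset map is laid out as
+        // [2 * FH * FW, OH, OW]: channel 2*(fh*FW+fw) holds the vertical (h)
+        // offset of filter tap (fh, fw) and channel 2*(fh*FW+fw)+1 the
+        // horizontal (w) offset; the mask map is laid out as [FH*FW, OH, OW].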
+ const int offset_w_idx = + ((2 * (fh * p.FW + fw) + 1) * p.OH + oh) * p.OW + ow; + const int mask_idx = ((fh * p.FW + fw) * p.OH + oh) * p.OW + ow; + const int offset_grad_idx = (hw == 0) ? offset_h_idx : offset_w_idx; + + const float off_h = offset_ptr[offset_h_idx]; + const float off_w = offset_ptr[offset_w_idx]; + const float m = mask_ptr[mask_idx]; + + float h = ih + fh * p.DH + off_h; + float w = iw + fw * p.DW + off_w; + + const int ic_l = dg * p.icpdg, ic_r = (dg + 1) * p.icpdg; + + for (int ic = ic_l; ic < ic_r; ++ic) { + const float* im_ptr = &im[ic * p.IH * p.IW]; + const int col_idx = + (((((ic * p.FH + fh) * p.FW + fw) * N + n) * p.OH + oh) * + p.OW + + ow); + const float col_grad = col[col_idx]; + + if (h <= -1 || w <= -1 || h >= p.IH || w >= p.IW) { + h = w = -2; + } else if (hw % 2 == 0) { + mval += col_grad * + dmcn_im2col_bilinear(im_ptr, p.IW, p.IH, p.IW, h, w); + } + const float top_grad = col_grad * m; + const float weight = dmcn_get_coordinate_weight(h, w, p.IH, p.IW, + im_ptr, p.IW, hw); + val += weight * top_grad; + } + + offset_grad_ptr[offset_grad_idx] = val; + if (hw % 2 ==0) { + mask_grad_ptr[mask_idx] = mval; + } + } +} + +} // namespace + +namespace megdnn { +namespace cuda { +namespace deformable_conv { + +void im2col(const float* dev_im, const float* dev_offset, const float* dev_mask, + float* dev_col, const Param& p) { + dim3 grid; + size_t loops = p.IC * p.OH * p.OW; + int nr_thds = query_blocksize_for_kernel(deformable_im2col); + + grid.x = DIVUP(loops, nr_thds), grid.y = p.batch_sz; + + deformable_im2col<<>>(p, dev_im, dev_offset, + dev_mask, dev_col); + after_kernel_launch(); +} + +void col2im(const float* dev_col, const float* dev_offset, + const float* dev_mask, float* dev_im_grad, const Param& p) { + dim3 grid; + size_t loops = p.FH * p.FW * p.OH * p.OW; + int nr_thds = query_blocksize_for_kernel(deformable_col2im); + + grid.x = DIVUP(loops, nr_thds), grid.y = p.batch_sz * p.deformable_group; + + deformable_col2im<<>>(p, dev_col, dev_offset, + dev_mask, dev_im_grad); + after_kernel_launch(); +} + +void col2im_coord(const float* dev_im, const float* dev_col, + const float* dev_offset, const float* dev_mask, + float* dev_offset_grad, float* dev_mask_grad, + const Param& p) { + dim3 grid; + size_t loops = 2 * p.FH * p.FW * p.OH * p.OW * p.deformable_group; + int nr_thds = query_blocksize_for_kernel(deformable_col2coord); + + grid.x = DIVUP(loops, nr_thds); + grid.y = p.batch_sz; + + deformable_col2coord<<>>( + p, dev_im, dev_col, dev_offset, dev_mask, dev_offset_grad, + dev_mask_grad); + after_kernel_launch(); +} + +} // namespace deformable_conv +} // namespace cuda +} // namespace megdnn + +// vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/deformable_conv/kimpl/deformable_conv.cuh b/dnn/src/cuda/deformable_conv/kimpl/deformable_conv.cuh new file mode 100644 index 00000000..886fdd38 --- /dev/null +++ b/dnn/src/cuda/deformable_conv/kimpl/deformable_conv.cuh @@ -0,0 +1,53 @@ +/** + * \file dnn/src/cuda/deformable_conv/kimpl/deformable_conv.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
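+ *
+ * This header declares the Param descriptor and the host-side launchers
+ * im2col(), col2im() and col2im_coord() used by the deformable convolution
+ * CUDA kernels.
+ *
+ * Rough forward-pass call order, as a sketch only (the caller is assumed to
+ * have filled Param with shapes, conv parameters, stream and cuBLAS handle;
+ * the variable names below are illustrative):
+ *
+ *     deformable_conv::Param p;                // batch_sz, IC/OC, FH/FW, ...
+ *     p.stream = stream; p.handle = cublas_handle;
+ *     deformable_conv::im2col(dev_im, dev_offset, dev_mask, dev_col, p);
+ *     // the matmul-based AlgoMatmul algorithm (declared in opr_impl.h) is
+ *     // then expected to combine the filter with dev_col into the output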
+ */ + +#pragma once + +#include "megdnn/basic_types.h" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace deformable_conv { + +struct Param { + int batch_sz; + int group; + int deformable_group; + int icpg; + int icpdg; + int ocpg; + int ocpdg; + int IC, IH, IW; + int OC, OH, OW; + int FH, FW; + int PH, PW; + int SH, SW; + int DH, DW; + cudaStream_t stream; + cublasHandle_t handle; +}; + +void im2col(const float* dev_im, const float* dev_offset, const float* dev_mask, + float* dev_col, const Param& p); + +void col2im(const float* dev_col, const float* dev_offset, + const float* dev_mask, float* dev_im_grad, const Param& p); + +void col2im_coord(const float* dev_im, const float* dev_col, + const float* dev_offset, const float* dev_mask, + float* dev_offset_grad, float* mask_grad, const Param& p); + +} // namespace deformable_conv +} // namespace cuda +} // namespace megdnn + +// vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/deformable_conv/opr_impl.cpp b/dnn/src/cuda/deformable_conv/opr_impl.cpp new file mode 100644 index 00000000..2fb2d2d6 --- /dev/null +++ b/dnn/src/cuda/deformable_conv/opr_impl.cpp @@ -0,0 +1,234 @@ +/** + * \file dnn/src/cuda/deformable_conv/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/deformable_conv/fwd/algo.h" +#include "src/cuda/deformable_conv/bwd_flt/algo.h" +#include "src/cuda/deformable_conv/bwd_data/algo.h" + +#include "src/common/algo_chooser.h" +#include "src/common/utils.h" +#include "src/cuda/deformable_conv/opr_impl.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +using Fwd = DeformableConvForwardImpl; +using BwdFlt = DeformableConvBackwardFilterImpl; +using BwdData = DeformableConvBackwardDataImpl; + +using AlgoFwd = Fwd::Algorithm; +using AlgoBwdFlt = BwdFlt::Algorithm; +using AlgoBwdData = BwdData::Algorithm; + +/* ============== Fwd Implementation ============== */ + +size_t Fwd::get_workspace_in_bytes(const TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst) { + auto algo = get_algorithm(this, im, filter, offset, mask, dst); + return algo->get_workspace_in_bytes({this, im, filter, offset, mask, dst}); +} + +std::vector Fwd::get_all_algorithms(const TensorLayout& /* im */, + const TensorLayout& /* filter */, + const TensorLayout& /* offset */, + const TensorLayout& /* mask */, + const TensorLayout& /* dst */) { + std::vector algos; + + for (auto i : sm_algo_pack.all_algos) + algos.push_back(static_cast(i)); + + return algos; +} + +AlgoFwd* Fwd::get_algorithm_heuristic(const TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) { + auto fm = make_canonized_filter_meta(im.ndim, filter, offset); + return get_algorithm_heuristic(im, fm, offset, mask, dst, + workspace_limit_in_bytes, reproducible); +} + +AlgoFwd* Fwd::get_algorithm_heuristic(const TensorLayout& im, + const CanonizedFilterMeta& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst, + size_t 
workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, im, filter, offset, mask, dst); + if (sm_algo_pack.algo_matmul.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.algo_matmul; + } + megdnn_throw(megdnn_mangle( + ssprintf("no %s deformable conv fwd algorithm with args(%s) and " + "workspace limit (%zu bytes)", + reproducible ? "reproducible" : "usable", + args.to_string().c_str(), workspace_limit_in_bytes))); +} + +const char* Fwd::get_algorithm_set_name() const { + return "DEFORMABLE_CONV_FWD_CUDA"; +}; + +void Fwd::exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, + _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_out out, _megdnn_workspace workspace) { + auto algo = get_algorithm(this, im.layout, filter.layout, offset.layout, + mask.layout, out.layout); + + AlgoBase::ExecArgs args(this, im, filter, offset, mask, out, workspace); + + algo->check_workspace(args, workspace).exec(args); + return; +} + +/* ============== BwdFlt Implementation ============== */ + +std::vector BwdFlt::get_all_algorithms(const TensorLayout& /* im */, + const TensorLayout& /* offset */, const TensorLayout& /* mask */, + const TensorLayout& /* out_grad */, const TensorLayout& /* filter_grad */) { + std::vector algos; + for (auto i : sm_algo_pack.all_algos) + algos.push_back(static_cast(i)); + return algos; +} + +AlgoBwdFlt* BwdFlt::get_algorithm_heuristic( + const TensorLayout& im, const TensorLayout& offset, + const TensorLayout& mask, const TensorLayout& out_grad, + const TensorLayout& filter_grad, + size_t workspace_limit_in_bytes, bool reproducible) { + auto fm = make_canonized_filter_meta(im.ndim, filter_grad, offset); + return get_algorithm_heuristic(im, offset, mask, out_grad, fm, + workspace_limit_in_bytes, reproducible); +} + +AlgoBwdFlt* BwdFlt::get_algorithm_heuristic( + const TensorLayout& im, const TensorLayout& offset, + const TensorLayout& mask, const TensorLayout& out_grad, + const CanonizedFilterMeta& filter_grad, + size_t workspace_limit_in_bytes, bool reproducible) { + AlgoBase::SizeArgs args(this, im, offset, mask, out_grad, filter_grad); + if (sm_algo_pack.algo_matmul.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.algo_matmul; + } + megdnn_throw(megdnn_mangle(ssprintf( + "no %s deformable conv bwd filter algorithm with args(%s) and " + "workspace limit (%zu bytes)", + reproducible ? 
"reproducible" : "usable", args.to_string().c_str(), + workspace_limit_in_bytes))); +} + +size_t BwdFlt::get_workspace_in_bytes( + const TensorLayout& im, const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& filter_grad) { + AlgoBase::SizeArgs args(); + auto algo = get_algorithm(this, im, offset, mask, out_grad, filter_grad); + return algo->get_workspace_in_bytes({this, im, offset, mask, out_grad, filter_grad}); +} + +const char* BwdFlt::get_algorithm_set_name() const { + return "DEFORMABLE_CONV_BWD_FILTER_CUDA"; +}; + +void BwdFlt::exec(_megdnn_tensor_in im, _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_in out_grad, _megdnn_tensor_out filter_grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, im, offset, mask, out_grad, filter_grad, workspace); + auto algo = get_algorithm(this, im.layout, offset.layout, mask.layout, out_grad.layout, + filter_grad.layout); + algo->check_workspace(args, workspace).exec(args); +} + +/* ============== BwdData Implementation ============== */ + +std::vector BwdData::get_all_algorithms( + const TensorLayout& /* im */, const TensorLayout& /* filter */, + const TensorLayout& /* offset */, const TensorLayout& /* mask */, const TensorLayout& /* out_grad */, + const TensorLayout& /* im_grad */, const TensorLayout& /* offset_grad */, const TensorLayout& /* mask_grad */) { + std::vector algos; + for (auto i : sm_algo_pack.all_algos) + algos.push_back(static_cast(i)); + return algos; +} + +AlgoBwdData* BwdData::get_algorithm_heuristic( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad, + size_t workspace_limit_in_bytes, bool reproducible) { + auto fm = make_canonized_filter_meta(im.ndim, filter, offset); + return get_algorithm_heuristic(im, fm, offset, mask, out_grad, im_grad, + offset_grad, mask_grad, + workspace_limit_in_bytes, reproducible); +} + +AlgoBwdData* BwdData::get_algorithm_heuristic( + const TensorLayout& im, const CanonizedFilterMeta& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad, + size_t workspace_limit_in_bytes, bool reproducible) { + AlgoBase::SizeArgs args(this, im, filter, offset, mask, out_grad, im_grad, + offset_grad, mask_grad); + if (sm_algo_pack.algo_matmul.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.algo_matmul; + } + megdnn_throw(megdnn_mangle(ssprintf( + "no %s deformable conv bwd data algorithm with args(%s) and " + "workspace limit (%zu bytes)", + reproducible ? 
"reproducible" : "usable", args.to_string().c_str(), + workspace_limit_in_bytes))); +} + +size_t BwdData::get_workspace_in_bytes( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad) { + AlgoBase::SizeArgs args(); + auto algo = get_algorithm(this, im, filter, offset, mask, out_grad, + im_grad, offset_grad, mask_grad); + return algo->get_workspace_in_bytes({this, im, filter, offset, mask, out_grad, + im_grad, offset_grad, mask_grad}); +} + +const char* BwdData::get_algorithm_set_name() const { + return "DEFORMABLE_CONV2_BWD_DATA_CUDA"; +}; + +void BwdData::exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, + _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_in out_grad, _megdnn_tensor_out im_grad, + _megdnn_tensor_out offset_grad, _megdnn_tensor_out mask_grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, im, filter, offset, mask, out_grad, im_grad, + offset_grad, mask_grad, workspace); + auto algo = get_algorithm(this, im.layout, filter.layout, offset.layout, + mask.layout, out_grad.layout, im_grad.layout, + offset_grad.layout, mask_grad.layout); + algo->check_workspace(args, workspace).exec(args); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_conv/opr_impl.h b/dnn/src/cuda/deformable_conv/opr_impl.h new file mode 100644 index 00000000..3a6ec138 --- /dev/null +++ b/dnn/src/cuda/deformable_conv/opr_impl.h @@ -0,0 +1,163 @@ +/** + * \file dnn/src/cuda/deformable_conv/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "megdnn/oprs/nn.h" + +namespace megdnn { +namespace cuda { + +class DeformableConvForwardImpl : public DeformableConvForward { +public: + using DeformableConvForward::DeformableConvForward; + + void exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, + _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_out dst, _megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes(const TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst) override; + + std::vector get_all_algorithms( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& dst) override; + + Algorithm* get_algorithm_heuristic(const TensorLayout& im, + const TensorLayout& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) override; + + Algorithm* get_algorithm_heuristic(const TensorLayout& im, + const CanonizedFilterMeta& filter, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible); + + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoMatmul; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +class DeformableConvBackwardFilterImpl: public DeformableConvBackwardFilter { +public: + using DeformableConvBackwardFilter::DeformableConvBackwardFilter; + + void exec(_megdnn_tensor_in im,_megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_in out_grad, _megdnn_tensor_out filter_grad, + _megdnn_workspace workspace) override; + + std::vector get_all_algorithms( + const TensorLayout& im, const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& filter_grad) override; + + Algorithm* get_algorithm_heuristic(const TensorLayout& im, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& out_grad, + const TensorLayout& filter_grad, + size_t workspace_limit_in_bytes, + bool reproducible) override; + + Algorithm* get_algorithm_heuristic(const TensorLayout& im, + const TensorLayout& offset, + const TensorLayout& mask, + const TensorLayout& out_grad, + const CanonizedFilterMeta& filter_grad, + size_t workspace_limit_in_bytes, + bool reproducible); + + size_t get_workspace_in_bytes( + const TensorLayout& im, const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& filter_grad) override; + + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoMatmul; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +class DeformableConvBackwardDataImpl : public DeformableConvBackwardData { +public: + using DeformableConvBackwardData::DeformableConvBackwardData; + + void exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, + _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_in out_grad, _megdnn_tensor_out im_grad, + _megdnn_tensor_out offset_grad, _megdnn_tensor_out mask_grad, + _megdnn_workspace workspace) override; + + std::vector get_all_algorithms( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const 
TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad) override; + + Algorithm* get_algorithm_heuristic( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad, + size_t workspace_limit_in_bytes, bool reproducible) override; + + Algorithm* get_algorithm_heuristic( + const TensorLayout& im, const CanonizedFilterMeta& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad, + size_t workspace_limit_in_bytes, bool reproducible); + + size_t get_workspace_in_bytes( + const TensorLayout& im, const TensorLayout& filter, + const TensorLayout& offset, const TensorLayout& mask, + const TensorLayout& out_grad, const TensorLayout& im_grad, + const TensorLayout& offset_grad, const TensorLayout& mask_grad) override; + + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoMatmul; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu b/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu new file mode 100644 index 00000000..a7877697 --- /dev/null +++ b/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu @@ -0,0 +1,311 @@ +/** + * \file dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
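+ *
+ * CUDA kernels for deformable position-sensitive ROI pooling. The forward
+ * kernel averages bilinearly sampled feature values over each pooled bin,
+ * optionally shifting the bin by learned offsets scaled by trans_std, and
+ * records the sample count in out_count; the backward kernel uses out_count
+ * to normalize the incoming gradient and scatters it to the feature map and
+ * the offsets with atomicAdd.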
+ */ + +#include "src/cuda/deformable_ps_roi_pooling/kimpl/kern.cuh" +#include "src/cuda/query_blocksize.cuh" + +namespace { + +using Param = megdnn::cuda::deformable_ps_roi_pooling::Param; + +__device__ float bilinear_interp(const float* data, const int IH, const int IW, + const float h, const float w) { + int h1 = floor(h), h2 = ceil(h); + int w1 = floor(w), w2 = ceil(w); + float dist_h = (float)(h - h1); + float dist_w = (float)(w - w1); + float value11 = data[h1 * IW + w1]; + float value12 = data[h2 * IW + w1]; + float value21 = data[h1 * IW + w2]; + float value22 = data[h2 * IW + w2]; + float value = (1 - dist_w) * (1 - dist_h) * value11 + + (1 - dist_w) * dist_h * value12 + + dist_w * (1 - dist_h) * value21 + dist_w * dist_h * value22; + return value; +} + +__global__ void DeformablePSROIPoolForwardKern(Param p, const float* data, + const float* rois, + const float* trans, + float* out_data, + float* out_count) { + const int loops = p.nr_bbox * p.IC * p.pool_h * p.pool_w; + const int icpcls = p.IC / p.nr_cls; + + KERN_FOR(idx, loops) { + const int pw = idx % p.pool_w; + const int ph = (idx / p.pool_w) % p.pool_h; + const int ic = (idx / p.pool_w / p.pool_h) % p.IC; + const int n = (idx / p.pool_w / p.pool_h / p.IC); + const float* rois_ptr = &rois[n * 5]; + + int roi_batch_idx = rois_ptr[0]; + + float roi_w_l = static_cast(round(rois_ptr[1])) * p.scale - 0.5; + float roi_h_l = static_cast(round(rois_ptr[2])) * p.scale - 0.5; + float roi_w_r = + static_cast(round(rois_ptr[3]) + 1.) * p.scale - 0.5; + float roi_h_r = + static_cast(round(rois_ptr[4]) + 1.) * p.scale - 0.5; + + // Force too small ROIs to be 1x1 + float roi_w = max(roi_w_r - roi_w_l, 0.1); // avoid 0 + float roi_h = max(roi_h_r - roi_h_l, 0.1); + + // Compute w and h at bottom + float bin_sz_h = roi_h / static_cast(p.pool_h); + float bin_sz_w = roi_w / static_cast(p.pool_w); + + float sub_bin_sz_h = bin_sz_h / static_cast(p.sample_per_part); + float sub_bin_sz_w = bin_sz_w / static_cast(p.sample_per_part); + + int count = 0; + int cls_id = ic / icpcls; + float sum = 0, trans_x = 0, trans_y = 0; + float hstart = static_cast(ph) * bin_sz_h + roi_h_l; + float wstart = static_cast(pw) * bin_sz_w + roi_w_l; + + if (!p.no_trans) { + int part_h = floor(static_cast(ph) / p.pool_h * p.part_sz); + int part_w = floor(static_cast(pw) / p.pool_w * p.part_sz); + int x_idx = (((n * p.nr_cls + cls_id) * 2) * p.part_sz + part_h) * + p.part_sz + + part_w; + int y_idx = + (((n * p.nr_cls + cls_id) * 2 + 1) * p.part_sz + part_h) * + p.part_sz + + part_w; + trans_x = trans[x_idx] * static_cast(p.trans_std); + trans_y = trans[y_idx] * static_cast(p.trans_std); + } + + wstart += trans_x * roi_w; + hstart += trans_y * roi_h; + + const float* data_ptr = + data + (roi_batch_idx * p.IC + ic) * p.IH * p.IW; + + for (int ih = 0; ih < p.sample_per_part; ih++) { + for (int iw = 0; iw < p.sample_per_part; iw++) { + float w = wstart + iw * sub_bin_sz_w; + float h = hstart + ih * sub_bin_sz_h; + // bilinear interpolation + if (w < -0.5 || w > p.IW - 0.5 || h < -0.5 || h > p.IH - 0.5) + continue; + w = min(max(w, 0.), p.IW - 1.); + h = min(max(h, 0.), p.IH - 1.); + float val = bilinear_interp(data_ptr, p.IH, p.IW, h, w); + sum += val, count++; + } + } + out_data[idx] = count == 0 ? 
(float)(0) : sum / count; + out_count[idx] = count; + } +} + +__global__ void DeformablePSROIPoolBackwardAccKern( + Param p, const float* data, const float* rois, const float* trans, + const float* out_diff, const float* out_count, float* data_diff, + float* trans_diff) { + const int loops = p.nr_bbox * p.IC * p.pool_h * p.pool_w; + const int icpcls = p.IC / p.nr_cls; + + KERN_FOR(idx, loops) { + const int pw = idx % p.pool_w; + const int ph = (idx / p.pool_w) % p.pool_h; + const int ic = (idx / p.pool_w / p.pool_h) % p.IC; + const int n = (idx / p.pool_w / p.pool_h / p.IC); + + const float* rois_ptr = &rois[n * 5]; + + int roi_batch_idx = rois_ptr[0]; + + float roi_w_l = static_cast(round(rois_ptr[1])) * p.scale - 0.5; + float roi_h_l = static_cast(round(rois_ptr[2])) * p.scale - 0.5; + float roi_w_r = + static_cast(round(rois_ptr[3]) + 1.) * p.scale - 0.5; + float roi_h_r = + static_cast(round(rois_ptr[4]) + 1.) * p.scale - 0.5; + + // Force too small ROIs to be 1x1 + float roi_w = max(roi_w_r - roi_w_l, 0.1); // avoid 0 + float roi_h = max(roi_h_r - roi_h_l, 0.1); + + // Compute w and h at bottom + float bin_sz_h = roi_h / static_cast(p.pool_h); + float bin_sz_w = roi_w / static_cast(p.pool_w); + + float sub_bin_sz_h = bin_sz_h / static_cast(p.sample_per_part); + float sub_bin_sz_w = bin_sz_w / static_cast(p.sample_per_part); + + int part_h = 0, part_w = 0, cls_id = ic / icpcls; + float trans_x = 0, trans_y = 0; + float wstart = static_cast(pw) * bin_sz_w + roi_w_l; + float hstart = static_cast(ph) * bin_sz_h + roi_h_l; + + if (!p.no_trans) { + part_h = floor(static_cast(ph) / p.pool_h * p.part_sz); + part_w = floor(static_cast(pw) / p.pool_w * p.part_sz); + int x_idx = (((n * p.nr_cls + cls_id) * 2) * p.part_sz + part_h) * + p.part_sz + + part_w; + int y_idx = + (((n * p.nr_cls + cls_id) * 2 + 1) * p.part_sz + part_h) * + p.part_sz + + part_w; + trans_x = trans[x_idx] * static_cast(p.trans_std); + trans_y = trans[y_idx] * static_cast(p.trans_std); + } + + wstart += trans_x * roi_w; + hstart += trans_y * roi_h; + + if (out_count[idx] <= 0) + continue; + + float diff_val = out_diff[idx] / out_count[idx]; + + const int data_idx = (roi_batch_idx * p.IC + ic) * p.IH * p.IW; + + float* data_diff_ptr; + const float* data_ptr; + + for (int ih = 0; ih < p.sample_per_part; ih++) { + for (int iw = 0; iw < p.sample_per_part; iw++) { + float w = wstart + iw * sub_bin_sz_w; + float h = hstart + ih * sub_bin_sz_h; + // bilinear interpolation + if (w < -0.5 || w > p.IW - 0.5 || h < -0.5 || h > p.IH - 0.5) + continue; + w = min(max(w, 0.), p.IW - 1.), h = min(max(h, 0.), p.IH - 1.); + // backward on feature + int x0 = floor(w), x1 = ceil(w); + int y0 = floor(h), y1 = ceil(h); + float dist_x = w - x0, dist_y = h - y0; + float q00 = (1 - dist_x) * (1 - dist_y); + float q01 = (1 - dist_x) * dist_y; + float q10 = dist_x * (1 - dist_y); + float q11 = dist_x * dist_y; + + data_diff_ptr = &data_diff[data_idx]; + + atomicAdd(&data_diff_ptr[y0 * p.IW + x0], q00 * diff_val); + atomicAdd(&data_diff_ptr[y1 * p.IW + x0], q01 * diff_val); + atomicAdd(&data_diff_ptr[y0 * p.IW + x1], q10 * diff_val); + atomicAdd(&data_diff_ptr[y1 * p.IW + x1], q11 * diff_val); + + if (p.no_trans) + continue; + + data_ptr = &data[data_idx]; + + float U00 = data_ptr[y0 * p.IW + x0]; + float U01 = data_ptr[y1 * p.IW + x0]; + float U10 = data_ptr[y0 * p.IW + x1]; + float U11 = data_ptr[y1 * p.IW + x1]; + + float diff_x = (U11 * dist_y + U10 * (1 - dist_y) - + U01 * dist_y - U00 * (1 - dist_y)) * + p.trans_std * diff_val; + float diff_y = 
(U11 * dist_x + U01 * (1 - dist_x) - + U10 * dist_x - U00 * (1 - dist_x)) * + p.trans_std * diff_val; + + diff_x *= roi_w, diff_y *= roi_h; + + int diff_x_idx = + (((n * p.nr_cls + cls_id) * 2) * p.part_sz + part_h) * + p.part_sz + + part_w; + int diff_y_idx = + (((n * p.nr_cls + cls_id) * 2 + 1) * p.part_sz + + part_h) * + p.part_sz + + part_w; + + atomicAdd(&trans_diff[diff_x_idx], diff_x); + atomicAdd(&trans_diff[diff_y_idx], diff_y); + } + } + } +} +} // namespace + +namespace megdnn { +namespace cuda { +namespace deformable_ps_roi_pooling { + +void DeformablePSROIPoolForward(const TensorND& data, const TensorND& rois, + const TensorND& trans, const TensorND& out_data, + const TensorND& out_count, Param& p) { + const int loops = p.nr_bbox * p.IC * p.pool_h * p.pool_w; + int nr_thds = query_blocksize_for_kernel(DeformablePSROIPoolForwardKern); + const int blks = DIVUP(loops, nr_thds); + + const float* data_ptr = data.ptr(); + const float* rois_ptr = rois.ptr(); + const float* trans_ptr = p.no_trans ? NULL : trans.ptr(); + + float* out_data_ptr = out_data.ptr(); + float* out_count_ptr = out_count.ptr(); + + auto&& out_data_elems = out_data.layout.total_nr_elems(); + auto&& out_count_elems = out_count.layout.total_nr_elems(); + size_t out_data_bytes = sizeof(float[out_data_elems]); + size_t out_count_bytes = sizeof(float[out_count_elems]); + + cudaMemsetAsync(out_data_ptr, 0, out_data_bytes, p.stream); + cudaMemsetAsync(out_count_ptr, 0, out_count_bytes, p.stream); + + DeformablePSROIPoolForwardKern<<>>( + p, data_ptr, rois_ptr, trans_ptr, out_data_ptr, out_count_ptr); + after_kernel_launch(); +} + +void DeformablePSROIPoolBackwardAcc(const TensorND& data, const TensorND& rois, + const TensorND& trans, + const TensorND& out_diff, + const TensorND& out_count, + const TensorND& data_diff, + const TensorND& trans_diff, Param& p) { + const int loops = p.nr_bbox * p.IC * p.pool_h * p.pool_w; + int nr_thds = + query_blocksize_for_kernel(DeformablePSROIPoolBackwardAccKern); + const int blks = DIVUP(loops, nr_thds); + + const float* data_ptr = data.ptr(); + const float* rois_ptr = rois.ptr(); + const float* trans_ptr = p.no_trans ? NULL : trans.ptr(); + const float* out_diff_ptr = out_diff.ptr(); + const float* out_count_ptr = out_count.ptr(); + + float* data_diff_ptr = data_diff.ptr(); + float* trans_diff_ptr = trans_diff.ptr(); + + auto&& data_diff_elems = data_diff.layout.total_nr_elems(); + auto&& trans_diff_elems = trans_diff.layout.total_nr_elems(); + size_t data_diff_bytes = sizeof(float[data_diff_elems]); + size_t trans_diff_bytes = sizeof(float[trans_diff_elems]); + + cudaMemsetAsync(data_diff_ptr, 0, data_diff_bytes, p.stream); + cudaMemsetAsync(trans_diff_ptr, 0, trans_diff_bytes, p.stream); + + DeformablePSROIPoolBackwardAccKern<<>>( + p, data_ptr, rois_ptr, trans_ptr, out_diff_ptr, out_count_ptr, + data_diff_ptr, trans_diff_ptr); + after_kernel_launch(); +} + +} // namespace deformable_ps_roi_pooling +} // namespace cuda +} // namespace megdnn + +// vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cuh b/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cuh new file mode 100644 index 00000000..c5a5e09c --- /dev/null +++ b/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cuh @@ -0,0 +1,49 @@ +/** + * \file dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/basic_types.h" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace deformable_ps_roi_pooling { + +struct Param { + bool no_trans; + int IC; + int IH; + int IW; + int nr_cls, nr_bbox; + int pool_h, pool_w; + int part_sz, sample_per_part; + float scale; + float trans_std; + cudaStream_t stream; +}; + +void DeformablePSROIPoolForward(const TensorND& data, const TensorND& rois, + const TensorND& trans, const TensorND& out_data, + const TensorND& out_count, Param& p); + +void DeformablePSROIPoolBackwardAcc(const TensorND& data, const TensorND& rois, + const TensorND& trans, + const TensorND& out_diff, + const TensorND& out_count, + const TensorND& data_diff, + const TensorND& trans_diff, Param& p); + +} // namespace deformable_ps_roi_pooling +} // namespace cuda +} // namespace megdnn + +// vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.cpp b/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.cpp new file mode 100644 index 00000000..526ebaa6 --- /dev/null +++ b/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.cpp @@ -0,0 +1,81 @@ +/** + * \file dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/deformable_ps_roi_pooling/kimpl/kern.cuh" +#include "src/cuda/deformable_ps_roi_pooling/opr_impl.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using KernParam = deformable_ps_roi_pooling::Param; + +namespace { + +void create_param(const DeformablePSROIPoolingBase* opr, + const TensorLayout& data, const TensorLayout& rois, + const TensorLayout& trans, KernParam& p) { + auto&& param = opr->param(); + auto&& handle = concrete_handle(opr->handle()); + + p.stream = handle->stream(); + p.no_trans = param.no_trans; + p.pool_h = param.pooled_h; + p.pool_w = param.pooled_w; + p.part_sz = param.part_size; + p.sample_per_part = param.sample_per_part; + p.trans_std = param.trans_std; + p.scale = param.spatial_scale; + p.nr_cls = p.no_trans ? 
1 : trans[0]; + p.nr_bbox = rois[0]; + p.IC = data[1]; + p.IH = data[2]; + p.IW = data[3]; +} + +} // namespace + +namespace megdnn { +namespace cuda { + +void DeformablePSROIPoolingForwardImpl::exec(_megdnn_tensor_in data, + _megdnn_tensor_in rois, + _megdnn_tensor_in trans, + _megdnn_tensor_out out_data, + _megdnn_tensor_out out_count, + _megdnn_workspace workspace) { + KernParam p; + + check_exec(data.layout, rois.layout, trans.layout, out_data.layout, + out_count.layout, workspace.size); + + create_param(this, data.layout, rois.layout, trans.layout, p); + deformable_ps_roi_pooling::DeformablePSROIPoolForward( + data, rois, trans, out_data, out_count, p); +} + +void DeformablePSROIPoolingBackwardImpl::exec( + _megdnn_tensor_in data, _megdnn_tensor_in rois, _megdnn_tensor_in trans, + _megdnn_tensor_in out_diff, _megdnn_tensor_in out_count, + _megdnn_tensor_out data_diff, _megdnn_tensor_out trans_diff, + _megdnn_workspace workspace) { + KernParam p; + + check_exec(data.layout, rois.layout, trans.layout, out_diff.layout, + out_count.layout, data_diff.layout, trans_diff.layout, + workspace.size); + create_param(this, data.layout, rois.layout, trans.layout, p); + deformable_ps_roi_pooling::DeformablePSROIPoolBackwardAcc( + data, rois, trans, out_diff, out_count, data_diff, trans_diff, p); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.h b/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.h new file mode 100644 index 00000000..245604de --- /dev/null +++ b/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.h @@ -0,0 +1,60 @@ +/** + * \file dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
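+ *
+ * Thin operator wrappers around the kernels in kimpl/kern.cu; neither the
+ * forward nor the backward implementation needs extra workspace, so
+ * get_workspace_in_bytes() returns 0.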
+ */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class DeformablePSROIPoolingForwardImpl final + : public DeformablePSROIPoolingForward { +public: + using DeformablePSROIPoolingForward::DeformablePSROIPoolingForward; + + size_t get_workspace_in_bytes( + const TensorLayout& /* data */, const TensorLayout& /* rois */, + const TensorLayout& /* trans */, const TensorLayout& /* out_data */, + const TensorLayout& /* out_count */) override { + return 0ULL; + }; + + void exec(_megdnn_tensor_in data, _megdnn_tensor_in rois, + _megdnn_tensor_in trans, _megdnn_tensor_out out_data, + _megdnn_tensor_out out_count, + _megdnn_workspace workspace) override; +}; + +class DeformablePSROIPoolingBackwardImpl final + : public DeformablePSROIPoolingBackward { +public: + using DeformablePSROIPoolingBackward::DeformablePSROIPoolingBackward; + + size_t get_workspace_in_bytes(const TensorLayout& /* data */, + const TensorLayout& /* rois */, + const TensorLayout& /* trans */, + const TensorLayout& /* out_diff */, + const TensorLayout& /* out_count */, + const TensorLayout& /* data_diff */, + const TensorLayout& /* trans_diff */) { + return 0ULL; + }; + + void exec(_megdnn_tensor_in data, _megdnn_tensor_in rois, + _megdnn_tensor_in trans, _megdnn_tensor_in out_diff, + _megdnn_tensor_in out_count, _megdnn_tensor_out data_diff, + _megdnn_tensor_out trans_diff, + _megdnn_workspace workspace) override; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/dot/dot.cu b/dnn/src/cuda/dot/dot.cu new file mode 100644 index 00000000..91bb00ae --- /dev/null +++ b/dnn/src/cuda/dot/dot.cu @@ -0,0 +1,90 @@ +/** + * \file dnn/src/cuda/dot/dot.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/dot/dot.cuh" + +#include "src/cuda/utils.cuh" +#include "src/cuda/cub/util_ptx.cuh" + +namespace { + +using namespace megdnn; + +template __global__ void kernel(const T *a, const T *b, + dt_float32 *c, + uint32_t n, int32_t strideA, int32_t strideB) +{ + uint32_t tid = threadIdx.x; + uint32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + volatile __shared__ dt_float32 sdata[256]; + sdata[tid] = (gid < n ? 
+ dt_float32(a[gid*strideA]) * dt_float32(b[gid*strideB]) + : 0); + __syncthreads(); + if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); + if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); + if (tid < 32) { + sdata[tid] += sdata[tid + 32]; + cub::WARP_SYNC(0xffffffff); + if (tid < 16) + sdata[tid] += sdata[tid + 16]; + cub::WARP_SYNC(0xffffffff); + if (tid < 8) + sdata[tid] += sdata[tid + 8]; + cub::WARP_SYNC(0xffffffff); + if (tid < 4) + sdata[tid] += sdata[tid + 4]; + cub::WARP_SYNC(0xffffffff); + if (tid < 2) + sdata[tid] += sdata[tid + 2]; + cub::WARP_SYNC(0xffffffff); + if (tid < 1) + sdata[tid] += sdata[tid + 1]; + } + if (tid == 0) + atomicAdd(c, sdata[0]); +} + +template __global__ void cvt_kernel(const dt_float32 *src, T *dst) +{ + dst[0] = T(src[0]); +} + +} // anonymous namespace + +namespace megdnn { +namespace cuda { +namespace dot { + +template void run(const T *a, const T *b, T *c, float *workspace, + uint32_t n, int32_t strideA, int32_t strideB, + cudaStream_t stream) +{ + cuda_check(cudaMemsetAsync(workspace, 0, sizeof(dt_float32), stream)); + // each block add 256 entries + uint32_t blocks = DIVUP(n, 256); + uint32_t threads = 256; + kernel<<>>(a, b, + workspace, + n, strideA, strideB); + cvt_kernel<<<1, 1, 0, stream>>>(workspace, c); + after_kernel_launch(); +} + +template void run(const dt_float16 *a, const dt_float16 *b, + dt_float16 *c, dt_float32 *workspace, + uint32_t n, int32_t strideA, int32_t strideB, + cudaStream_t stream); + +} // namespace dot +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/dot/dot.cuh b/dnn/src/cuda/dot/dot.cuh new file mode 100644 index 00000000..2aec60c8 --- /dev/null +++ b/dnn/src/cuda/dot/dot.cuh @@ -0,0 +1,28 @@ +/** + * \file dnn/src/cuda/dot/dot.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/dtype.h" + +namespace megdnn { +namespace cuda { +namespace dot { + +template void run(const T *a, const T *b, T *c, + float *workspace, + uint32_t n, + int32_t strideA, int32_t strideB, + cudaStream_t stream); + +} // namespace dot +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/dot/opr_impl.cpp b/dnn/src/cuda/dot/opr_impl.cpp new file mode 100644 index 00000000..aed291fa --- /dev/null +++ b/dnn/src/cuda/dot/opr_impl.cpp @@ -0,0 +1,46 @@ +/** + * \file dnn/src/cuda/dot/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
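+ *
+ * DotForwardImpl dispatches on dtype: Float32 inputs go straight to
+ * cublasSdot, while Float16 inputs use the block-reduction kernel in dot.cu,
+ * which accumulates partial sums into a float32 workspace before converting
+ * the result back to Float16.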
+ */ +#include "src/cuda/dot/opr_impl.h" + +#include "src/cuda/utils.h" +#include "src/cuda/dot/dot.cuh" + +namespace megdnn { +namespace cuda { + +void DotForwardImpl::exec(_megdnn_tensor_in A, + _megdnn_tensor_in B, + _megdnn_tensor_out C, + _megdnn_workspace workspace) +{ + check_exec(A.layout, B.layout, C.layout, workspace.size); + megdnn_assert(A.layout.dtype.category() == DTypeCategory::FLOAT); + auto handle = cublas_handle(this->handle()); + if (A.layout.dtype == dtype::Float32()) { + cublas_check(cublasSdot(handle, A.layout.total_nr_elems(), + A.ptr(), A.layout.stride[0], + B.ptr(), B.layout.stride[0], + C.ptr())); + } else { + megdnn_assert_internal(A.layout.dtype == dtype::Float16()); + dot::run(A.ptr(), + B.ptr(), + C.ptr(), + workspace.ptr(), + A.layout.total_nr_elems(), + A.layout.stride[0], B.layout.stride[0], + cuda_stream(this->handle())); + } +} + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/dot/opr_impl.h b/dnn/src/cuda/dot/opr_impl.h new file mode 100644 index 00000000..0e3a0146 --- /dev/null +++ b/dnn/src/cuda/dot/opr_impl.h @@ -0,0 +1,36 @@ +/** + * \file dnn/src/cuda/dot/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +#include "src/cuda/cudnn_wrapper.h" + +namespace megdnn { +namespace cuda { + +class DotForwardImpl final: public DotForward { + public: + using DotForward::DotForward; + void exec(_megdnn_tensor_in A, + _megdnn_tensor_in B, + _megdnn_tensor_out C, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &, + const TensorLayout &) override { + return sizeof(float); + } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/elemwise/kern_impl.inl b/dnn/src/cuda/elemwise/kern_impl.inl new file mode 100644 index 00000000..fc7a81cd --- /dev/null +++ b/dnn/src/cuda/elemwise/kern_impl.inl @@ -0,0 +1,37 @@ +/** + * \file dnn/src/cuda/elemwise/kern_impl.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
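+ *
+ * Instantiation helper for the generated elementwise kernels: every generated
+ * .cu file under kimpl/ defines KERN_IMPL_MODE, KERN_IMPL_ARITY and
+ * KERN_IMPL_CTYPE and then includes this file, for example:
+ *
+ *     #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb)
+ *     #define KERN_IMPL_ARITY 2
+ *     #define KERN_IMPL_CTYPE dt_float32
+ *     #include "../kern_impl.inl"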
+ */ + +#pragma once + +#ifndef KERN_IMPL_MODE +#error "KERN_IMPL_MODE, KERN_IMPL_ARITY and KERN_IMPL_CTYPE must be defined" +#endif + +#include "./kern_wrapper.cuh" + +namespace megdnn { +namespace cuda { + +#define cb(_mode) \ + typedef ElemwiseKern< \ + megcorePlatformCUDA, \ + param_enumv::Elemwise::Mode::_mode, KERN_IMPL_CTYPE> \ + KernImpl##_mode; \ + typedef ElemArithKernWrapper \ + Wrapper##_mode; \ + INST_RUN_ELEMWISE(Wrapper##_mode, KERN_IMPL_CTYPE, KERN_IMPL_ARITY); \ + +KERN_IMPL_MODE(cb) + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/elemwise/kern_wrapper.cuh b/dnn/src/cuda/elemwise/kern_wrapper.cuh new file mode 100644 index 00000000..5f666ffc --- /dev/null +++ b/dnn/src/cuda/elemwise/kern_wrapper.cuh @@ -0,0 +1,155 @@ +/** + * \file dnn/src/cuda/elemwise/kern_wrapper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/common/elemwise/kern_defs.cuh" +#include "src/cuda/elemwise_helper.cuh" + +namespace megdnn { +namespace cuda { + + template + struct ElemArithKernWrapper; + + template + struct ElemArithKernWrapper< + 1, KernImpl, + typename std::enable_if< + !std::is_same::value && + !std::is_same::value>::type> { + typedef typename KernImpl::ctype ctype; + ctype* dst; + +#if MEGDNN_CC_CUDA + __device__ void operator()(uint32_t idx, ctype x) { + dst[idx] = KernImpl::apply(x); + } +#endif + }; + template + struct ElemArithKernWrapper< + 2, KernImpl, + typename std::enable_if< + !std::is_same::value && + !std::is_same::value>::type> { + typedef typename KernImpl::ctype ctype; + ctype* dst; + +#if MEGDNN_CC_CUDA + __device__ void operator()(uint32_t idx, ctype x, ctype y) { + dst[idx] = KernImpl::apply(x, y); + } +#endif + }; + template + struct ElemArithKernWrapper< + 3, KernImpl, + typename std::enable_if< + !std::is_same::value && + !std::is_same::value>::type> { + typedef typename KernImpl::ctype ctype; + ctype* dst; + +#if MEGDNN_CC_CUDA + __device__ void operator()(uint32_t idx, ctype x, ctype y, ctype z) { + dst[idx] = KernImpl::apply(x, y, z); + } +#endif + }; + + template + struct ElemArithKernWrapper< + 1, KernImpl, + typename std::enable_if< + std::is_same::value || + std::is_same::value>::type> { + typedef typename KernImpl::ctype ctype; + using VectTypeTrait = elemwise_intl::VectTypeTrait; + typedef typename VectTypeTrait::vect_type vect_type; + ctype* dst; +#if MEGDNN_CC_CUDA + __device__ __forceinline__ void operator()(uint32_t idx, ctype x) { + dst[idx] = KernImpl::apply(x); + } + __device__ __forceinline__ void operator()(uint32_t idx, vect_type x) { + ctype a = KernImpl::apply(x.x); + ctype b = KernImpl::apply(x.y); + ctype g = KernImpl::apply(x.z); + ctype r = KernImpl::apply(x.w); + *(vect_type*)(&dst[idx]) = VectTypeTrait::make_vector(a, b, g, r); + } +#endif + }; + + template + struct ElemArithKernWrapper< + 2, KernImpl, + typename std::enable_if< + std::is_same::value || + std::is_same::value>::type> { + typedef typename KernImpl::ctype ctype; + using VectTypeTrait = elemwise_intl::VectTypeTrait; + typedef typename VectTypeTrait::vect_type vect_type; + ctype* dst; +#if MEGDNN_CC_CUDA + __device__ __forceinline__ void operator()(uint32_t idx, ctype x, + 
ctype y) { + dst[idx] = KernImpl::apply(x, y); + } + __device__ __forceinline__ void operator()(uint32_t idx, vect_type x, + vect_type y) { + ctype a = KernImpl::apply(x.x, y.x); + ctype b = KernImpl::apply(x.y, y.y); + ctype g = KernImpl::apply(x.z, y.z); + ctype r = KernImpl::apply(x.w, y.w); + *(vect_type*)(&dst[idx]) = VectTypeTrait::make_vector(a, b, g, r); + } +#endif + }; + + template + struct ElemArithKernWrapper< + 3, KernImpl, + typename std::enable_if< + std::is_same::value || + std::is_same::value>::type> { + typedef typename KernImpl::ctype ctype; + using VectTypeTrait = elemwise_intl::VectTypeTrait; + typedef typename VectTypeTrait::vect_type vect_type; + ctype* dst; +#if MEGDNN_CC_CUDA + __device__ __forceinline__ void operator()(uint32_t idx, ctype x, + ctype y, ctype z) { + dst[idx] = KernImpl::apply(x, y, z); + } + __device__ __forceinline__ void operator()(uint32_t idx, vect_type x, + vect_type y, vect_type z) { + ctype a = KernImpl::apply(x.x, y.x, z.x); + ctype b = KernImpl::apply(x.y, y.y, z.y); + ctype g = KernImpl::apply(x.z, y.z, z.z); + ctype r = KernImpl::apply(x.w, y.w, z.w); + *(vect_type*)(&dst[idx]) = VectTypeTrait::make_vector(a, b, g, r); + } +#endif + }; + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_float16.cu new file mode 100644 index 00000000..0a513760 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_float32.cu new file mode 100644 index 00000000..7db553ea --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int16.cu new file mode 100644 index 00000000..0e60b504 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int32.cu new file mode 100644 index 00000000..40ccff8b --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int8.cu new file mode 100644 index 00000000..c93c0088 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_uint8.cu new file mode 100644 index 00000000..37fbdd33 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_GRAD_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ABS_dt_float16.cu new file mode 100644 index 00000000..b9a9f047 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ABS_dt_float32.cu new file mode 100644 index 00000000..85be9fd7 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/ABS_dt_int16.cu new file mode 100644 index 00000000..a0eae25f --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/ABS_dt_int32.cu new file mode 100644 index 00000000..460e9e1c --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/ABS_dt_int8.cu new file mode 100644 index 00000000..0d3027db --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ABS_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/ABS_dt_uint8.cu new file mode 100644 index 00000000..ed0f31e0 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ABS_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ABS_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ACOS_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ACOS_dt_float16.cu new file mode 100644 index 00000000..6d818855 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ACOS_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ACOS_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ACOS_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ACOS_dt_float32.cu new file mode 100644 index 00000000..ac4e6680 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ACOS_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ACOS_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ADD_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ADD_dt_float16.cu new file mode 100644 index 00000000..e87dc72a --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ADD_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ADD_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ADD_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ADD_dt_float32.cu new file mode 100644 index 00000000..90754ef2 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ADD_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ADD_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ADD_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/ADD_dt_int16.cu new file mode 100644 index 00000000..3e45b924 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ADD_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ADD_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ADD_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/ADD_dt_int32.cu new file mode 100644 index 00000000..1f5ac62a --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ADD_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ADD_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ADD_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/ADD_dt_int8.cu new file mode 100644 index 00000000..eb938d44 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ADD_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ADD_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ADD_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/ADD_dt_uint8.cu new file mode 100644 index 00000000..5b212f07 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ADD_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ADD_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ASIN_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ASIN_dt_float16.cu new file mode 100644 index 00000000..138560c8 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ASIN_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ASIN_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ASIN_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ASIN_dt_float32.cu new file mode 100644 index 00000000..3eadd37a --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ASIN_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ASIN_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ATAN2_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ATAN2_dt_float16.cu new file mode 100644 index 00000000..12cda851 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ATAN2_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ATAN2_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ATAN2_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ATAN2_dt_float32.cu new file mode 100644 index 00000000..df5056b3 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ATAN2_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ATAN2_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/CEIL_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/CEIL_dt_float16.cu new file mode 100644 index 00000000..ae7e31e7 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/CEIL_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/CEIL_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/CEIL_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/CEIL_dt_float32.cu new file mode 100644 index 00000000..b5230797 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/CEIL_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/CEIL_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_float16.cu new file mode 100644 index 00000000..a751ce77 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_float32.cu new file mode 100644 index 00000000..d07c0efd --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int16.cu new file mode 100644 index 00000000..8033dd16 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int32.cu new file mode 100644 index 00000000..4fc2812b --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int8.cu new file mode 100644 index 00000000..c60fec87 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_uint8.cu new file mode 100644 index 00000000..dd8a35f5 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/COND_LEQ_MOV_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/COS_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/COS_dt_float16.cu new file mode 100644 index 00000000..72f5d191 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/COS_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/COS_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/COS_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/COS_dt_float32.cu new file mode 100644 index 00000000..4fba9c4f --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/COS_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/COS_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/EQ_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/EQ_dt_float16.cu new file mode 100644 index 00000000..80937f98 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EQ_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EQ_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/EQ_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/EQ_dt_float32.cu new file mode 100644 index 00000000..63420f99 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EQ_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EQ_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/EQ_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/EQ_dt_int16.cu new file mode 100644 index 00000000..b4c60ed6 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EQ_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EQ_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/EQ_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/EQ_dt_int32.cu new file mode 100644 index 00000000..d8bc0868 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EQ_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EQ_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/EQ_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/EQ_dt_int8.cu new file mode 100644 index 00000000..d73fad6f --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EQ_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EQ_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/EQ_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/EQ_dt_uint8.cu new file mode 100644 index 00000000..22787aa4 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EQ_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EQ_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ERFCINV_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ERFCINV_dt_float16.cu new file mode 100644 index 00000000..30084e2a --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ERFCINV_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ERFCINV_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ERFCINV_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ERFCINV_dt_float32.cu new file mode 100644 index 00000000..a62d4d45 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ERFCINV_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ERFCINV_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ERFC_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ERFC_dt_float16.cu new file mode 100644 index 00000000..6c8caf11 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ERFC_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ERFC_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFC, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ERFC_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ERFC_dt_float32.cu new file mode 100644 index 00000000..a528f35e --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ERFC_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ERFC_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFC, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ERFINV_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ERFINV_dt_float16.cu new file mode 100644 index 00000000..31b8d032 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ERFINV_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ERFINV_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFINV, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ERFINV_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ERFINV_dt_float32.cu new file mode 100644 index 00000000..63d2fe41 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ERFINV_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ERFINV_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFINV, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ERF_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ERF_dt_float16.cu new file mode 100644 index 00000000..742a8d66 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ERF_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ERF_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERF, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ERF_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ERF_dt_float32.cu new file mode 100644 index 00000000..0ca29e2f --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ERF_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ERF_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERF, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/EXPM1_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/EXPM1_dt_float16.cu new file mode 100644 index 00000000..ee59a5c2 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EXPM1_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EXPM1_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXPM1, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/EXPM1_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/EXPM1_dt_float32.cu new file mode 100644 index 00000000..9873b079 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EXPM1_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EXPM1_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXPM1, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/EXP_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/EXP_dt_float16.cu new file mode 100644 index 00000000..1fa881f0 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EXP_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EXP_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXP, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/EXP_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/EXP_dt_float32.cu new file mode 100644 index 00000000..aef25c33 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/EXP_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/EXP_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXP, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FAST_TANH_GRAD_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/FAST_TANH_GRAD_dt_float16.cu new file mode 100644 index 00000000..e954efae --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FAST_TANH_GRAD_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FAST_TANH_GRAD_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/FAST_TANH_GRAD_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/FAST_TANH_GRAD_dt_float32.cu new file mode 100644 index 00000000..47fcea33 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FAST_TANH_GRAD_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FAST_TANH_GRAD_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FAST_TANH_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/FAST_TANH_dt_float16.cu new file mode 100644 index 00000000..9fa36c1a --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FAST_TANH_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FAST_TANH_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/FAST_TANH_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/FAST_TANH_dt_float32.cu new file mode 100644 index 00000000..427cd5e3 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FAST_TANH_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FAST_TANH_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_float16.cu new file mode 100644 index 00000000..1757c4b4 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_float32.cu new file mode 100644 index 00000000..ca911b11 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int16.cu new file mode 100644 index 00000000..db29f5c1 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int32.cu new file mode 100644 index 00000000..57551f97 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int8.cu new file mode 100644 index 00000000..c2751717 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_uint8.cu new file mode 100644 index 00000000..bfbafcca --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FLOOR_DIV_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FLOOR_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/FLOOR_dt_float16.cu new file mode 100644 index 00000000..fba0dc9c --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FLOOR_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FLOOR_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/FLOOR_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/FLOOR_dt_float32.cu new file mode 100644 index 00000000..9b7a85fd --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FLOOR_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FLOOR_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float16.cu new file mode 100644 index 00000000..8fc78520 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float32.cu new file mode 100644 index 00000000..f9181c03 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_float16.cu new file mode 100644 index 00000000..b3b0f2f3 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_float32.cu new file mode 100644 index 00000000..cbd42436 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int16.cu new file mode 100644 index 00000000..5847a6d4 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int32.cu new file mode 100644 index 00000000..7dfe0d66 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int8.cu new file mode 100644 index 00000000..4b2692a2 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_uint8.cu new file mode 100644 index 00000000..8e7dbc85 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_RELU_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float16.cu new file mode 100644 index 00000000..3947bb93 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float32.cu new file mode 100644 index 00000000..71f1f955 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_TANH_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_TANH_dt_float16.cu new file mode 100644 index 00000000..490654b5 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_TANH_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_TANH_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_TANH_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_TANH_dt_float32.cu new file mode 100644 index 00000000..6d4b9fa1 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_ADD_TANH_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_ADD_TANH_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_MUL_ADD3_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_MUL_ADD3_dt_float16.cu new file mode 100644 index 00000000..33a35082 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_MUL_ADD3_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_MUL_ADD3_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_MUL_ADD3, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/FUSE_MUL_ADD3_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/FUSE_MUL_ADD3_dt_float32.cu new file mode 100644 index 00000000..3d862e3d --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/FUSE_MUL_ADD3_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/FUSE_MUL_ADD3_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_MUL_ADD3, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/H_SWISH_GRAD_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/H_SWISH_GRAD_dt_float16.cu new file mode 100644 index 00000000..21b37e9a --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/H_SWISH_GRAD_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/H_SWISH_GRAD_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/H_SWISH_GRAD_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/H_SWISH_GRAD_dt_float32.cu new file mode 100644 index 00000000..545f094b --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/H_SWISH_GRAD_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/H_SWISH_GRAD_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/H_SWISH_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/H_SWISH_dt_float16.cu new file mode 100644 index 00000000..e1cbea2f --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/H_SWISH_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/H_SWISH_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/H_SWISH_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/H_SWISH_dt_float32.cu new file mode 100644 index 00000000..0bb014e0 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/H_SWISH_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/H_SWISH_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LEQ_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_float16.cu new file mode 100644 index 00000000..937fd9ad --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LEQ_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/LEQ_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_float32.cu new file mode 100644 index 00000000..10ea3610 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LEQ_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LEQ_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_int16.cu new file mode 100644 index 00000000..a716a2f5 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LEQ_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LEQ_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_int32.cu new file mode 100644 index 00000000..b4060e54 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LEQ_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LEQ_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_int8.cu new file mode 100644 index 00000000..f7214e71 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LEQ_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LEQ_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_uint8.cu new file mode 100644 index 00000000..3e4f3c6c --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LEQ_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LEQ_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LOG1P_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/LOG1P_dt_float16.cu new file mode 100644 index 00000000..2b899c34 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LOG1P_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LOG1P_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG1P, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/LOG1P_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/LOG1P_dt_float32.cu new file mode 100644 index 00000000..68daa6c9 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LOG1P_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LOG1P_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG1P, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LOG_SUM_EXP_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/LOG_SUM_EXP_dt_float16.cu new file mode 100644 index 00000000..85901882 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LOG_SUM_EXP_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LOG_SUM_EXP_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG_SUM_EXP, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/LOG_SUM_EXP_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/LOG_SUM_EXP_dt_float32.cu new file mode 100644 index 00000000..d350490d --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LOG_SUM_EXP_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LOG_SUM_EXP_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG_SUM_EXP, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LOG_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/LOG_dt_float16.cu new file mode 100644 index 00000000..9645202c --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LOG_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LOG_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/LOG_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/LOG_dt_float32.cu new file mode 100644 index 00000000..17f1d36d --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LOG_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LOG_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LT_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/LT_dt_float16.cu new file mode 100644 index 00000000..8f84ab33 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LT_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LT_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/LT_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/LT_dt_float32.cu new file mode 100644 index 00000000..84da58f5 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LT_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LT_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LT_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/LT_dt_int16.cu new file mode 100644 index 00000000..036b5884 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LT_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LT_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LT_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/LT_dt_int32.cu new file mode 100644 index 00000000..5e82e872 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LT_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LT_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LT_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/LT_dt_int8.cu new file mode 100644 index 00000000..92313c56 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LT_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LT_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/LT_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/LT_dt_uint8.cu new file mode 100644 index 00000000..25e7066d --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/LT_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/LT_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MAX_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/MAX_dt_float16.cu new file mode 100644 index 00000000..02f5aacd --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MAX_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MAX_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/MAX_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/MAX_dt_float32.cu new file mode 100644 index 00000000..c9d81602 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MAX_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MAX_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MAX_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/MAX_dt_int16.cu new file mode 100644 index 00000000..de1de5fe --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MAX_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MAX_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MAX_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/MAX_dt_int32.cu new file mode 100644 index 00000000..02654c53 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MAX_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MAX_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MAX_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/MAX_dt_int8.cu new file mode 100644 index 00000000..7387197b --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MAX_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MAX_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MAX_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/MAX_dt_uint8.cu new file mode 100644 index 00000000..2c06557e --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MAX_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MAX_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MIN_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/MIN_dt_float16.cu new file mode 100644 index 00000000..77580cd3 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MIN_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MIN_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/MIN_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/MIN_dt_float32.cu new file mode 100644 index 00000000..3ef78a5e --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MIN_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MIN_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MIN_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/MIN_dt_int16.cu new file mode 100644 index 00000000..4b2f1e8a --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MIN_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MIN_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MIN_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/MIN_dt_int32.cu new file mode 100644 index 00000000..e253b54e --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MIN_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MIN_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MIN_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/MIN_dt_int8.cu new file mode 100644 index 00000000..c94fe5a2 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MIN_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MIN_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MIN_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/MIN_dt_uint8.cu new file mode 100644 index 00000000..047ca3f3 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MIN_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MIN_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MOD_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/MOD_dt_float16.cu new file mode 100644 index 00000000..8c0e1e86 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MOD_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MOD_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/MOD_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/MOD_dt_float32.cu new file mode 100644 index 00000000..a18c33ef --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MOD_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MOD_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MOD_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/MOD_dt_int16.cu new file mode 100644 index 00000000..1ca2d0b3 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MOD_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MOD_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MOD_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/MOD_dt_int32.cu new file mode 100644 index 00000000..27adc7de --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MOD_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MOD_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MOD_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/MOD_dt_int8.cu new file mode 100644 index 00000000..67af99ed --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MOD_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MOD_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MOD_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/MOD_dt_uint8.cu new file mode 100644 index 00000000..5c2239f5 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MOD_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MOD_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MUL_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/MUL_dt_float16.cu new file mode 100644 index 00000000..fa5c045d --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MUL_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MUL_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/MUL_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/MUL_dt_float32.cu new file mode 100644 index 00000000..1221c930 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MUL_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MUL_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MUL_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/MUL_dt_int16.cu new file mode 100644 index 00000000..ed8d087d --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MUL_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MUL_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MUL_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/MUL_dt_int32.cu new file mode 100644 index 00000000..d134cbc1 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MUL_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MUL_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MUL_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/MUL_dt_int8.cu new file mode 100644 index 00000000..51a940dc --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MUL_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MUL_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/MUL_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/MUL_dt_uint8.cu new file mode 100644 index 00000000..869ac8d1 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/MUL_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/MUL_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_float16.cu new file mode 100644 index 00000000..37bbef68 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/NEGATE_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_float32.cu new file mode 100644 index 00000000..67450e9c --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/NEGATE_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int16.cu new file mode 100644 index 00000000..9fad09df --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int32.cu new file mode 100644 index 00000000..2b050a96 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int8.cu new file mode 100644 index 00000000..a2a4fab8 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/NEGATE_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_uint8.cu new file mode 100644 index 00000000..e5a7c179 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/NEGATE_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/NEGATE_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/POW_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/POW_dt_float16.cu new file mode 100644 index 00000000..649056ec --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/POW_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/POW_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(POW, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/POW_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/POW_dt_float32.cu new file mode 100644 index 00000000..961963cd --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/POW_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/POW_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(POW, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/RELU_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/RELU_dt_float16.cu new file mode 100644 index 00000000..03ae007b --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RELU_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RELU_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/RELU_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/RELU_dt_float32.cu new file mode 100644 index 00000000..dd51d693 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RELU_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RELU_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/RELU_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/RELU_dt_int16.cu new file mode 100644 index 00000000..16108bb4 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RELU_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RELU_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/RELU_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/RELU_dt_int32.cu new file mode 100644 index 00000000..6d8c6515 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RELU_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RELU_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/RELU_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/RELU_dt_int8.cu new file mode 100644 index 00000000..755fe67a --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RELU_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RELU_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/RELU_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/RELU_dt_uint8.cu new file mode 100644 index 00000000..f3a99f6d --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RELU_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RELU_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/RMULH_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/RMULH_dt_int16.cu new file mode 100644 index 00000000..843836c3 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RMULH_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RMULH_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RMULH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/RMULH_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/RMULH_dt_int32.cu new file mode 100644 index 00000000..a8c791b8 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RMULH_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RMULH_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RMULH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/RMULH_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/RMULH_dt_int8.cu new file mode 100644 index 00000000..7cedcd83 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RMULH_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RMULH_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RMULH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/RMULH_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/RMULH_dt_uint8.cu new file mode 100644 index 00000000..b7962150 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/RMULH_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/RMULH_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RMULH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/ROUND_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/ROUND_dt_float16.cu new file mode 100644 index 00000000..bdd87ff9 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ROUND_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ROUND_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ROUND, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/ROUND_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/ROUND_dt_float32.cu new file mode 100644 index 00000000..06db7e4d --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/ROUND_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/ROUND_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ROUND, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SHL_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/SHL_dt_int16.cu new file mode 100644 index 00000000..53b5d392 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SHL_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SHL_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SHL_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/SHL_dt_int32.cu new file mode 100644 index 00000000..71f570c7 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SHL_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SHL_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SHL_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/SHL_dt_int8.cu new file mode 100644 index 00000000..6b4d862f --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SHL_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SHL_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SHL_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/SHL_dt_uint8.cu new file mode 100644 index 00000000..46124c93 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SHL_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SHL_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SHR_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/SHR_dt_int16.cu new file mode 100644 index 00000000..e7a2a173 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SHR_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SHR_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHR, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SHR_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/SHR_dt_int32.cu new file mode 100644 index 00000000..096f339f --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SHR_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SHR_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHR, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SHR_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/SHR_dt_int8.cu new file mode 100644 index 00000000..d968d8ae --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SHR_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SHR_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHR, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SHR_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/SHR_dt_uint8.cu new file mode 100644 index 00000000..700fbf44 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SHR_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SHR_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHR, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_float16.cu new file mode 100644 index 00000000..c552b8bd --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_float32.cu new file mode 100644 index 00000000..fd94dbf0 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int16.cu new file mode 100644 index 00000000..b310f877 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int32.cu new file mode 100644 index 00000000..a961fbf1 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int8.cu new file mode 100644 index 00000000..cda67cde --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_uint8.cu new file mode 100644 index 00000000..fa731e7c --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIGMOID_GRAD_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SIGMOID_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/SIGMOID_dt_float16.cu new file mode 100644 index 00000000..6157b102 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIGMOID_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIGMOID_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/SIGMOID_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/SIGMOID_dt_float32.cu new file mode 100644 index 00000000..677d3a8e --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIGMOID_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIGMOID_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SIN_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/SIN_dt_float16.cu new file mode 100644 index 00000000..27fe547a --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIN_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIN_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIN, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/SIN_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/SIN_dt_float32.cu new file mode 100644 index 00000000..28e9db2f --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SIN_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SIN_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIN, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SUB_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/SUB_dt_float16.cu new file mode 100644 index 00000000..e95cde06 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SUB_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SUB_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/SUB_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/SUB_dt_float32.cu new file mode 100644 index 00000000..a3f824b3 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SUB_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SUB_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SUB_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/SUB_dt_int16.cu new file mode 100644 index 00000000..29d104a1 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SUB_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SUB_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SUB_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/SUB_dt_int32.cu new file mode 100644 index 00000000..d7a2d0fd --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SUB_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SUB_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SUB_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/SUB_dt_int8.cu new file mode 100644 index 00000000..cc66a40e --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SUB_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SUB_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SUB_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/SUB_dt_uint8.cu new file mode 100644 index 00000000..deffafc1 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SUB_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SUB_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_float16.cu new file mode 100644 index 00000000..07255631 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_float32.cu new file mode 100644 index 00000000..01ed2df7 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int16.cu new file mode 100644 index 00000000..b564e4af --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int32.cu new file mode 100644 index 00000000..4521ee09 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int8.cu new file mode 100644 index 00000000..b59446a2 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_uint8.cu new file mode 100644 index 00000000..db410c49 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/SWITCH_GT0_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_float16.cu new file mode 100644 index 00000000..ce454599 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_float32.cu new file mode 100644 index 00000000..433b55de --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int16.cu b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int16.cu new file mode 100644 index 00000000..2697ebf0 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int16.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int32.cu b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int32.cu new file mode 100644 index 00000000..f9544794 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int8.cu b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int8.cu new file mode 100644 index 00000000..c655aaa0 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_uint8.cu b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_uint8.cu new file mode 100644 index 00000000..ab2036e3 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_uint8.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TANH_GRAD_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/TANH_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/TANH_dt_float16.cu new file mode 100644 index 00000000..683f4883 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TANH_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TANH_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/TANH_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/TANH_dt_float32.cu new file mode 100644 index 00000000..00c542c6 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TANH_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TANH_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/kimpl/TRUE_DIV_dt_float16.cu b/dnn/src/cuda/elemwise/kimpl/TRUE_DIV_dt_float16.cu new file mode 100644 index 00000000..ed382689 --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TRUE_DIV_dt_float16.cu @@ -0,0 +1,17 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TRUE_DIV_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TRUE_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/cuda/elemwise/kimpl/TRUE_DIV_dt_float32.cu b/dnn/src/cuda/elemwise/kimpl/TRUE_DIV_dt_float32.cu new file mode 100644 index 00000000..7441a0ae --- /dev/null +++ b/dnn/src/cuda/elemwise/kimpl/TRUE_DIV_dt_float32.cu @@ -0,0 +1,15 @@ +/** + * \file dnn/src/cuda/elemwise/kimpl/TRUE_DIV_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TRUE_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise/opr_impl.cpp b/dnn/src/cuda/elemwise/opr_impl.cpp new file mode 100644 index 00000000..2d927dca --- /dev/null +++ b/dnn/src/cuda/elemwise/opr_impl.cpp @@ -0,0 +1,72 @@ +/** + * \file dnn/src/cuda/elemwise/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./opr_impl.h" +#include "./kern_wrapper.cuh" +#include "./special_kerns.cuh" + +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +#define on_arity_dispatched_cb_dtype(_dt) \ + if (m_dst->layout.dtype == _dt()) { \ + using dtrait = DTypeTrait<_dt>; \ + using ctype = dtrait::ctype; \ + auto stream = cuda_stream(handle()); \ + return ModeDispatcher::run( \ + src, stream, m_param.mode, m_dst->ptr()); \ + } + +#define _cb_dispatch_mode(_m) case Mode::_m: do { \ + using KernImpl = ElemwiseKern< \ + megcorePlatformCUDA, param_enumv::Elemwise::Mode::_m, ctype>; \ + using Wrapper = ElemArithKernWrapper; \ + Wrapper wrapper; \ + wrapper.dst = static_cast(dst); \ + return run_elemwise(src, stream, wrapper); \ +} while(0); + +#define IMPL_MODE_DISPATCHER(_arity, _dtype_cat) \ +template \ +struct ElemwiseForwardImpl::ModeDispatcher<_arity, _dtype_cat, ctype> { \ + static constexpr int arity = _arity; \ + static void run(const ElemwiseOpParamN &src, \ + cudaStream_t stream, Mode mode, void *dst) { \ + switch (mode) { \ + FOREACH(_cb_dispatch_mode) \ + default: \ + megdnn_throw("bad mode"); \ + } \ + } \ +} + +#include "src/common/elemwise/opr_impl_body.inl" + +template +void ElemwiseForwardImpl::impl_fuse_mul_add3( + const ElemwiseOpParamN<3> ¶m) { + kern_fuse_mul_add3( + m_dst->ptr(), param, cuda_stream(handle())); +} + +template +void ElemwiseForwardImpl::impl_fuse_mul_add4( + const ElemwiseOpParamN<4> ¶m) { + kern_fuse_mul_add4(m_dst->ptr(), param, cuda_stream(handle())); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/elemwise/opr_impl.h b/dnn/src/cuda/elemwise/opr_impl.h new file mode 100644 index 00000000..4c25a0ae --- /dev/null +++ b/dnn/src/cuda/elemwise/opr_impl.h @@ 
-0,0 +1,27 @@ +/** + * \file dnn/src/cuda/elemwise/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/common/elemwise/opr_impl_helper.h" + +namespace megdnn { +namespace cuda { + + class ElemwiseForwardImpl final: public ElemwiseForwardImplHelper { +#include "src/common/elemwise/opr_impl_class_def.inl" + }; + +} +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/elemwise/special_kerns.cuh b/dnn/src/cuda/elemwise/special_kerns.cuh new file mode 100644 index 00000000..44b9cb92 --- /dev/null +++ b/dnn/src/cuda/elemwise/special_kerns.cuh @@ -0,0 +1,31 @@ +/** + * \file dnn/src/cuda/elemwise/special_kerns.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/cuda/elemwise_helper.cuh" + +namespace megdnn { +namespace cuda { + + template + void kern_fuse_mul_add3(ctype *dest, + const ElemwiseOpParamN<3> ¶m, cudaStream_t stream); + + template + void kern_fuse_mul_add4(ctype *dest, + const ElemwiseOpParamN<4> ¶m, cudaStream_t stream); + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/elemwise/special_kerns.inl b/dnn/src/cuda/elemwise/special_kerns.inl new file mode 100644 index 00000000..9b3cf59b --- /dev/null +++ b/dnn/src/cuda/elemwise/special_kerns.inl @@ -0,0 +1,252 @@ +/** + * \file dnn/src/cuda/elemwise/special_kerns.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./special_kerns.cuh" + +namespace megdnn { +namespace cuda { +namespace elemwise_intl { + + template + struct FuseMulAdd3Op { + typedef ctype* __restrict bufptr_t; + bufptr_t m_dst, m_src2; + + __device__ __forceinline__ void operator()(uint32_t idx, int off0, + int /* off1 */, ctype x, + ctype y) { + m_dst[idx] = x * y + m_src2[c_is_scalar ? 
0 : off0]; + } + }; + + template + struct FuseMulAdd3Op::value || + std::is_same::value>::type> { + typedef ctype* __restrict bufptr_t; + typedef typename VectTypeTrait::vect_type vect_type; + bufptr_t m_dst, m_src2; + __device__ __forceinline__ void operator()(uint32_t idx, int off0, int, + ctype x, ctype y) { + m_dst[idx] = x * y + m_src2[0]; + } + __device__ __forceinline__ void operator()(int32_t idx, int off0, int, + vect_type x, vect_type y) { + ctype a = x.x * y.x + m_src2[0]; + ctype b = x.y * y.y + m_src2[0]; + ctype g = x.z * y.z + m_src2[0]; + ctype r = x.w * y.w + m_src2[0]; + *(vect_type*)(&m_dst[idx]) = + VectTypeTrait::make_vector(a, b, g, r); + } + }; + + template + struct FuseMulAdd3Op::value || + std::is_same::value>::type> { + typedef ctype* __restrict bufptr_t; + typedef typename VectTypeTrait::vect_type vect_type; + bufptr_t m_dst, m_src2; + __device__ __forceinline__ void operator()(uint32_t idx, int off0, int, + ctype x, ctype y) { + m_dst[idx] = x * y + m_src2[off0]; + } + __device__ __forceinline__ void operator()(int32_t idx, int off0, int, + vect_type x, vect_type y) { + vect_type z = *(vect_type*)(&m_src2[off0]); + ctype a = x.x * y.x + z.x; + ctype b = x.y * y.y + z.y; + ctype g = x.z * y.z + z.z; + ctype r = x.w * y.w + z.w; + *(vect_type*)(&m_dst[idx]) = + VectTypeTrait::make_vector(a, b, g, r); + } + }; + + template + struct FuseMulAdd4Op { + typedef ctype* __restrict bufptr_t; + bufptr_t m_dst, m_src2, m_src3; + + __device__ __forceinline__ void operator()(uint32_t idx, int off0, int off1, + ctype src0, ctype src1) { + m_dst[idx] = src0 * src1 + m_src2[off0] * m_src3[off1]; + } + }; + + template + struct FuseMulAdd4Op::value || + std::is_same::value>::type> { + typedef ctype* __restrict bufptr_t; + typedef typename VectTypeTrait::vect_type vect_type; + bufptr_t m_dst, m_src2, m_src3; + __device__ __forceinline__ void operator()(uint32_t idx, int off0, + int off1, ctype x, ctype y) { + m_dst[idx] = x * y + m_src2[off0] * m_src3[off1]; + } + __device__ __forceinline__ void operator()(uint32_t idx, int off0, + int off1, vect_type x, + vect_type y) { + vect_type z = *(vect_type*)(&m_src2[off0]); + vect_type w = *(vect_type*)(&m_src3[off1]); + ctype a = x.x * y.x + z.x * w.x; + ctype b = x.y * y.y + z.y * w.y; + ctype g = x.z * y.z + z.z * w.z; + ctype r = x.w * y.w + z.w * w.w; + *(vect_type*)(&m_dst[idx]) = + VectTypeTrait::make_vector(a, b, g, r); + } + }; + + //! 
wrap an op so the special OpCaller can be selected by template matching + template + class FuseOpWrapper { + const Op& m_op; + + public: + FuseOpWrapper(const Op& op) : m_op(op) {} + + operator const Op&() const { return m_op; } + }; + + template + struct OpCallerBinary, PVis0, PVis1> { + Op op; + PVis0 par0; + PVis1 par1; + MEGDNN_STATIC_ASSERT(PVis0::packed_size == PVis1::packed_size, + "vector size mismatch"); + static const uint32_t packed_size = PVis0::packed_size; + + __device__ __forceinline__ void thread_init(uint32_t idx) { + idx = idx * packed_size; + par0.thread_init(idx); + par1.thread_init(idx); + } + + __device__ __forceinline__ void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par0.offset(idx), par1.offset(idx), par0.at(idx), + par1.at(idx)); + } + + __device__ __forceinline__ void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par0.offset(idx), par1.offset(idx), par0.at(idx), + par1.at(idx)); + } else { + auto ptr0 = par0.ptr(); + auto ptr1 = par1.ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, par0.offset(idx + i), par1.offset(idx + i), + ptr0[par0.offset(idx + i)], ptr1[par1.offset(idx + i)]); + } + } + } + + __device__ __forceinline__ void next() { + par0.next(); + par1.next(); + } + }; + + template + struct OpCallerUniform, 2, PVis> { + Op op; + PVis par[2]; + static const uint32_t packed_size = PVis::packed_size; + + __device__ __forceinline__ void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + par[1].thread_init(idx); + } + + __device__ __forceinline__ void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].offset(idx), par[1].offset(idx), par[0].at(idx), + par[1].at(idx)); + } + + __device__ __forceinline__ void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].offset(idx), par[1].offset(idx), par[0].at(idx), + par[1].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + auto ptr1 = par[1].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, par[0].offset(idx + i), par[1].offset(idx + i), + ptr0[par[0].offset(idx + i)], + ptr1[par[1].offset(idx + i)]); + } + } + } + + __device__ __forceinline__ void next() { + par[0].next(); + par[1].next(); + } + }; + +} // namespace elemwise_intl + +namespace { + template + void run_fuse_elemwise(Op& op, const ElemwiseOpParamN& param, + cudaStream_t stream) { + param.assert_initialized(); + ElemwiseOpParamN<2> p2 = *static_cast*>( + static_cast(¶m)); + elemwise_intl::UserOpInvoker, ctype, 2>( + p2, stream, op); + } +} // anonymous namespace + + template + void kern_fuse_mul_add3(ctype* dest, const ElemwiseOpParamN<3>& param, + cudaStream_t stream) { + elemwise_intl::FuseMulAdd3Op op; + op.m_dst = dest; + op.m_src2 = param[2].ptr(); + run_fuse_elemwise(op, param, stream); + } + + template + void kern_fuse_mul_add4(ctype* dest, const ElemwiseOpParamN<4>& param, + cudaStream_t stream) { + elemwise_intl::FuseMulAdd4Op op; + op.m_dst = dest; + op.m_src2 = param[2].ptr(); + op.m_src3 = param[3].ptr(); + run_fuse_elemwise(op, param, stream); + } + +#define INST(_dt) \ + template void kern_fuse_mul_add3(DTypeTrait<_dt>::ctype*, \ + const ElemwiseOpParamN<3>&, \ + cudaStream_t); \ + template void kern_fuse_mul_add3(DTypeTrait<_dt>::ctype*, \ + const ElemwiseOpParamN<3>&, \ + cudaStream_t); \ + template void kern_fuse_mul_add4(DTypeTrait<_dt>::ctype*, \ + const ElemwiseOpParamN<4>&, \ + cudaStream_t); + +// vim: ft=cuda syntax=cpp.doxygen + diff --git 
a/dnn/src/cuda/elemwise/special_kimpl/special_dt_float16.cu b/dnn/src/cuda/elemwise/special_kimpl/special_dt_float16.cu new file mode 100644 index 00000000..2857f61b --- /dev/null +++ b/dnn/src/cuda/elemwise/special_kimpl/special_dt_float16.cu @@ -0,0 +1,18 @@ +/** + * \file dnn/src/cuda/elemwise/special_kimpl/special_dt_float16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_special_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#include "../special_kerns.inl" +INST(::megdnn::dtype::Float16) +#undef INST +} +} +#endif diff --git a/dnn/src/cuda/elemwise/special_kimpl/special_dt_float32.cu b/dnn/src/cuda/elemwise/special_kimpl/special_dt_float32.cu new file mode 100644 index 00000000..fb929f03 --- /dev/null +++ b/dnn/src/cuda/elemwise/special_kimpl/special_dt_float32.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise/special_kimpl/special_dt_float32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_special_kern_impls.py +#include "../special_kerns.inl" +INST(::megdnn::dtype::Float32) +#undef INST +} +} diff --git a/dnn/src/cuda/elemwise/special_kimpl/special_dt_int16.cu b/dnn/src/cuda/elemwise/special_kimpl/special_dt_int16.cu new file mode 100644 index 00000000..b16743de --- /dev/null +++ b/dnn/src/cuda/elemwise/special_kimpl/special_dt_int16.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise/special_kimpl/special_dt_int16.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_special_kern_impls.py +#include "../special_kerns.inl" +INST(::megdnn::dtype::Int16) +#undef INST +} +} diff --git a/dnn/src/cuda/elemwise/special_kimpl/special_dt_int32.cu b/dnn/src/cuda/elemwise/special_kimpl/special_dt_int32.cu new file mode 100644 index 00000000..74bf726b --- /dev/null +++ b/dnn/src/cuda/elemwise/special_kimpl/special_dt_int32.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise/special_kimpl/special_dt_int32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_special_kern_impls.py +#include "../special_kerns.inl" +INST(::megdnn::dtype::Int32) +#undef INST +} +} diff --git a/dnn/src/cuda/elemwise/special_kimpl/special_dt_int8.cu b/dnn/src/cuda/elemwise/special_kimpl/special_dt_int8.cu new file mode 100644 index 00000000..fafb0923 --- /dev/null +++ b/dnn/src/cuda/elemwise/special_kimpl/special_dt_int8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise/special_kimpl/special_dt_int8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_special_kern_impls.py +#include "../special_kerns.inl" +INST(::megdnn::dtype::Int8) +#undef INST +} +} diff --git a/dnn/src/cuda/elemwise/special_kimpl/special_dt_uint8.cu b/dnn/src/cuda/elemwise/special_kimpl/special_dt_uint8.cu new file mode 100644 index 00000000..00c83190 --- /dev/null +++ b/dnn/src/cuda/elemwise/special_kimpl/special_dt_uint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise/special_kimpl/special_dt_uint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_special_kern_impls.py +#include "../special_kerns.inl" +INST(::megdnn::dtype::Uint8) +#undef INST +} +} diff --git a/dnn/src/cuda/elemwise_helper.cpp b/dnn/src/cuda/elemwise_helper.cpp new file mode 100644 index 00000000..15791f6a --- /dev/null +++ b/dnn/src/cuda/elemwise_helper.cpp @@ -0,0 +1,209 @@ +/** + * \file dnn/src/cuda/elemwise_helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/cuda/elemwise_helper.cuh" +#include "src/cuda/query_blocksize.cuh" +#include "src/cuda/utils.h" + +#include "src/common/utils.h" + +#include +#include +#include + +#define _cb_check_ndim(n) megdnn::TensorShape::MAX_NDIM == n || +static_assert(MEGDNN_FOREACH_TENSOR_NDIM(_cb_check_ndim) false, + "bad foreach ndim"); +#undef _cb_check_ndim + +namespace megdnn { +namespace cuda { + +// ParamElemVisitor::init impls +namespace elemwise_intl { + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" +template +void ParamElemVisitor::host_init( + const TensorND &rv, int /*grid_size*/, int /*block_size*/) { + megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim); + m_ptr = rv.ptr(); + for (size_t i = 0; i < rv.layout.ndim; ++ i) { + m_stride[i] = rv.layout.stride[i]; + if (i + 1 < rv.layout.ndim) + m_shape_highdim[i] = rv.layout.shape[i + 1]; + } + for (int i = rv.layout.ndim - 1; i < ndim - 1; ++ i) { + m_shape_highdim[i] = 1; + } + for (int i = rv.layout.ndim; i < ndim; ++ i) { + m_stride[i] = 0; + } +} +#pragma GCC diagnostic pop + +template +void ParamElemVisitor<3, ctype, BCAST_101>::host_init( + const TensorND& rv, int grid_size, int block_size) { + uint32_t shape2, shape1; + int stride1; + if (rv.layout.ndim == 3) { + megdnn_assert(!rv.layout.stride[0] && !rv.layout.stride[2]); + shape1 = rv.layout[1]; + shape2 = rv.layout[2]; + stride1 = rv.layout.stride[1]; + } else { + megdnn_assert(rv.layout.ndim == 2 && !rv.layout.stride[1]); + shape1 = rv.layout[0]; + shape2 = rv.layout[1]; + stride1 = rv.layout.stride[0]; + } + m_ptr = rv.ptr(); + m_stride1 = stride1; + m_shape12.host_init(packed_size * grid_size * block_size, shape2, shape1); +} + +template +void ParamElemVisitor<2, ctype, BCAST_10>::host_init(const TensorND& rv, + int grid_size, + int block_size) { + megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]); + m_ptr = rv.ptr(); + m_stride1 = rv.layout.stride[1]; + m_shape1.host_init(packed_size * grid_size * block_size, + rv.layout.shape[1]); +} + +template +void ParamElemVisitor<2, ctype, BCAST_01>::host_init(const TensorND& rv, + int grid_size, + int block_size) { + megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[1]); + m_ptr = rv.ptr(); + m_stride0 = rv.layout.stride[0]; + m_shape1.host_init(packed_size * grid_size * block_size, + rv.layout.shape[1]); +} + +template +void ParamElemVisitor<1, ctype, BCAST_FULL>::host_init( + const TensorND &rv, int /*grid_size*/, int /*block_size*/) { + megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]); + m_ptr = rv.ptr(); +} + +template +void ParamVectVisitor<4, ctype, BCAST_1010>::host_init(const TensorND& rv, + int grid_size, + int block_size) { + megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0] && + !rv.layout.stride[2]); + m_ptr = rv.ptr(); + m_stride1 = rv.layout.stride[1]; + m_stride3 = rv.layout.stride[3]; + uint32_t shape1 = rv.layout.shape[1]; + uint32_t shape2 = rv.layout.shape[2]; + uint32_t shape3 = rv.layout.shape[3]; + m_shape123.host_init(packed_size * grid_size * block_size, shape2 * shape3, + shape1); + m_shape3.host_init(packed_size * grid_size * block_size, shape3); +} + +#define INST(ndim, ctype, brd) template class ParamElemVisitor +#define INST_FOR_CTYPE \ + MEGDNN_FOREACH_TENSOR_NDIM(ndim_cb) \ + INST(3, ct, BCAST_101); \ + INST(2, ct, BCAST_10); \ + INST(2, ct, BCAST_01); \ + INST(1, ct, BCAST_FULL); + + +#define ndim_cb(_ndim) INST(_ndim, ct, BCAST_OTHER); + +#define ct dt_byte +INST_FOR_CTYPE +#undef ct +#define ct dt_int32 
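+// INST_FOR_CTYPE instantiates ParamElemVisitor for the element type currently bound to `ct`: once per tensor ndim with BCAST_OTHER (via MEGDNN_FOREACH_TENSOR_NDIM / ndim_cb), plus the BCAST_101, BCAST_10, BCAST_01 and BCAST_FULL broadcast specializations.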
+INST_FOR_CTYPE +#undef ct +#define ct dt_float32 +INST_FOR_CTYPE +#undef ct +#define ct dt_float16 +INST_FOR_CTYPE +#undef ct +#define ct dt_int8 +INST_FOR_CTYPE +#undef ct +#define ct dt_uint8 +INST_FOR_CTYPE +#undef ct +#define ct dt_int16 +INST_FOR_CTYPE +#undef ct +#define ct dt_quint8 +INST_FOR_CTYPE +#undef ct +#define ct dt_qint8 +INST_FOR_CTYPE +#undef ct +#define ct dt_qint32 +INST_FOR_CTYPE +#undef ct + +#undef ndim_cb + +#undef INST_FOR_CTYPE +#undef INST + +#define INST(dt_ibyte) template class ParamVectVisitor<4, dt_ibyte, BCAST_1010> +INST(dt_int8); +INST(dt_uint8); +INST(dt_qint8); +INST(dt_quint8); +#undef dt_ibyte + +} // namespace elemwise_intl + + +void elemwise_intl::get_launch_spec( + const void *kern, size_t size, int *grid_size, int *block_size) { + safe_size_in_kern(size); + auto config = query_launch_config_for_kernel(kern); + *block_size = config.block_size; + int a = size / (config.block_size * 2), + b = (size - 1) / (config.block_size * 3) + 1; + if (current_device_prop().major <= 3) { + // for Kepler, less blocks (more work per thread) is faster + *grid_size = b; + } else { + *grid_size = std::max(a, b); + } + if (!*grid_size) { + *block_size = std::min(std::max(size / 64, 1) * 32, 1024); + *grid_size = std::max(size / *block_size, 1); + } + // because we unroll 3 times in the kernel + megdnn_assert(static_cast(*block_size) * *grid_size * 3 >= size); +} + +void elemwise_intl::on_bad_ndim(int ndim) { + megdnn_throw(ssprintf("invalid ndim: %d", ndim)); + MEGDNN_MARK_USED_VAR(ndim); + +} +} // namespace cuda +} // namespace megdnn + + +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} + diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh new file mode 100644 index 00000000..14bf22a9 --- /dev/null +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -0,0 +1,1250 @@ +/** + * \file dnn/src/cuda/elemwise_helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/common/elemwise_helper.cuh" +#include "src/cuda/utils.cuh" +#include "src/cuda/int_fastdiv.cuh" +#include "src/cuda/query_blocksize.cuh" + +/* + * please note that all arithmetics on GPU are 32-bit for best performance; this + * limits max possible size + */ + +namespace megdnn { +namespace cuda { + +//! internals for element-wise +namespace elemwise_intl { +#define devfunc __device__ __forceinline__ + + /*! + * \brief get cuda launch specs for element-wise kernel + * \param kern kernel function address + * \param size total size of elements + */ + void get_launch_spec( + const void *kern, size_t size, int *grid_size, int *block_size); + + MEGDNN_NORETURN void on_bad_ndim(int ndim); + + /*! + * \brief broadcast type + * BCAST_x[0]x[1]...: x[i] == !stride[i] + */ + enum BcastType { + BCAST_OTHER, + BCAST_1010, + BCAST_101, + BCAST_10, + BCAST_01, + BCAST_FULL + }; + + /*! 
+ * \brief read and write type trait for byte width integer type + */ + template + class VectTypeTrait; + + struct __attribute__((aligned(8))) half4 { + dt_float16 x, y, z, w; + }; + + __device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y, + dt_float16 z, dt_float16 w) { + half4 t; + t.x = x, t.y = y, t.z = z, t.w = w; + return t; + } + +#define INST(_ctype, _vect_type) \ + template <> \ + class VectTypeTrait<_ctype> { \ + public: \ + using vect_type = _vect_type; \ + static const size_t packed_size = sizeof(_vect_type) / sizeof(_ctype); \ + static __device__ __forceinline__ vect_type make_vector(_ctype x, \ + _ctype y, \ + _ctype z, \ + _ctype w) { \ + return make_##_vect_type(as_raw(x), as_raw(y), as_raw(z), \ + as_raw(w)); \ + } \ + } +#define as_raw(x) x + INST(dt_int8, char4); + INST(dt_uint8, uchar4); + INST(dt_float32, float4); + INST(dt_float16, half4); + INST(dt_int32, int4); + INST(dt_int16, short4); +#undef as_raw +#define as_raw(x) x.as_int8() + INST(dt_qint8, char4); +#undef as_raw +#define as_raw(x) x.as_uint8() + INST(dt_quint8, uchar4); +#undef as_raw +#define as_raw(x) x.as_int32() + INST(dt_qint32, int4); +#undef as_raw +#undef INST + + /*! + * \brief visitor to access an elemeent in a tensor at given logic index + * \tparam ctype plain element ctype (i.e. ctype in DTypeTrait) + * \tparam brdcast_mask bit mask for broadcast of params; (i.e. stride[i] is + * 0 iff (brdcast_mask & (1<<(ndim-1-i))) is 1. + * + * host interface: + * void host_init( + * const TensorND &tensor, int grid_size, int block_size) + * + * device interface: + * void thread_init(uint32_t idx) + * called on thread entrance, with logical indexing; the index may + * go beyond buffer range + * + * ctype* ptr() + * return buffer pointer; can be used by specialized OpCaller + * + * void next() + * called before moving to next chunk on each thread + * + * int offset(uint32_t idx) + * get physical offset from logical index + * + * ctype& at(uint32_t idx) + * ptr()[offset(idx)] + * + */ + template + class ParamElemVisitor; + + /*! + * \brief visitor to access vector element in a tensor at given logic index + * \tparam ctype same as ParamElemVisitor, vect_type packed vector type of + * element ctype (i.e. vect_type in VectTypeTrait) \tparam brdcast_mask same + * as ParamElemVisitor + * + * + * device interface: + * vect_type& at(uint32_t idx) + * ptr()[offset(idx)] + * + */ + template + class ParamVectVisitor; + + /* f{{{ ParamElemVisitor specializations */ + +#define PARAM_ELEM_VISITOR_COMMON_DEV \ + devfunc ctype* ptr() { return m_ptr; } \ + devfunc ctype& at(uint32_t idx) { return m_ptr[offset(idx)]; } +#define PARAM_ELEM_VISITOR_COMMON_HOST static const int packed_size = 1; + + //! specialization for BCAST_OTHER + template + class ParamElemVisitor { + protected: + ctype* __restrict m_ptr; + + private: + int m_stride[ndim]; + + //! m_shape_highdim[i] = original_shape[i + 1] +#ifdef _MSC_VER + Uint32Fastdiv m_shape_highdim[ndim > 1 ? 
ndim - 1 : 1]; +#else + Uint32Fastdiv m_shape_highdim[ndim]; +#endif + + public: + static const int NDIM = ndim; + PARAM_ELEM_VISITOR_COMMON_HOST + + void host_init(const TensorND& rv, int grid_size, int block_size); + +#if MEGDNN_CC_CUDA + devfunc void thread_init(uint32_t) {} + + devfunc void next() {} + + devfunc int offset(uint32_t idx) { + int offset = 0; +#pragma unroll + for (int i = ndim - 1; i >= 1; --i) { + Uint32Fastdiv& shp = m_shape_highdim[i - 1]; + uint32_t idx_div = idx / shp; + offset += (idx - idx_div * shp.divisor()) * m_stride[i]; + idx = idx_div; + } + offset += idx * m_stride[0]; + return offset; + } + + PARAM_ELEM_VISITOR_COMMON_DEV +#endif + }; + + /*! + * \brief specialization for ndim == 3 and BCAST_101 + * (for dimshuffle 'x', 0, 'x') + * + * visit: idx / m_shape2 % m_shape1 + */ + template + class ParamElemVisitor<3, ctype, BCAST_101> { + StridedDivSeq2 m_shape12; + int m_stride1; + + protected: + ctype* __restrict m_ptr; + + public: + static const int NDIM = 3; + PARAM_ELEM_VISITOR_COMMON_HOST + + void host_init(const TensorND& rv, int grid_size, int block_size); + +#if MEGDNN_CC_CUDA + devfunc void thread_init(uint32_t idx) { m_shape12.device_init(idx); } + + devfunc void next() { m_shape12.next(); } + + devfunc int offset(uint32_t idx) { return m_shape12.get() * m_stride1; } + + PARAM_ELEM_VISITOR_COMMON_DEV +#endif + }; + + /*! + * \brief specialization for ndim == 2 and BCAST_10 + * + * visit: idx % m_shape1 + */ + template + class ParamElemVisitor<2, ctype, BCAST_10> { + StridedDivSeq m_shape1; + int m_stride1; + + protected: + ctype* __restrict m_ptr; + + public: + static const int NDIM = 2; + PARAM_ELEM_VISITOR_COMMON_HOST + + void host_init(const TensorND& rv, int grid_size, int block_size); + +#if MEGDNN_CC_CUDA + devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } + + devfunc void next() { m_shape1.next(); } + + devfunc int offset(uint32_t idx) { return m_shape1.r() * m_stride1; } + + PARAM_ELEM_VISITOR_COMMON_DEV +#endif + }; + + /*! + * \brief specialization for ndim == 2 and BCAST_01 + * + * visit: idx / shape1 + */ + template + class ParamElemVisitor<2, ctype, BCAST_01> { + StridedDivSeq m_shape1; + int m_stride0; + + protected: + ctype* __restrict m_ptr; + + public: + static const int NDIM = 2; + PARAM_ELEM_VISITOR_COMMON_HOST + + void host_init(const TensorND& rv, int grid_size, int block_size); + +#if MEGDNN_CC_CUDA + devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } + + devfunc void next() { m_shape1.next(); } + + devfunc int offset(uint32_t idx) { return m_shape1.q() * m_stride0; } + + PARAM_ELEM_VISITOR_COMMON_DEV +#endif + }; + + //! 
specialization for ndim == 1 and BCAST_FULL + template + class ParamElemVisitor<1, ctype, BCAST_FULL> { + protected: + ctype* __restrict m_ptr; + + public: + static const int NDIM = 1; + PARAM_ELEM_VISITOR_COMMON_HOST + + void host_init(const TensorND& rv, int grid_size, int block_size); + +#if MEGDNN_CC_CUDA + devfunc void thread_init(uint32_t) {} + + devfunc void next() {} + + devfunc int offset(uint32_t idx) { + MEGDNN_MARK_USED_VAR(idx); + return 0; + } + + PARAM_ELEM_VISITOR_COMMON_DEV +#endif + }; + +#undef PARAM_ELEM_VISITOR_COMMON_DEV +#undef PARAM_ELEM_VISITOR_COMMON_HOST + + /* f}}} */ + + /* f{{{ ParamVectVisitor specializations */ + +#if MEGDNN_CC_CUDA +#define DEVICE_WRAPPER(x) x +#else +#define DEVICE_WRAPPER(x) +#endif +#define INST_PARAM_VECT_VISITOR \ + template \ + class ParamVectVisitor \ + : public ParamElemVisitor { \ + public: \ + using Super = ParamElemVisitor; \ + using rwtype = typename VectTypeTrait::vect_type; \ + static const int packed_size = sizeof(rwtype) / sizeof(ctype); \ + DEVICE_WRAPPER(devfunc rwtype& at(uint32_t idx) { \ + return *(rwtype*)(&Super::m_ptr[Super::offset(idx)]); \ + }) \ + }; +#define _brdcast_mask BCAST_OTHER + INST_PARAM_VECT_VISITOR; +#undef _brdcast_mask +#define _brdcast_mask BCAST_01 + INST_PARAM_VECT_VISITOR; +#undef _brdcast_mask +#define _brdcast_mask BCAST_10 + INST_PARAM_VECT_VISITOR; +#undef _brdcast_mask +#define _brdcast_mask BCAST_101 + INST_PARAM_VECT_VISITOR; +#undef _brdcast_mask +#define INST_DT_IBYTE(ctype) \ + template \ + class ParamVectVisitor \ + : public ParamElemVisitor { \ + public: \ + using Super = ParamElemVisitor; \ + using rwtype = typename VectTypeTrait::vect_type; \ + static const int packed_size = sizeof(rwtype) / sizeof(ctype); \ + DEVICE_WRAPPER(rwtype vect_scalar; \ + devfunc rwtype & at(uint32_t /* idx */) { \ + ctype v = Super::m_ptr[0]; \ + vect_scalar = VectTypeTrait::make_vector( \ + v, v, v, v); \ + return vect_scalar; \ + }) \ + } + INST_DT_IBYTE(dt_int8); + INST_DT_IBYTE(dt_uint8); + INST_DT_IBYTE(dt_qint8); + INST_DT_IBYTE(dt_quint8); +#undef INST_DT_IBYTE +#undef DEVICE_WRAPPER +#undef INST_PARAM_VECT_VISITOR + + /*! + * \brief specialization for ndim == 4 and BCAST_1010 + * + * visit: (idx % m_shape3) * m_stride3 + (idx / m_shape23 % m_shape1) * + * m_stride1 + */ + template + class ParamVectVisitor<4, ctype, BCAST_1010> { + StridedDivSeq2 m_shape123; + StridedDivSeq m_shape3; + int m_stride3, m_stride1; + ctype* __restrict m_ptr; + + public: + static const int NDIM = 4; + using rwtype = typename VectTypeTrait::vect_type; + static const int packed_size = sizeof(rwtype) / sizeof(ctype); + + void host_init(const TensorND& rv, int grid_size, int block_size); + +#if MEGDNN_CC_CUDA + devfunc void thread_init(uint32_t idx) { + m_shape123.device_init(idx); + m_shape3.device_init(idx); + } + + devfunc void next() { + m_shape123.next(); + m_shape3.next(); + } + + devfunc int offset(uint32_t idx) { + return m_shape3.r() * m_stride3 + m_shape123.get() * m_stride1; + } + + devfunc ctype* ptr() { return m_ptr; } + devfunc rwtype& at(uint32_t idx) { + return *(rwtype*)(&m_ptr[offset(idx)]); + } +#endif + }; + + /* f}}} */ + + +#if MEGDNN_CC_CUDA + + /* f{{{ user operator callers */ + + /* + * OpCaller is used to invoke user operator with loaded element arguments. + * + * device interface: + * void thread_init(uint32_t idx); + * + * void on(uint32_t idx); + * + * void next(); + */ + + /*! + * \brief call user op directly without visiting any params (i.e. 
arity == + * 0) + */ + template + struct OpCallerNull { + Op op; + + devfunc void thread_init(uint32_t) { + } + + devfunc void on(uint32_t idx) { + op(idx); + } + + devfunc void next() { + } + }; + + /*! + * \brief call an operator whose each param are promted to the same ndim and + * brdcast_mask + * \tparam PVis ParamElemVisitor class + */ + template + struct OpCallerUniform; + + //! specialization for arity == 1 + template + struct OpCallerUniform { + Op op; + PVis par[1]; + static const uint32_t packed_size = PVis::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + } + + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].at(idx)); + } + + devfunc void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, ptr0[par[0].offset(idx + i)]); + } + } + } + + devfunc void next() { + par[0].next(); + } + }; + //! specialization for arity == 2 + template + struct OpCallerUniform { + Op op; + PVis par[2]; + static const uint32_t packed_size = PVis::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + par[1].thread_init(idx); + } + + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].at(idx), par[1].at(idx)); + } + + devfunc void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].at(idx), par[1].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + auto ptr1 = par[1].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, ptr0[par[0].offset(idx + i)], + ptr1[par[1].offset(idx + i)]); + } + } + } + + devfunc void next() { + par[0].next(); + par[1].next(); + } + }; + //! specialization for arity == 3 + template + struct OpCallerUniform { + Op op; + PVis par[3]; + static const uint32_t packed_size = PVis::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + par[1].thread_init(idx); + par[2].thread_init(idx); + } + + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); + } + + devfunc void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + auto ptr1 = par[1].ptr(); + auto ptr2 = par[2].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, ptr0[par[0].offset(idx + i)], + ptr1[par[1].offset(idx + i)], + ptr2[par[2].offset(idx + i)]); + } + } + } + + devfunc void next() { + par[0].next(); + par[1].next(); + par[2].next(); + } + }; + + /*! + * \brief call binary (i.e. 
arity == 2) operator with different param + * visitors + */ + template + struct OpCallerBinary { + Op op; + PVis0 par0; + PVis1 par1; + MEGDNN_STATIC_ASSERT(PVis0::packed_size == PVis1::packed_size, + "vector size mismatch") + + static const uint32_t packed_size = PVis0::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par0.thread_init(idx); + par1.thread_init(idx); + } + + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par0.at(idx), par1.at(idx)); + } + + devfunc void next() { + par0.next(); + par1.next(); + } + }; + + /* f}}} */ + + template + __global__ void cuda_kern(OpCaller op_caller, uint32_t size) { + uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, + delta = blockDim.x * gridDim.x; + // each thread works on at most 3 elements; see get_launch_spec + op_caller.thread_init(idx); + if (idx < size) { + op_caller.on(idx); + idx += delta; + if (idx < size) { + op_caller.next(); + op_caller.on(idx); + idx += delta; + if (idx < size) { + op_caller.next(); + op_caller.on(idx); + } + } + } + } + + template + __global__ void cuda_kern(OpCallerUniform op_caller, + uint32_t size) { + constexpr uint32_t packed_size = PVis::packed_size; + const uint32_t size_packed = DIVUP(size, packed_size); + uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, + delta = blockDim.x * gridDim.x; + if (idx < size_packed) { + op_caller.on(idx, size - packed_size * idx); + idx += delta; + if (idx < size_packed) { + op_caller.on(idx, size - packed_size * idx); + idx += delta; + if (idx < size_packed) { + op_caller.on(idx, size - packed_size * idx); + } + } + } + } + + //! invoke a user Op passed to run_elemwise + template + class UserOpInvoker; + + /* f{{{ UserOpInvoker specializations */ + + //! run op by promoting all params to same ndim + template + class UserOpInvokerToSameNdim { + const ElemwiseOpParamN &m_param; + cudaStream_t m_stream; + const Op &m_op; + + void dispatch0() { + switch(m_param.max_ndim) { +#define cb(ndim) \ + case ndim: return dispatch1(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) +#undef cb + } + on_bad_ndim(m_param.max_ndim); + } + + template + void dispatch1() { + typedef OpCallerUniform< + Op, arity, + ParamElemVisitor> + Caller; + size_t size = m_param.size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, + &grid_size, &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } + + public: + UserOpInvokerToSameNdim(const ElemwiseOpParamN& param, + cudaStream_t stream, const Op& op) + : m_param(param), m_stream(stream), m_op(op) { + dispatch0(); + } + }; + + template + class UserOpInvokerToSameNdimIByteHelper { + public: + UserOpInvokerToSameNdimIByteHelper(const ElemwiseOpParamN& param, + cudaStream_t stream, const Op& op) + : m_rw_size(param.size), + m_param(param), + m_stream(stream), + m_op(op) { + if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { + dispatch0(); + } + } + + private: + const ElemwiseOpParamN& m_param; + size_t m_rw_size; + cudaStream_t m_stream; + const Op& m_op; + using vect_type = typename VectTypeTrait::vect_type; + static const size_t packed_size = VectTypeTrait::packed_size; + + void dispatch0() { + switch (m_param.max_ndim) { +#define cb(ndim) \ + case ndim: \ + return dispatch1(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) +#undef cb + } + on_bad_ndim(m_param.max_ndim); + } + + 
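    // Dispatch overview for these byte-sized ctypes: the constructor above
    // first tries try_vect_load_store_contiguous(), then try_vect_load_store(),
    // and only falls back to the scalar dispatch0() path when neither applies.
    //   * all inputs fully contiguous -> dispatch_contiguous(): the problem is
    //     treated as 1-D and read/written as packed vectors; packed_size is 4
    //     for the int8-like types (char4 / uchar4), so m_rw_size becomes
    //     DIVUP(param.size, 4);
    //   * every last dim contiguous and divisible by packed_size ->
    //     dispatch0_vect() below, with m_rw_size = param.size / 4;
    //   * anything else -> the plain element-wise dispatch0() above.
    // Worked example with a hypothetical shape: a contiguous int8 tensor of
    // shape {2, 3, 8} has 48 elements, so the vectorized kernel runs over
    // 48 / 4 = 12 packed positions.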
void dispatch0_vect() { + switch (m_param.max_ndim) { +#define cb(ndim) \ + case ndim: \ + return dispatch1_vect(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) +#undef cb + } + on_bad_ndim(m_param.max_ndim); + } + + void dispatch_contiguous() { + typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, + &grid_size, &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, + m_param.size); + after_kernel_launch(); + } + + template + void dispatch1() { + typedef ParamElemVisitor PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, + &grid_size, &block_size); + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } + + template + void dispatch1_vect() { + typedef ParamVectVisitor PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, + &grid_size, &block_size); + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } + + bool try_vect_load_store() { + auto try_last_contig = [](const TensorLayout& layout) { + return layout.stride[layout.ndim - 1] == 1 && + layout[layout.ndim - 1] % packed_size == 0; + }; + /* + * \NOTE: remove try_scalar() to adapt multi-type tenary op + */ + for (int i = 0; i < arity; ++i) { + if (!try_last_contig(m_param[i].layout)) return false; + } + m_rw_size /= packed_size; + dispatch0_vect(); + return true; + } + + bool try_vect_load_store_contiguous() { + auto try_contig = [](const TensorLayout& layout) { + return (layout.is_contiguous()); + }; + for (int i = 0; i < arity; ++i) { + if (!try_contig(m_param[i].layout)) + return false; + } + m_rw_size = DIVUP(m_rw_size, packed_size); + dispatch_contiguous(); + return true; + } + }; + +#define INST_DT_IBYTE(ctype) \ + template \ + class UserOpInvokerToSameNdim \ + : public UserOpInvokerToSameNdimIByteHelper { \ + using Super = UserOpInvokerToSameNdimIByteHelper; \ + \ + public: \ + UserOpInvokerToSameNdim(const ElemwiseOpParamN& param, \ + cudaStream_t stream, const Op& op) \ + : Super{param, stream, op} {} \ + } + INST_DT_IBYTE(dt_int8); + INST_DT_IBYTE(dt_uint8); + INST_DT_IBYTE(dt_qint8); + INST_DT_IBYTE(dt_quint8); +#undef INST_DT_IBYTE + + //! implement general case by UserOpInvokerToSameNdim + template + class UserOpInvoker: public UserOpInvokerToSameNdim { + public: + UserOpInvoker( + const ElemwiseOpParamN ¶m, + cudaStream_t stream, + const Op &op): + UserOpInvokerToSameNdim(param, stream, op) + { + } + }; + + //! 
specialization for arity == 0 + template + class UserOpInvoker { + public: + UserOpInvoker( + const ElemwiseOpParamN<0> ¶m, + cudaStream_t stream, + const Op &op) { + size_t size = param.size; + typedef OpCallerNull Caller; + Caller caller; + caller.op = op; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, + &grid_size, &block_size); + (*fptr) <<< grid_size, block_size, 0, stream >>> (caller, size); + after_kernel_launch(); + } + }; + +#define DEFINE_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, _stride) \ + _cb_header(1) { \ + const ptrdiff_t *stride = _stride; \ + if (!stride[0]) { \ + return _cb_dispatch(1, BCAST_FULL); \ + } \ + _cb_dispatch(1, BCAST_OTHER); \ + } \ + _cb_header(2) { \ + const ptrdiff_t *stride = _stride; \ + if (!stride[0] && stride[1]) { \ + return _cb_dispatch(2, BCAST_10); \ + } \ + if (stride[0] && !stride[1]) { \ + return _cb_dispatch(2, BCAST_01); \ + } \ + _cb_dispatch(2, BCAST_OTHER); \ + } \ + _cb_header(3) { \ + const ptrdiff_t *stride = _stride; \ + if (!stride[0] && stride[1] && !stride[2]) { \ + return _cb_dispatch(3, BCAST_101); \ + } \ + _cb_dispatch(3, BCAST_OTHER); \ + } + + //! specialization for binary opr + template + class UserOpInvoker { + bool m_invoked; + const ElemwiseOpParamN<2> &m_param; + cudaStream_t m_stream; + const Op &m_op; + + void fallback() { + megdnn_assert(!m_invoked); + UserOpInvokerToSameNdim(m_param, m_stream, m_op); + m_invoked = true; + } + + void dispatch0() { + switch(m_param[0].layout.ndim) { +#define cb(ndim) \ + case ndim: return dispatch1_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) +#undef cb + } + fallback(); + } + +#define cb_header(ndim) void dispatch1_##ndim() +#define cb_dispatch(ndim, brdcast_mask) \ + dispatch2 >() +DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[0].layout.stride) +#undef cb_header +#undef cb_dispatch + + + template + void dispatch2() { + switch(m_param[1].layout.ndim) { +#define cb(ndim) \ + case ndim: return dispatch3_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) +#undef cb + } + fallback(); + } + +#define cb_header(ndim) \ + template \ + void dispatch3_##ndim() +#define cb_dispatch(ndim, brdcast_mask) \ + do_run >() +DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[1].layout.stride) +#undef cb_header +#undef cb_dispatch + + template + void do_run() { + megdnn_assert(!m_invoked); + m_invoked = true; + typedef OpCallerBinary Caller; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + size_t size = m_param.size; + get_launch_spec(reinterpret_cast(fptr), + size, &grid_size, &block_size); + Caller caller; + caller.op = m_op; + caller.par0.host_init(m_param[0], grid_size, block_size); + caller.par1.host_init(m_param[1], grid_size, block_size); + (*fptr) <<< grid_size, block_size, 0, m_stream >>> (caller, size); + after_kernel_launch(); + } + + public: + UserOpInvoker(const ElemwiseOpParamN<2> ¶m, cudaStream_t stream, + const Op &op): + m_param(param), m_stream(stream), m_op(op) + { + m_invoked = false; + dispatch0(); + megdnn_assert(m_invoked); + } + }; + +#define DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, \ + _stride) \ + DEFINE_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, _stride) \ + _cb_header(4) { \ + const ptrdiff_t* stride = _stride; \ + if (!stride[0] && stride[1] && !stride[2] && stride[3]) { \ + return _cb_dispatch(4, BCAST_1010); \ + } \ + _cb_dispatch(4, BCAST_OTHER); \ + } + + template + class 
UserOpInvokerBinaryIByteHelper { + private: + bool m_invoked; + size_t m_rw_size; + const ElemwiseOpParamN<2>& m_param; + cudaStream_t m_stream; + const Op& m_op; + using vect_type = typename VectTypeTrait::vect_type; + static const size_t packed_size = VectTypeTrait::packed_size; + bool try_vect_load_store() { + auto try_last_contig_or_scalar = [](const TensorLayout& layout) { + return (layout.stride[layout.ndim - 1] == 1 && + layout[layout.ndim - 1] % packed_size == 0) || + (layout.ndim == 1 && layout.stride[0] == 0); + }; + for (int i = 0; i < 2; ++i) { + if (!try_last_contig_or_scalar(m_param[i].layout)) + return false; + } + m_rw_size /= packed_size; + dispatch0_vect(); + return true; + } + + bool try_vect_load_store_contiguous() { + auto try_contig = [](const TensorLayout& layout) { + return (layout.is_contiguous()); + }; + for (int i = 0; i < 2; ++i) { + if (!try_contig(m_param[i].layout)) + return false; + } + m_rw_size = DIVUP(m_rw_size, packed_size); + dispatch_contiguous(); + return true; + } + + void dispatch0() { + switch (m_param[0].layout.ndim) { +#define cb(ndim) \ + case ndim: \ + return dispatch1_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) +#undef cb + } + fallback(); + } + + void dispatch0_vect() { + switch (m_param[0].layout.ndim) { +#define cb(ndim) \ + case ndim: \ + return dispatch1_vect_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) +#undef cb + case 4: + return dispatch1_vect_4(); + } + fallback(); + } + + void dispatch_contiguous() { + m_invoked = true; + typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, + &grid_size, &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < 2; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, + m_param.size); + after_kernel_launch(); + } + + void fallback() { + megdnn_assert(!m_invoked); + UserOpInvokerToSameNdim(m_param, m_stream, m_op); + m_invoked = true; + } + +#define cb_header(ndim) void dispatch1_##ndim() +#define cb_dispatch(ndim, brdcast_mask) \ + dispatch2>() + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[0].layout.stride) +#undef cb_header +#undef cb_dispatch + +#define cb_header(ndim) void dispatch1_vect_##ndim() +#define cb_dispatch(ndim, brdcast_mask) \ + dispatch2_vect>() + DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[0].layout.stride) +#undef cb_header +#undef cb_dispatch + + template + void dispatch2() { + switch (m_param[1].layout.ndim) { +#define cb(ndim) \ + case ndim: \ + return dispatch3_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) +#undef cb + } + fallback(); + } + + template + void dispatch2_vect() { + switch (m_param[1].layout.ndim) { +#define cb(ndim) \ + case ndim: \ + return dispatch3_vect_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) +#undef cb + case 4: + return dispatch3_vect_4(); + } + fallback(); + } + +#define cb_header(ndim) \ + template \ + void dispatch3_##ndim() +#define cb_dispatch(ndim, brdcast_mask) \ + do_run>() + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[1].layout.stride) +#undef cb_header +#undef cb_dispatch + +#define cb_header(ndim) \ + template \ + void dispatch3_vect_##ndim() +#define cb_dispatch(ndim, brdcast_mask) \ + do_run>() + DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[1].layout.stride) +#undef cb_header +#undef 
cb_dispatch + + template + void do_run() { + megdnn_assert(!m_invoked); + m_invoked = true; + typedef OpCallerBinary Caller; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + size_t size = m_rw_size; + get_launch_spec(reinterpret_cast(fptr), size, + &grid_size, &block_size); + Caller caller; + caller.op = m_op; + caller.par0.host_init(m_param[0], grid_size, block_size); + caller.par1.host_init(m_param[1], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } + + public: + UserOpInvokerBinaryIByteHelper(const ElemwiseOpParamN<2>& param, + cudaStream_t stream, const Op& op) + : m_rw_size(param.size), + m_param(param), + m_stream(stream), + m_op(op) { + m_invoked = false; + if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { + dispatch0(); + } + megdnn_assert(m_invoked); + } + }; + +#define INST_DT_IBYTE(ctype) \ + template \ + class UserOpInvoker \ + : public UserOpInvokerBinaryIByteHelper { \ + using Super = UserOpInvokerBinaryIByteHelper; \ + \ + public: \ + UserOpInvoker(const ElemwiseOpParamN<2>& param, cudaStream_t stream, \ + const Op& op) \ + : Super{param, stream, op} {} \ + } + INST_DT_IBYTE(dt_int8); + INST_DT_IBYTE(dt_uint8); + INST_DT_IBYTE(dt_qint8); + INST_DT_IBYTE(dt_quint8); +#undef INST_DT_IBYTE +#endif + +#undef DEFINE_BRDCAST_DISPATCH_RECEIVERS +#undef DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS + + /* f}}} */ + +#undef devfunc +} // namespace elemwise_intl + +/*! + * \brief general element-wise kernel launcher + * + * \tparam arity number of params for the operator + * \param param param values for the operator; must have been initialized (i.e. + * by calling ElemwiseOpParamN::init_from_given_tensor). The params + * can have arbitrary layouts, as long as they share the same total number + * of elements. + * \param op callable with a signature compatible with + * `void op(uint32_t idx, ctype& param0, ..., ctype& param[arity - 1])` + * if arity == 0, there is only an `idx` input + * if ctype=dt_int8, dt_uint8, dt_qint8, dt_quint8, a signature compatible + * with `void op(uint32_t idx, vect_type& param0, ..., ctype& param[arity - 1])` + * should be implemented + */ +template +void run_elemwise(const ElemwiseOpParamN& param, cudaStream_t stream, + const Op& op = Op()); + +#if MEGDNN_CC_CUDA +template +void run_elemwise( + const ElemwiseOpParamN ¶m, cudaStream_t stream, + const Op &op) { + param.assert_initialized(); + elemwise_intl::UserOpInvoker(param, stream, op); +} + +/*! + * \brief explicit instantialization of run_elemwise for given template params; + * used in .cu files, so corresponding run_elemwise can be called from .cpp + */ +#define INST_RUN_ELEMWISE(Op, ctype, arity) \ +template void run_elemwise( \ + const ElemwiseOpParamN&, cudaStream_t, const Op&) + +#endif + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} + + diff --git a/dnn/src/cuda/elemwise_multi_type/kern.cu b/dnn/src/cuda/elemwise_multi_type/kern.cu new file mode 100644 index 00000000..b6e9b11e --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kern.cu @@ -0,0 +1,105 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kern.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
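A minimal usage sketch for the run_elemwise contract documented above; MulAddOneOp is a hypothetical functor, and the explicit template-argument order is assumed to mirror INST_RUN_ELEMWISE(Op, ctype, arity).

// hypothetical binary op: writes param0 = param0 * param1 + 1 for each element
struct MulAddOneOp {
    __device__ __forceinline__ void operator()(uint32_t /*idx*/, dt_float32& a,
                                               dt_float32& b) {
        a = a * b + 1.f;
    }
};

// in a .cu file (sketch):
//     ElemwiseOpParamN<2> param;   // filled from the two tensors and
//                                  // initialized via init_from_given_tensor
//     run_elemwise<MulAddOneOp, dt_float32, 2>(param, stream);
//     INST_RUN_ELEMWISE(MulAddOneOp, dt_float32, 2);  // lets .cpp code call it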
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/elemwise_helper.cuh" +#include "src/cuda/elemwise_multi_type/kern.cuh" +#include "src/cuda/elemwise_multi_type/kern_ops.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace elemwise_multi_type; +using namespace elemwise_intl; +using namespace kern_ops; + +void elemwise_multi_type::fma3_int16x32x32x32_1c1( + const ElemwiseOpParamN<3>& param, dt_int32* dst, cudaStream_t stream) { + typedef Fma3Int16x32x32x32Bcast101Op Caller; + void (*fptr)(Caller, uint32_t) = cuda_kern; + int grid_size, block_size; + get_launch_spec(reinterpret_cast(fptr), param.size, &grid_size, + &block_size); + + Caller caller; + caller.a.host_init(param[0], grid_size, block_size); + caller.b.host_init(param[1], grid_size, block_size); + caller.c.host_init(param[2], grid_size, block_size); + caller.dst = dst; + + (*fptr)<<>>(caller, param.size); + after_kernel_launch(); +} + +template +void elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar( + const ElemwiseOpParamN<2>& param, dst_type* dst, cudaStream_t stream) { + typedef RoundShrSaturateIXxBcastScalarOp Caller; + void (*fptr)(Caller, uint32_t) = cuda_kern; + int grid_size, block_size; + get_launch_spec(reinterpret_cast(fptr), param.size, &grid_size, + &block_size); + + Caller caller; + caller.a.host_init(param[0], grid_size, block_size); + caller.b.host_init(param[1], grid_size, block_size); + caller.dst = dst; + + (*fptr)<<>>(caller, param.size); + after_kernel_launch(); +} + +#define INST(stype) \ + template void \ + elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar( \ + const ElemwiseOpParamN<2>& param, dt_int8*, cudaStream_t) +INST(int32_t); +INST(int16_t); +INST(int8_t); +#undef INST + +#define INST(stype) \ + template void \ + elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar( \ + const ElemwiseOpParamN<2>& param, dt_int16*, cudaStream_t) +INST(int32_t); +INST(int16_t); +#undef INST + +template +void elemwise_multi_type::fuse_add_rmulh_round_shr_saturate_bcast_1c11( + const ElemwiseOpParamN<6>& param, dt_int8* dst, cudaStream_t stream) { + typedef FuseAddRmulhRoundingShrBcastScalarOp Caller; + void (*fptr)(Caller, uint32_t) = cuda_kern; + int grid_size, block_size; + get_launch_spec(reinterpret_cast(fptr), param.size, &grid_size, + &block_size); + + Caller caller; + caller.x.host_init(param[0], grid_size, block_size); + caller.b.host_init(param[1], grid_size, block_size); + caller.M.host_init(param[2], grid_size, block_size); + caller.k.host_init(param[3], grid_size, block_size); + caller.minv.host_init(param[4], grid_size, block_size); + caller.maxv.host_init(param[5], grid_size, block_size); + caller.dst = dst; + + (*fptr)<<>>(caller, param.size); + after_kernel_launch(); +} + +#define INST(stype) \ + template void \ + elemwise_multi_type::fuse_add_rmulh_round_shr_saturate_bcast_1c11( \ + const ElemwiseOpParamN<6>& param, dt_int8*, cudaStream_t) +INST(int32_t); +INST(int16_t); +#undef INST + +// vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/elemwise_multi_type/kern.cuh b/dnn/src/cuda/elemwise_multi_type/kern.cuh new file mode 100644 index 00000000..32094644 --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kern.cuh @@ -0,0 +1,43 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the 
"License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "include/megdnn/thin/small_vector.h" +#include "src/common/elemwise_helper.cuh" +#include "src/cuda/utils.cuh" +#include "src/common/elemwise/kern_defs.cuh" + +namespace megdnn { +namespace cuda { +namespace elemwise_multi_type { +//! a * b + c, where a is [s0, s1, s2] and b, c both [1, s1, 1] +void fma3_int16x32x32x32_1c1(const ElemwiseOpParamN<3>& param, dt_int32* dst, + cudaStream_t stream); + +//! a * b + c, where a is [m, n] and b, c both [1, n]; m can be 1 +template +void fma3_iXxf32xf32xi8_bcast_1x(const stype* a, const float* b, const float* c, + dt_int8* dst, uint32_t m, uint32_t n, + cudaStream_t stream); + +template +void round_shr_saturate_iXxi8xiX_scalar(const ElemwiseOpParamN<2>& param, + dst_ctype* dst, cudaStream_t stream); + +template +void fuse_add_rmulh_round_shr_saturate_bcast_1c11( + const ElemwiseOpParamN<6>& param, dt_int8* dst, cudaStream_t stream); + +} // namespace elemwise_multi_type +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/elemwise_multi_type/kern_iXxf32xf32xi8.cu b/dnn/src/cuda/elemwise_multi_type/kern_iXxf32xf32xi8.cu new file mode 100644 index 00000000..4f503d81 --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kern_iXxf32xf32xi8.cu @@ -0,0 +1,129 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kern_iXxf32xf32xi8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./kern.cuh" + +#include "megdnn/dtype.h" +#include "src/common/elemwise_multi_type/kern_defs.cuh" +#include "src/cuda/utils.cuh" + +using namespace megdnn; + +namespace { + +template +struct __builtin_align__(sizeof(T) * 4) Packed4 { + T v[4]; +}; + +template +__global__ void kern_1d(const stype* x, const float* k, const float* b, + dtype* y, uint32_t n) { + elemwise_multi_type::Fma3iXxf32xf32xiYOp op; + uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < n) { + y[i] = op(x[i], k[i], b[i]); + } +} + +template +void invoke_kern_1d(const stype* x, const float* k, const float* b, dtype* y, + uint32_t n, cudaStream_t stream) { + dim3 threads = NR_THREADS; + dim3 blocks = DIVUP(n, NR_THREADS); + kern_1d<<>>(x, k, b, y, n); + after_kernel_launch(); +} + +template +__global__ void kern_2d_fallback(const stype* x, const float* k, const float* b, + dtype* y, uint32_t m, uint32_t n) { + uint32_t i = threadIdx.y + blockIdx.y * blockDim.y; + uint32_t j = threadIdx.x + blockIdx.x * blockDim.x; + elemwise_multi_type::Fma3iXxf32xf32xiYOp op; + if (i < m && j < n) { + y[i * n + j] = op(x[i * n + j], k[j], b[j]); + } +} + +template +__global__ void kern_2d_mul4(const stype* __restrict x, + const float* __restrict k, + const float* __restrict b, dtype* y_, uint32_t m, + uint32_t n) { + uint32_t i = threadIdx.y + blockIdx.y * blockDim.y; + uint32_t j = threadIdx.x + blockIdx.x * blockDim.x; + elemwise_multi_type::Fma3iXxf32xf32xiYOp op; + Packed4* __restrict__ y = (Packed4*)y_; + if (i < m && j < n) { + stype x0 = x[(i * n + j) * 4 + 0]; + stype x1 = x[(i * n + j) * 4 + 1]; + stype x2 = x[(i * n + j) * 4 + 2]; + stype x3 = x[(i * n + j) * 4 + 3]; + float k0 = k[j * 4 + 0]; + float k1 = k[j * 4 + 1]; + float k2 = k[j * 4 + 2]; + float k3 = k[j * 4 + 3]; + float b0 = b[j * 4 + 0]; + float b1 = b[j * 4 + 1]; + float b2 = b[j * 4 + 2]; + float b3 = b[j * 4 + 3]; + Packed4 pack; + pack.v[0] = op(x0, k0, b0); + pack.v[1] = op(x1, k1, b1); + pack.v[2] = op(x2, k2, b2); + pack.v[3] = op(x3, k3, b3); + y[i * n + j] = pack; + } +} + +template +void invoke_kern_2d(const stype* x, const float* k, const float* b, dtype* y, + uint32_t m, uint32_t n, cudaStream_t stream) { + if (n % 4 == 0 && is_same::value) { + dim3 threads(NR_THREADS_X, NR_THREADS_Y); + dim3 blocks(DIVUP(n / 4, NR_THREADS_X), DIVUP(m, NR_THREADS_Y)); + // each thread process 4 elems + // template to avoid compile error + kern_2d_mul4 + <<>>(x, k, b, y, m, n / 4); + } else { + dim3 threads(NR_THREADS_X, NR_THREADS_Y); + dim3 blocks(DIVUP(n, NR_THREADS_X), DIVUP(m, NR_THREADS_Y)); + kern_2d_fallback + <<>>(x, k, b, y, m, n); + after_kernel_launch(); + } +} + +} // anonymous namespace + +using namespace megdnn; + +template +void cuda::elemwise_multi_type::fma3_iXxf32xf32xi8_bcast_1x( + const stype* x, const float* k, const float* b, dt_int8* y, uint32_t m, + uint32_t n, cudaStream_t stream) { + if (m == 1) { + invoke_kern_1d(x, k, b, y, n, stream); + } else { + invoke_kern_2d(x, k, b, y, m, n, stream); + } +} + +#define INST(stype) \ + template void \ + cuda::elemwise_multi_type::fma3_iXxf32xf32xi8_bcast_1x( \ + const stype*, const float*, const float*, dt_int8*, uint32_t, \ + uint32_t, cudaStream_t) +#define cb(t) INST(DTypeTrait::ctype); +MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) +#undef cb +#undef INST diff --git a/dnn/src/cuda/elemwise_multi_type/kern_impl.inl b/dnn/src/cuda/elemwise_multi_type/kern_impl.inl new file mode 100644 index 00000000..23553a4d --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kern_impl.inl @@ -0,0 
+1,37 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kern_impl.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#ifndef KERN_IMPL_MODE +#error "KERN_IMPL_MODE, KERN_IMPL_ARITY, KERN_IMPL_STYPE, KERN_IMPL_DTYPE must be defined" +#endif + +#include "src/cuda/elemwise_multi_type/kern_ops.cuh" + +namespace megdnn { +namespace cuda { + +#define cb(_m) \ + typedef ElemwiseKern \ + KernImpl; \ + typedef kern_ops_quantized::QuantizedMultiTypeOp< \ + KERN_IMPL_ARITY, KERN_IMPL_STYPE, KERN_IMPL_DTYPE, KernImpl> \ + Op; \ + INST_RUN_ELEMWISE(Op, KERN_IMPL_STYPE, KERN_IMPL_ARITY); + +KERN_IMPL_MODE(cb) + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh b/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh new file mode 100644 index 00000000..bd5fd62a --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh @@ -0,0 +1,285 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kern_ops.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once +#include "src/cuda/elemwise_helper.cuh" +#include "src/cuda/elemwise_multi_type/kern.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +using namespace elemwise_intl; + +namespace kern_ops { + +//! a * b + c, where a is [x, y, z] and b, c both [1, y, 1] +struct Fma3Int16x32x32x32Bcast101Op { + ParamElemVisitor<1, dt_int16, BCAST_OTHER> a; + ParamElemVisitor<3, dt_int32, BCAST_101> b, c; + + dt_int32* dst; + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ void thread_init(uint32_t idx) { + a.thread_init(idx); + b.thread_init(idx); + c.thread_init(idx); + } + + __device__ __forceinline__ void on(uint32_t idx) { + dst[idx] = a.at(idx) * b.at(idx) + c.at(idx); + } + + __device__ __forceinline__ void next() { + a.next(); + b.next(); + c.next(); + } +#endif +}; + +template +struct RoundShrSaturateIXxBcastScalarOp { + ParamElemVisitor<1, stype, BCAST_OTHER> a; + ParamElemVisitor<1, dt_int8, BCAST_FULL> b; + + dst_type* dst; + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ void thread_init(uint32_t idx) { + a.thread_init(idx); + b.thread_init(idx); + } + + __device__ __forceinline__ void on(uint32_t idx) { + stype result = + rounding_shift_right_away_from_zero(a.at(idx), b.at(idx)); + result = result < INT8_MAX ? result : INT8_MAX; + result = result > INT8_MIN ? 
result : INT8_MIN; + dst[idx] = static_cast(result); + } + + __device__ __forceinline__ void next() { + a.next(); + b.next(); + } +#endif +}; + +template +struct FuseAddRmulhRoundingShrBcastScalarOp { + ParamElemVisitor<1, stype, BCAST_OTHER> x; + ParamElemVisitor<3, stype, BCAST_101> b; + ParamElemVisitor<1, stype, BCAST_FULL> M; + ParamElemVisitor<1, dt_int8, BCAST_FULL> k; + ParamElemVisitor<1, dt_int8, BCAST_FULL> minv; + ParamElemVisitor<1, dt_int8, BCAST_FULL> maxv; + + dt_int8* dst; + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ void thread_init(uint32_t idx) { + x.thread_init(idx); + b.thread_init(idx); + M.thread_init(idx); + k.thread_init(idx); + minv.thread_init(idx); + maxv.thread_init(idx); + } + + __device__ __forceinline__ void on(uint32_t idx) { + stype result = rounding_shift_right_away_from_zero( + round_mulh_saturate(x.at(idx) + b.at(idx), M.at(idx)), + k.at(idx)); + stype lminv = minv.at(idx); + stype lmaxv = maxv.at(idx); + result = lminv < result ? result : lminv; + result = result < lmaxv ? result : lmaxv; + dst[idx] = static_cast(result); + } + + __device__ __forceinline__ void next() { + x.next(); + b.next(); + } +#endif +}; +} // namespace kern_ops + +#ifndef MEGDNN_ELEMWISE_MODE_ENABLE +#define MEGDNN_ELEMWISE_MODE_ENABLE(_mode, _cb) _cb(_mode) +#endif + +namespace kern_ops_quantized { + +template +struct QuantizedMultiTypeOp; + +template +struct QuantizedMultiTypeOp< + 1, ctype_src, ctype_dst, KernImpl, + typename std::enable_if< + std::is_same::value || + std::is_same::value || + std::is_same::value>::type> { + ctype_dst* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a; + typedef typename elemwise_intl::VectTypeTrait::vect_type + src_vect_type; + typedef typename elemwise_intl::VectTypeTrait::vect_type + dst_vect_type; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + ctype_dst* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ ctype_dst apply(ctype_src v1) { + float fv1 = param_a.dequantize(v1); + float rv = KernImpl::apply(fv1); + return dst_param.quantize(rv); + } + + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src a) { + dst[idx] = dst_param.quantize(KernImpl::apply(param_a.dequantize(a))); + } + + __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a) { + ctype_src a_x(a.x), a_y(a.y), a_z(a.z), a_w(a.w); + ctype_dst x = apply(a_x), y = apply(a_y), z = apply(a_z), + w = apply(a_w); + *(dst_vect_type*)(&dst[idx]) = + elemwise_intl::VectTypeTrait::make_vector(x, y, z, + w); + } +#endif +}; + +template +struct QuantizedMultiTypeOp< + 2, ctype_src, ctype_dst, KernImpl, + typename std::enable_if< + std::is_same::value || + std::is_same::value || + std::is_same::value>::type> { + ctype_dst* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a, param_b; + typedef typename elemwise_intl::VectTypeTrait::vect_type + src_vect_type; + typedef typename elemwise_intl::VectTypeTrait::vect_type + dst_vect_type; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + ctype_dst* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + param_b = src_params[1]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ ctype_dst apply(ctype_src v1, ctype_src v2) { + float fv1 = param_a.dequantize(v1), fv2 = param_b.dequantize(v2); + float rv = KernImpl::apply(fv1, fv2); + return 
dst_param.quantize(rv); + } + + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src a, + ctype_src b) { + dst[idx] = dst_param.quantize( + KernImpl::apply(param_a.dequantize(a), param_b.dequantize(b))); + } + + __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, + src_vect_type b) { + ctype_src a_x(a.x), a_y(a.y), a_z(a.z), a_w(a.w), b_x(b.x), b_y(b.y), + b_z(b.z), b_w(b.w); + ctype_dst x = apply(a_x, b_x), y = apply(a_y, b_y), z = apply(a_z, b_z), + w = apply(a_w, b_w); + *(dst_vect_type*)(&dst[idx]) = + elemwise_intl::VectTypeTrait::make_vector(x, y, z, + w); + } +#endif +}; + +template +struct QuantizedMultiTypeOp< + 3, ctype_src, ctype_dst, KernImpl, + typename std::enable_if< + std::is_same::value || + std::is_same::value || + std::is_same::value>::type> { + ctype_dst* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a, param_b, param_c; + typedef typename elemwise_intl::VectTypeTrait::vect_type + src_vect_type; + typedef typename elemwise_intl::VectTypeTrait::vect_type + dst_vect_type; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + ctype_dst* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + param_b = src_params[1]; + param_c = src_params[2]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ ctype_dst apply(ctype_src v1, ctype_src v2, + ctype_src v3) { + float fv1 = param_a.dequantize(v1), fv2 = param_b.dequantize(v2), + fv3 = param_c.dequantize(v3); + float rv = KernImpl::apply(fv1, fv2, fv3); + return dst_param.quantize(rv); + } + + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src a, + ctype_src b, ctype_src c) { + dst[idx] = dst_param.quantize(KernImpl::apply(param_a.dequantize(a), + param_b.dequantize(b), + param_c.dequantize(c))); + } + + __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, + src_vect_type b, + src_vect_type c) { + ctype_src a_x(a.x), a_y(a.y), a_z(a.z), a_w(a.w), b_x(b.x), b_y(b.y), + b_z(b.z), b_w(b.w), c_x(c.x), c_y(c.y), c_z(c.z), c_w(c.w); + ctype_dst x = apply(a_x, b_x, c_x), y = apply(a_y, b_y, c_y), + z = apply(a_z, b_z, c_z), w = apply(a_w, b_w, c_w); + *(dst_vect_type*)(&dst[idx]) = + elemwise_intl::VectTypeTrait::make_vector(x, y, z, + w); + } +#endif +}; + +} // namespace kern_ops_quantized + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_GRAD_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_GRAD_dt_qint8_dt_qint8.cu new file mode 100644 index 00000000..f9ac3e13 --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_GRAD_dt_qint8_dt_qint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/ABS_GRAD_dt_qint8_dt_qint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
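A worked example of the dequantize -> float kernel -> quantize flow implemented by QuantizedMultiTypeOp above, with made-up quantization scales:

// hypothetical arity-2 ADD, dt_qint8 inputs and output:
//   param_a scale = 0.1f, param_b scale = 0.2f, dst scale = 0.25f
//   stored inputs a = 20, b = 10
//   dequantize:  20 * 0.1f = 2.0f,   10 * 0.2f = 2.0f
//   KernImpl::apply(2.0f, 2.0f) = 4.0f
//   quantize:    round(4.0f / 0.25f) = 16   -> stored dt_qint8 result
// The src_vect_type overload performs the same computation on four packed
// lanes (char4) at a time.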
+ */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_STYPE dt_qint8 +#define KERN_IMPL_DTYPE dt_qint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_qint8_dt_qint8.cu new file mode 100644 index 00000000..f591e9dc --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_qint8_dt_qint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_qint8_dt_qint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_STYPE dt_qint8 +#define KERN_IMPL_DTYPE dt_qint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ACOS_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ACOS_dt_qint8_dt_qint8.cu new file mode 100644 index 00000000..c8217765 --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ACOS_dt_qint8_dt_qint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/ACOS_dt_qint8_dt_qint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_STYPE dt_qint8 +#define KERN_IMPL_DTYPE dt_qint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_qint8.cu new file mode 100644 index 00000000..b4f2d5e0 --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_qint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_qint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_STYPE dt_qint32 +#define KERN_IMPL_DTYPE dt_qint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint8_dt_qint32.cu new file mode 100644 index 00000000..a73a3406 --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint8_dt_qint32.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint8_dt_qint32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_STYPE dt_qint8 +#define KERN_IMPL_DTYPE dt_qint32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint8_dt_qint8.cu new file mode 100644 index 00000000..af7e9383 --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint8_dt_qint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint8_dt_qint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_STYPE dt_qint8 +#define KERN_IMPL_DTYPE dt_qint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ASIN_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ASIN_dt_qint8_dt_qint8.cu new file mode 100644 index 00000000..596c89e0 --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ASIN_dt_qint8_dt_qint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/ASIN_dt_qint8_dt_qint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
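For orientation, roughly what one of these generated translation units (ADD_dt_qint8_dt_qint8.cu above) amounts to once kern_impl.inl applies its cb() macro; KernImpl stands for the float ADD kernel selected through MEGDNN_ELEMWISE_MODE_ENABLE, and its full template arguments are left out of this sketch.

// approximate expansion (illustrative):
//   typedef kern_ops_quantized::QuantizedMultiTypeOp<
//           2, dt_qint8, dt_qint8, KernImpl> Op;
//   INST_RUN_ELEMWISE(Op, dt_qint8, 2);  // explicit instantiation of
//                                        // run_elemwise for this op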
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ATAN2_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ATAN2_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..eb88c18e
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ATAN2_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/ATAN2_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..db1fa329
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..d17655c6
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb)
+#define KERN_IMPL_ARITY 3
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/COS_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/COS_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..c9337062
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/COS_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/COS_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..933e121b
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ERFCINV_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ERFCINV_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..966878ac
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ERFCINV_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/ERFCINV_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ERFC_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ERFC_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..c0184be1
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ERFC_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/ERFC_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFC, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ERFINV_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ERFINV_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..52394e1f
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ERFINV_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/ERFINV_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFINV, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ERF_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ERF_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..ae746f46
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ERF_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/ERF_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERF, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/EXPM1_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/EXPM1_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..707800f2
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/EXPM1_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/EXPM1_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXPM1, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/EXP_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/EXP_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..7827e97d
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/EXP_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/EXP_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXP, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_GRAD_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_GRAD_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..48bdc3ad
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_GRAD_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_GRAD_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH_GRAD, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_qint8.cu
new file mode 100644
index 00000000..14cb7067
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint32
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint8_dt_qint32.cu
new file mode 100644
index 00000000..fffe5efb
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint8_dt_qint32.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint8_dt_qint32.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint32
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..b5e25b6c
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_DIV_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_DIV_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..ecb2df4c
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_DIV_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_DIV_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..d6e6e3e4
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_qint8.cu
new file mode 100644
index 00000000..836b814e
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint32
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint8_dt_qint32.cu
new file mode 100644
index 00000000..e50c6b18
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint8_dt_qint32.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint8_dt_qint32.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint32
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..7d0f6775
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_qint8.cu
new file mode 100644
index 00000000..56fdddf3
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint32
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint8_dt_qint32.cu
new file mode 100644
index 00000000..78be51a0
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint8_dt_qint32.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint8_dt_qint32.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint32
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..4a0a4394
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_qint8.cu
new file mode 100644
index 00000000..007a2a77
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint32
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint8_dt_qint32.cu
new file mode 100644
index 00000000..a6234355
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint8_dt_qint32.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint8_dt_qint32.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint32
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..6d5ce87e
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_qint8.cu
new file mode 100644
index 00000000..59d6d3f5
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint32
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint8_dt_qint32.cu
new file mode 100644
index 00000000..30b258bd
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint8_dt_qint32.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint8_dt_qint32.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint32
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..da877e26
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..3ecaab9c
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_MUL_ADD3, cb)
+#define KERN_IMPL_ARITY 3
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_GRAD_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_GRAD_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..91df6e84
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_GRAD_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_GRAD_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH_GRAD, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_qint8.cu
new file mode 100644
index 00000000..f2a4560d
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint32
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint8_dt_qint32.cu
new file mode 100644
index 00000000..bbe79fbb
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint8_dt_qint32.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint8_dt_qint32.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint32
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..785e1c86
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..65c4622f
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/LOG1P_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/LOG1P_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..d6547094
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/LOG1P_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/LOG1P_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG1P, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/LOG_SUM_EXP_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/LOG_SUM_EXP_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..bcc5e12b
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/LOG_SUM_EXP_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/LOG_SUM_EXP_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG_SUM_EXP, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/LOG_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/LOG_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..0e04b0ba
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/LOG_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/LOG_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..f2c63bf7
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..964a96dc
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..434ff151
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MOD_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MOD_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..60f1caf7
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/MOD_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/MOD_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..28f5a50f
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..75e95afb
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/POW_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/POW_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..aafeb2ae
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/POW_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/POW_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(POW, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_qint8.cu
new file mode 100644
index 00000000..30d9e12f
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint32
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint8_dt_qint32.cu
new file mode 100644
index 00000000..819b3a49
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint8_dt_qint32.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint8_dt_qint32.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint32
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..3f2302fd
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..21338e97
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ROUND, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_GRAD_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_GRAD_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..d3310780
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_GRAD_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_GRAD_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_qint8.cu
new file mode 100644
index 00000000..4263b17c
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint32
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint8_dt_qint32.cu
new file mode 100644
index 00000000..073beb3e
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint8_dt_qint32.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint8_dt_qint32.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint32
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..d87c009c
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIN_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIN_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..10497c4f
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/SIN_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/SIN_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIN, cb)
+#define KERN_IMPL_ARITY 1
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..2292d5d1
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..e3b3dd2b
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+// generated by gen_elemwise_multi_type_kern_impls.py
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb)
+#define KERN_IMPL_ARITY 2
+#define KERN_IMPL_STYPE dt_qint8
+#define KERN_IMPL_DTYPE dt_qint8
+#include "../kern_impl.inl"
diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_GRAD_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_GRAD_dt_qint8_dt_qint8.cu
new file mode 100644
index 00000000..2170e7ce
--- /dev/null
+++ b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_GRAD_dt_qint8_dt_qint8.cu
@@ -0,0 +1,16 @@
+/**
+ * \file dnn/src/cuda/elemwise_multi_type/kimpl/TANH_GRAD_dt_qint8_dt_qint8.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_STYPE dt_qint8 +#define KERN_IMPL_DTYPE dt_qint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_qint8.cu new file mode 100644 index 00000000..dae0822c --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_qint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_qint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_STYPE dt_qint32 +#define KERN_IMPL_DTYPE dt_qint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint8_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint8_dt_qint32.cu new file mode 100644 index 00000000..89d7333d --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint8_dt_qint32.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint8_dt_qint32.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_STYPE dt_qint8 +#define KERN_IMPL_DTYPE dt_qint32 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint8_dt_qint8.cu new file mode 100644 index 00000000..1dbba144 --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint8_dt_qint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint8_dt_qint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
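The generated kimpl files above all reduce to the same four #define lines followed by an #include of ../kern_impl.inl, so every (mode, arity, source dtype, destination dtype) combination becomes its own translation unit and the large family of quantized elementwise kernels can be compiled in parallel. The following is a minimal, self-contained CUDA sketch of that include-driven instantiation pattern; the sigmoid_kern body, the launch_sigmoid launcher and the simplified macros are illustrative assumptions, not the actual contents of kern_impl.inl.

// Hypothetical sketch of what one generated file ("SIGMOID_float_float.cu") boils
// down to.  The #defines are what the generator emits; everything below the
// marker plays the role of the shared "../kern_impl.inl" that each generated
// .cu file textually includes.
#include <cuda_runtime.h>

#define KERN_IMPL_STYPE float   // source element type for this translation unit
#define KERN_IMPL_DTYPE float   // destination element type

// ---- stand-in for the contents of "../kern_impl.inl" ----
template <typename SrcT, typename DstT>
__global__ void sigmoid_kern(const SrcT* src, DstT* dst, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        float x = static_cast<float>(src[i]);
        dst[i] = static_cast<DstT>(1.f / (1.f + __expf(-x)));
    }
}

// Exactly one concrete launcher per generated translation unit, so the many
// (mode, stype, dtype) combinations compile as independent object files.
void launch_sigmoid(const KERN_IMPL_STYPE* src, KERN_IMPL_DTYPE* dst, int n,
                    cudaStream_t stream) {
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    sigmoid_kern<KERN_IMPL_STYPE, KERN_IMPL_DTYPE>
            <<<blocks, threads, 0, stream>>>(src, dst, n);
}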
+ */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_STYPE dt_qint8 +#define KERN_IMPL_DTYPE dt_qint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TRUE_DIV_dt_qint8_dt_qint8.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TRUE_DIV_dt_qint8_dt_qint8.cu new file mode 100644 index 00000000..7666159d --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kimpl/TRUE_DIV_dt_qint8_dt_qint8.cu @@ -0,0 +1,16 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kimpl/TRUE_DIV_dt_qint8_dt_qint8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +// generated by gen_elemwise_multi_type_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TRUE_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_STYPE dt_qint8 +#define KERN_IMPL_DTYPE dt_qint8 +#include "../kern_impl.inl" diff --git a/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp b/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp new file mode 100644 index 00000000..3b479d2a --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp @@ -0,0 +1,445 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/tensor_iter.h" + +#include "src/common/elemwise/each_mode.inl" +#include "src/cuda/elemwise_multi_type/kern.cuh" +#include "src/cuda/elemwise_multi_type/kern_ops.cuh" +#include "src/cuda/elemwise_multi_type/opr_impl.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( + const ElemwiseOpParamN<3>& param, dt_int32* dst) { + BroadcastChannelInfo binfo0, binfo1; + if (is_vector(param[0].layout) && + is_broadcasted_channel_like(param[1].layout, binfo0) && + is_broadcasted_channel_like(param[2].layout, binfo1) && + binfo0 == binfo1) { + elemwise_multi_type::fma3_int16x32x32x32_1c1( + param, dst, cuda_stream(this->handle())); + return; + } + megdnn_throw("unsupported fma3 int16x32x32x32 layout"); +} + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( + const ElemwiseOpParamN<3>& param, dt_int8* dst) { + Broadcast1xInfo binfo0, binfo1; + auto p1 = param[1].ptr(), p2 = param[2].ptr(); + auto stream = cuda_stream(this->handle()); + if (is_vector(param[0].layout) && + is_broadcasted_1x(param[1].layout, binfo0) && + is_broadcasted_1x(param[2].layout, binfo1) && binfo0 == binfo1) { + switch (param[0].layout.dtype.enumv()) { +#define cb(t) \ + case DTypeTrait::enumv: \ + elemwise_multi_type::fma3_iXxf32xf32xi8_bcast_1x( \ + param[0].ptr::ctype>(), p1, p2, dst, binfo0.x, \ + binfo0.y, stream); \ + return; + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) +#undef cb + default: + megdnn_throw("bad dtype"); + } + return; + } + megdnn_throw("unsupported fma3 iXxf32xf32xi8 layout"); +} + +void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( + const ElemwiseOpParamN<2>& param, dt_int8* dst) { + auto stream = cuda_stream(this->handle()); + if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) { + switch (param[0].layout.dtype.enumv()) { +#define DISPATCH(t) \ + case DTypeTrait::enumv: \ + elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \ + DTypeTrait::ctype, dt_int8>(param, dst, stream); \ + return; + DISPATCH(::megdnn::dtype::Int32) + DISPATCH(::megdnn::dtype::Int16) + DISPATCH(::megdnn::dtype::Int8) +#undef DISPATCH + default: + megdnn_throw( + "Unsupported data type for ElemwiseMultiType " + "(Mode=ROUND_SHR_SATURATE_IXxI8xI8): need an integer " + "tensor"); + } + } + megdnn_throw( + "Unsupported input layout for ElemwiseMultiType " + "(Mode=ROUND_SHR_SATURATE_IXxI8xI8): need a contiguous src[0] and " + "a scalar src[1]"); +} + +void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( + const ElemwiseOpParamN<6>& param, dt_int8* dst) { + auto stream = cuda_stream(this->handle()); + BroadcastChannelInfo info; + if (is_vector(param[0].layout) && + is_broadcasted_channel_like(param[1].layout, info) && + is_broadcasted_scalar(param[2].layout) && + is_broadcasted_scalar(param[3].layout) && + is_broadcasted_scalar(param[4].layout) && + is_broadcasted_scalar(param[5].layout)) { + elemwise_multi_type::fuse_add_rmulh_round_shr_saturate_bcast_1c11< + dt_int16>(param, dst, stream); + return; + } + megdnn_throw( + "Unsupported input layout for ElemwiseMultiType " + "(Mode=FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT16x16x16x8): the first " + "and the second input should be contiguous, the others should be " + "scalar."); +} + +void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( + const ElemwiseOpParamN<6>& param, dt_int8* dst) { + auto stream = cuda_stream(this->handle()); + BroadcastChannelInfo info; + if 
(is_vector(param[0].layout) && + is_broadcasted_channel_like(param[1].layout, info) && + is_broadcasted_scalar(param[2].layout) && + is_broadcasted_scalar(param[3].layout) && + is_broadcasted_scalar(param[4].layout) && + is_broadcasted_scalar(param[5].layout)) { + elemwise_multi_type::fuse_add_rmulh_round_shr_saturate_bcast_1c11< + dt_int32>(param, dst, stream); + return; + } + megdnn_throw( + "Unsupported input layout for ElemwiseMultiType " + "(Mode=FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT32x32x32x8): the first " + "and the second input should be contiguous, the others should be " + "scalar."); +} + +void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( + const ElemwiseOpParamN<2>& param, dt_int16* dst) { + auto stream = cuda_stream(this->handle()); + if (is_vector(param[0].layout) && is_broadcasted_scalar(param[1].layout)) { + switch (param[0].layout.dtype.enumv()) { +#define DISPATCH(t) \ + case DTypeTrait::enumv: \ + elemwise_multi_type::round_shr_saturate_iXxi8xiX_scalar< \ + DTypeTrait::ctype, dt_int16>(param, dst, stream); \ + return; + DISPATCH(::megdnn::dtype::Int32) + DISPATCH(::megdnn::dtype::Int16) +#undef DISPATCH + default: + megdnn_throw( + "Unsupported data type for ElemwiseMultiType " + "(Mode=ROUND_SHR_SATURATE_IXxI8xI8): need an integer " + "tensor"); + } + } + megdnn_throw( + "Unsupported input layout for ElemwiseMultiType " + "(Mode=ROUND_SHR_SATURATE_IXxI8xI8): need a contiguous src[0] and " + "a scalar src[1]"); +} + +namespace { + +template +struct ModeDispatcher; + +#define _cb_dispatch_mode(_m) \ + case param::Elemwise::Mode::_m: \ + do { \ + using KernImpl = \ + ElemwiseKern; \ + using Op = kern_ops_quantized::QuantizedMultiTypeOp< \ + arity, src_ctype, dst_ctype, KernImpl>; \ + Op op(src_params, dst, dst_param); \ + return run_elemwise(param, stream, op); \ + } while (0); + +#define IMPL_MODE_DISPATCHER(_arity, _src_ctype, _dst_ctype) \ + template <> \ + struct ModeDispatcher<_arity, _src_ctype, _dst_ctype> { \ + static constexpr int arity = _arity; \ + using src_ctype = _src_ctype; \ + using dst_ctype = _dst_ctype; \ + static void run( \ + const ElemwiseOpParamN<_arity>& param, _dst_ctype* dst, \ + const SmallVector>& src_params, \ + const CudaDTypeParam<_dst_ctype>& dst_param, \ + param::Elemwise::Mode mode, cudaStream_t stream) { \ + megdnn_assert(src_params.size() == _arity); \ + switch (mode) { \ + FOREACH(_cb_dispatch_mode) \ + default: \ + megdnn_throw("bad mode"); \ + } \ + } \ + } + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_UNARY_FLOAT +IMPL_MODE_DISPATCHER(1, dt_qint8, dt_qint8); +#undef FOREACH + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_BINARY_FLOAT +IMPL_MODE_DISPATCHER(2, dt_qint8, dt_qint8); +#undef FOREACH + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_FLOAT +IMPL_MODE_DISPATCHER(3, dt_qint8, dt_qint8); +#undef FOREACH + +#define FOREACH(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb) +IMPL_MODE_DISPATCHER(1, dt_qint8, dt_qint32); +IMPL_MODE_DISPATCHER(1, dt_qint32, dt_qint8); +#undef FOREACH + +#define FOREACH(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb) +IMPL_MODE_DISPATCHER(2, dt_qint8, dt_qint32); +IMPL_MODE_DISPATCHER(2, dt_qint32, 
dt_qint8); +#undef FOREACH + +#undef _cb_dispatch_mode +#undef IMPL_MODE_DISPATCHER + +template +void dispatch_src_ctype(const ElemwiseOpParamN<1>&, const TensorND& dst_tensor, + Elemwise::Mode, cudaStream_t); + +#define DISPATCH(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + auto param_a = param[0].layout.dtype.param(); \ + auto dst_param = dst_tensor.layout.dtype.param<_dt>(); \ + ModeDispatcher<1, ctype_src, typename DTypeTrait<_dt>::ctype>::run( \ + param, dst_tensor.ptr::ctype>(), \ + {param_a}, dst_param, mode, stream); \ + break; \ + } + +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<1>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_qint8 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::QuantizedS8); + DISPATCH(dtype::QuantizedS32); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<1>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_qint32 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::QuantizedS8); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + +#undef DISPATCH + +#define DISPATCH(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + auto param_a = param[0].layout.dtype.param(); \ + auto param_b = param[1].layout.dtype.param(); \ + auto dst_param = dst_tensor.layout.dtype.param<_dt>(); \ + ModeDispatcher<2, ctype_src, typename DTypeTrait<_dt>::ctype>::run( \ + param, dst_tensor.ptr::ctype>(), \ + {param_a, param_b}, dst_param, mode, stream); \ + break; \ + } + +template +void dispatch_src_ctype(const ElemwiseOpParamN<2>& param, + const TensorND& dst_tensor, Elemwise::Mode mode, + cudaStream_t stream); +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<2>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_qint8 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::QuantizedS8); + DISPATCH(dtype::QuantizedS32); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<2>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_qint32 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::QuantizedS8); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} +#undef DISPATCH + +#define DISPATCH(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + auto param_a = param[0].layout.dtype.param(); \ + auto param_b = param[1].layout.dtype.param(); \ + auto param_c = param[2].layout.dtype.param(); \ + auto dst_param = dst_tensor.layout.dtype.param<_dt>(); \ + ModeDispatcher<3, ctype_src, typename DTypeTrait<_dt>::ctype>::run( \ + param, dst_tensor.ptr::ctype>(), \ + {param_a, param_b, param_c}, dst_param, mode, stream); \ + break; \ + } + +template +void dispatch_src_ctype(const ElemwiseOpParamN<3>& param, + const TensorND& dst_tensor, Elemwise::Mode mode, + cudaStream_t stream); +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<3>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_qint8 ctype_src; + switch 
(dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::QuantizedS8); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + +#undef DISPATCH + +} // namespace + +void ElemwiseMultiTypeImpl::on_quantized_mode(const ElemwiseOpParamN<1>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode) { + megdnn_assert( + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS8 || + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS32, + "expect inputs dtype to be qint8/qint32, but got: %s", + param[0].layout.dtype.name()); + auto stream = cuda_stream(this->handle()); + switch (param[0].layout.dtype.enumv()) { +#define DISPATCH(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + dispatch_src_ctype::ctype>(param, dst_tensor, \ + mode, stream); \ + break; \ + } + + DISPATCH(dtype::QuantizedS8); + DISPATCH(dtype::QuantizedS32); + + default: + megdnn_throw( + ssprintf("Unsupported input dtype %s for ElemwiseMultiType", + param[0].layout.dtype.name())); + } + +#undef DISPATCH +} + +void ElemwiseMultiTypeImpl::on_quantized_mode(const ElemwiseOpParamN<2>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode) { + megdnn_assert(param[0].layout.dtype.enumv() == + param[1].layout.dtype.enumv()); + megdnn_assert( + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS8 || + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS32, + "expect inputs dtype to be qint8/qint32, but got: %s", + param[0].layout.dtype.name()); + auto stream = cuda_stream(this->handle()); + switch (param[0].layout.dtype.enumv()) { +#define DISPATCH(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + dispatch_src_ctype::ctype>(param, dst_tensor, \ + mode, stream); \ + break; \ + } + + DISPATCH(dtype::QuantizedS8); + DISPATCH(dtype::QuantizedS32); + + default: + megdnn_throw( + ssprintf("Unsupported input dtype %s for ElemwiseMultiType", + param[0].layout.dtype.name())); + } + +#undef DISPATCH +} + +void ElemwiseMultiTypeImpl::on_quantized_mode(const ElemwiseOpParamN<3>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode) { + megdnn_assert(param[0].layout.dtype.enumv() == + param[1].layout.dtype.enumv()); + megdnn_assert(param[0].layout.dtype.enumv() == + param[2].layout.dtype.enumv()); + + megdnn_assert( + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS8, + "expect inputs dtype to be qint8, but got: %s", + param[0].layout.dtype.name()); + auto stream = cuda_stream(this->handle()); + switch (param[0].layout.dtype.enumv()) { +#define DISPATCH(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + dispatch_src_ctype::ctype>(param, dst_tensor, \ + mode, stream); \ + break; \ + } + + DISPATCH(dtype::QuantizedS8); + + default: + megdnn_throw( + ssprintf("Unsupported input dtype %s for ElemwiseMultiType", + param[0].layout.dtype.name())); + } + +#undef DISPATCH +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/elemwise_multi_type/opr_impl.h b/dnn/src/cuda/elemwise_multi_type/opr_impl.h new file mode 100644 index 00000000..e5b363ca --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/opr_impl.h @@ -0,0 +1,54 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
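The opr_impl.cpp above converts runtime dtype and mode values into compile-time template parameters through nested switch statements whose cases are stamped out by the DISPATCH/FOREACH macros. Below is a stripped-down sketch of the same two-level dispatch, with hypothetical DType/Mode enums and a hypothetical run_typed kernel standing in for the QuantizedMultiTypeOp machinery.

#include <stdexcept>
#include <cuda_runtime.h>

// Hypothetical runtime tags standing in for DTypeEnum / Elemwise::Mode.
enum class DType { Float32, Int32 };
enum class Mode { RELU, NEGATE };

// The mode is a template parameter, so the branch disappears at compile time.
template <typename T, Mode mode>
__global__ void run_typed(const T* src, T* dst, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    T v = src[i];
    dst[i] = (mode == Mode::RELU) ? (v > T(0) ? v : T(0)) : T(-v);
}

// Second level: runtime mode -> template parameter (the FOREACH(cb) role).
template <typename T>
void dispatch_mode(Mode mode, const T* src, T* dst, int n, cudaStream_t stream) {
    int threads = 256, blocks = (n + threads - 1) / threads;
    switch (mode) {
        case Mode::RELU:
            run_typed<T, Mode::RELU><<<blocks, threads, 0, stream>>>(src, dst, n);
            break;
        case Mode::NEGATE:
            run_typed<T, Mode::NEGATE><<<blocks, threads, 0, stream>>>(src, dst, n);
            break;
        default:
            throw std::runtime_error("bad mode");
    }
}

// First level: runtime dtype -> C++ type (the DISPATCH macro's role).
void dispatch_dtype(DType dt, Mode mode, const void* src, void* dst, int n,
                    cudaStream_t stream) {
    switch (dt) {
        case DType::Float32:
            dispatch_mode(mode, static_cast<const float*>(src),
                          static_cast<float*>(dst), n, stream);
            break;
        case DType::Int32:
            dispatch_mode(mode, static_cast<const int*>(src),
                          static_cast<int*>(dst), n, stream);
            break;
        default:
            throw std::runtime_error("bad dtype");
    }
}

The real implementation adds a further level (separate source and destination dtypes plus per-tensor quantization parameters), but the switch-to-template shape is the same.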
+ */ + +#pragma once + +#include "src/common/elemwise_multi_type/opr_impl_helper.h" + +namespace megdnn { +namespace cuda { + +class ElemwiseMultiTypeImpl final : public ElemwiseMultiTypeImplHelper { + void on_fuse_mul_add3_int16x32x32x32(const ElemwiseOpParamN<3>& param, + dt_int32* dst) override; + + void on_fuse_mul_add3_iXxf32xf32xi8(const ElemwiseOpParamN<3>& param, + dt_int8* dst) override; + + void on_round_shr_saturate_iXxi8xi8(const ElemwiseOpParamN<2>& param, + dt_int8* dst) override; + + void on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( + const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + + void on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( + const ElemwiseOpParamN<6>& param, dt_int8* dst) override; + + void on_round_shr_saturate_iXxi8xi16(const ElemwiseOpParamN<2>& param, + dt_int16* dst) override; + + void on_quantized_mode(const ElemwiseOpParamN<1>& param, + const TensorND& dst, Elemwise::Mode mode) override; + + void on_quantized_mode(const ElemwiseOpParamN<2>& param, + const TensorND& dst, Elemwise::Mode mode) override; + + void on_quantized_mode(const ElemwiseOpParamN<3>& param, + const TensorND& dst, Elemwise::Mode mode) override; + +public: + using ElemwiseMultiTypeImplHelper::ElemwiseMultiTypeImplHelper; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/error_info.cuh b/dnn/src/cuda/error_info.cuh new file mode 100644 index 00000000..58567dd7 --- /dev/null +++ b/dnn/src/cuda/error_info.cuh @@ -0,0 +1,55 @@ +/** + * \file dnn/src/cuda/error_info.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include "megcore_cdefs.h" +#include "megdnn/arch.h" + + +typedef megcore::AsyncErrorInfo AsyncErrorInfo; +#if MEGDNN_CC_CUDA +// we can not put this function into anonymous namespace, since it would cause +// unused static func or undefined static func warning depending on whether you +// define it +namespace { +#endif + +__device__ void set_async_error_info(AsyncErrorInfo* info, void* tracker, + const char* msg, int arg0 = 0, + int arg1 = 0, int arg2 = 0, int arg3 = 0) +#if MEGDNN_CC_CUDA +{ + if (info && !atomicAdd(&info->nr_error, 1)) { + // use atomic expression to ensure that only the first error is reported + info->tracker_ptr = tracker; + char* ptr = info->msg; + char* ptr_end = ptr + sizeof(AsyncErrorInfo::msg) - 1; + while (ptr < ptr_end && *msg) { + *(ptr++) = *(msg++); + } + *ptr = 0; + info->msg_args[0] = arg0; + info->msg_args[1] = arg1; + info->msg_args[2] = arg2; + info->msg_args[3] = arg3; + } +} +#else +; +#endif + +#if MEGDNN_CC_CUDA +} // anonymous namespace +#endif + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/eye/eye.cu b/dnn/src/cuda/eye/eye.cu new file mode 100644 index 00000000..fecda8ad --- /dev/null +++ b/dnn/src/cuda/eye/eye.cu @@ -0,0 +1,50 @@ +/** + * \file dnn/src/cuda/eye/eye.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
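set_async_error_info above relies on atomicAdd(&info->nr_error, 1) returning the previous value, so exactly one failing thread (the one that observes 0) records the tracker pointer, message and arguments, while later failures only increment the counter. Here is a self-contained sketch of that first-error-wins idiom; ErrorSlot, report_error and check_positive are hypothetical names used in place of megcore's AsyncErrorInfo.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical error slot; megcore's AsyncErrorInfo plays this role upstream.
struct ErrorSlot {
    int nr_error;      // total number of errors raised
    int bad_index;     // detail recorded by the *first* failing thread only
    char msg[64];
};

__device__ void report_error(ErrorSlot* slot, int bad_index, const char* msg) {
    // atomicAdd returns the old value, so exactly one thread sees 0 here.
    if (slot && !atomicAdd(&slot->nr_error, 1)) {
        slot->bad_index = bad_index;
        char* dst = slot->msg;
        char* end = slot->msg + sizeof(slot->msg) - 1;
        while (dst < end && *msg) *dst++ = *msg++;
        *dst = 0;
    }
}

__global__ void check_positive(const float* data, int n, ErrorSlot* slot) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && data[i] <= 0.f) {
        report_error(slot, i, "expect positive input");
    }
}

int main() {
    float host[4] = {1.f, -2.f, 3.f, -4.f};
    float* dev_data;
    ErrorSlot* dev_slot;
    cudaMalloc(&dev_data, sizeof(host));
    cudaMalloc(&dev_slot, sizeof(ErrorSlot));
    cudaMemcpy(dev_data, host, sizeof(host), cudaMemcpyHostToDevice);
    cudaMemset(dev_slot, 0, sizeof(ErrorSlot));

    check_positive<<<1, 4>>>(dev_data, 4, dev_slot);

    ErrorSlot slot;
    cudaMemcpy(&slot, dev_slot, sizeof(slot), cudaMemcpyDeviceToHost);
    if (slot.nr_error) {
        std::printf("%d error(s), first at index %d: %s\n", slot.nr_error,
                    slot.bad_index, slot.msg);
    }
    cudaFree(dev_data);
    cudaFree(dev_slot);
    return 0;
}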
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/eye/eye.cuh" +#include "megdnn/dtype.h" +#include "src/cuda/utils.cuh" + +namespace { + +template +__global__ void kernel(T *dst, uint32_t m, uint32_t n, int k) +{ + int32_t i = threadIdx.x + blockIdx.x * blockDim.x; + int32_t x = i % n; + int32_t y = i / n; + if (i < m*n) { + dst[i] = (y+k == x); + } +} + +} // anonymous namespace + +namespace megdnn { +namespace cuda { +namespace eye { + +template +void exec_internal(T *dst, size_t m, size_t n, int k, cudaStream_t stream) +{ + kernel<<>>( + dst, m, n, k); + after_kernel_launch(); +} + +#define INST(T) template void exec_internal(T *, \ + size_t, size_t, int, cudaStream_t); +#define cb(DType) INST(typename DTypeTrait::ctype) +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +} // namespace eye +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/eye/eye.cuh b/dnn/src/cuda/eye/eye.cuh new file mode 100644 index 00000000..07b3a978 --- /dev/null +++ b/dnn/src/cuda/eye/eye.cuh @@ -0,0 +1,25 @@ +/** + * \file dnn/src/cuda/eye/eye.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include +#include + +namespace megdnn { +namespace cuda { +namespace eye { + +template +void exec_internal(T *dst, size_t m, size_t n, int k, cudaStream_t stream); + +} // namespace eye +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/eye/opr_impl.cpp b/dnn/src/cuda/eye/opr_impl.cpp new file mode 100644 index 00000000..540fd652 --- /dev/null +++ b/dnn/src/cuda/eye/opr_impl.cpp @@ -0,0 +1,36 @@ +/** + * \file dnn/src/cuda/eye/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
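eye.cu above flattens the m-by-n output into a one-dimensional launch and recovers the coordinates as row = i / n, col = i % n; the predicate row + k == col then places ones on the k-th diagonal. A runnable float-only sketch of the same index arithmetic follows (eye_kern and the fixed launch configuration are illustrative, not the MegDNN launcher).

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// One thread per output element; dst is m x n row-major, k selects the diagonal
// (k > 0 shifts it above the main diagonal, k < 0 below), as in Eye's param.
__global__ void eye_kern(float* dst, uint32_t m, uint32_t n, int k) {
    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < m * n) {
        int row = static_cast<int>(i / n);
        int col = static_cast<int>(i % n);
        dst[i] = (row + k == col) ? 1.f : 0.f;
    }
}

int main() {
    const uint32_t m = 3, n = 4;
    float* dev;
    cudaMalloc(&dev, m * n * sizeof(float));
    const int threads = 128;
    const int blocks = (m * n + threads - 1) / threads;
    eye_kern<<<blocks, threads>>>(dev, m, n, /*k=*/1);

    float host[m * n];
    cudaMemcpy(host, dev, sizeof(host), cudaMemcpyDeviceToHost);
    for (uint32_t r = 0; r < m; ++r) {
        for (uint32_t c = 0; c < n; ++c) std::printf("%.0f ", host[r * n + c]);
        std::printf("\n");
    }
    cudaFree(dev);
    return 0;
}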
+ */ +#include "src/cuda/eye/opr_impl.h" + +#include "src/cuda/eye/eye.cuh" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +void EyeImpl::exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) +{ + check_exec(dst.layout, workspace.size); +#define cb(DType) \ + if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + eye::exec_internal(dst.ptr(), \ + dst.layout.shape[0], dst.layout.shape[1], \ + param().k, \ + cuda_stream(handle())); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb +} + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/eye/opr_impl.h b/dnn/src/cuda/eye/opr_impl.h new file mode 100644 index 00000000..0268aa8d --- /dev/null +++ b/dnn/src/cuda/eye/opr_impl.h @@ -0,0 +1,29 @@ +/** + * \file dnn/src/cuda/eye/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class EyeImpl final: public Eye { + public: + using Eye::Eye; + void exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &) override { + return 0; + } +}; + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} + diff --git a/dnn/src/cuda/flip/flip.cu b/dnn/src/cuda/flip/flip.cu new file mode 100644 index 00000000..45d30208 --- /dev/null +++ b/dnn/src/cuda/flip/flip.cu @@ -0,0 +1,90 @@ +/** + * \file dnn/src/cuda/flip/flip.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./flip.cuh" + +#include "megdnn/dtype.h" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +static const int BX = 16; +static const int BY = 16; + +namespace { + +#define rep(i, n) for (size_t i = 0; i < (n); ++i) + +template +__global__ void flip_kern(const T *src, T *dst, size_t N, size_t H, size_t W, + size_t stride1, size_t stride2, size_t stride3) { + __shared__ T cache[BX][BY][IC]; + int ow = blockIdx.x * blockDim.x + threadIdx.x; + int oh = blockIdx.y * blockDim.y + threadIdx.y; + if (ow < W && oh < H) { + + int iw = horizontal ? W - ow - 1 : ow; + int ih = vertical ? 
H - oh - 1 : oh; +#pragma unroll + rep(c, IC) { + cache[threadIdx.y][threadIdx.x][c] = + src[blockIdx.z * stride1 + ih * stride2 + iw * stride3 + c]; + } + __syncthreads(); +#pragma unroll + rep(c, IC) { + dst[blockIdx.z * stride1 + oh * stride2 + ow * stride3 + c] = + cache[threadIdx.y][threadIdx.x][c]; + } + } +} + +#undef rep +} // anonymous namespace + +namespace flip { + +template +void flip(const T *src, T *dst, size_t N, size_t H, size_t W, size_t IC, + size_t stride1, size_t stride2, size_t stride3, cudaStream_t stream) { + dim3 threads(BX, BY); + dim3 blocks(DIVUP(W, BX), DIVUP(H, BY), N); + megdnn_assert(IC == 1 || IC == 3); + if (IC == 1) + flip_kern<<>>( + src, dst, N, H, W, stride1, stride2, stride3); + else + flip_kern<<>>( + src, dst, N, H, W, stride1, stride2, stride3); + after_kernel_launch(); +} + +#define INST(T, vertical, horizontal) \ + template void flip( \ + const T *src, T *dst, size_t N, size_t H, size_t W, size_t IC, \ + size_t stride1, size_t stride2, size_t stride3, cudaStream_t); + +#define cb(DType) \ + INST(typename DTypeTrait::ctype, true, true) \ + INST(typename DTypeTrait::ctype, true, false) \ + INST(typename DTypeTrait::ctype, false, true) \ + INST(typename DTypeTrait::ctype, false, false) + +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +#undef cb +#undef INST + +} // namespace flip +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/flip/flip.cuh b/dnn/src/cuda/flip/flip.cuh new file mode 100644 index 00000000..48e1ba74 --- /dev/null +++ b/dnn/src/cuda/flip/flip.cuh @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/flip/flip.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include +#include + +namespace megdnn { +namespace cuda { +namespace flip { + +template +void flip(const T *src, T *dst, size_t N, size_t H, size_t W, size_t IC, + size_t stride1, size_t stride2, size_t stride3, cudaStream_t stream); + +} // namespace flip +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/flip/opr_impl.cpp b/dnn/src/cuda/flip/opr_impl.cpp new file mode 100644 index 00000000..8ceac0be --- /dev/null +++ b/dnn/src/cuda/flip/opr_impl.cpp @@ -0,0 +1,90 @@ +/** + * \file dnn/src/cuda/flip/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
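flip_kern above is templated on the vertical/horizontal flags and a small channel count, so the mirrored source coordinates (H - oh - 1, W - ow - 1) and the per-channel loop are resolved at compile time, with a shared-memory tile carrying the pixels between the read and write positions. Below is a simplified single-channel sketch of the index mirroring and of how runtime flags select one of the four instantiations, without the shared-memory staging; the names are hypothetical.

#include <cuda_runtime.h>

// Flip an N x H x W single-channel tensor; the bools become compile-time
// constants so the untaken branch costs nothing, mirroring flip_kern above.
template <bool vertical, bool horizontal>
__global__ void flip1c_kern(const float* src, float* dst, int H, int W) {
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
    if (ow >= W || oh >= H) return;
    int iw = horizontal ? W - ow - 1 : ow;
    int ih = vertical ? H - oh - 1 : oh;
    const float* img_src = src + blockIdx.z * H * W;  // blockIdx.z = batch index
    float* img_dst = dst + blockIdx.z * H * W;
    img_dst[oh * W + ow] = img_src[ih * W + iw];
}

// Host launcher: runtime flags are turned into the four template instantiations,
// the same trick flip_intl::flip_exec uses.
void flip1c(const float* src, float* dst, int N, int H, int W,
            bool vertical, bool horizontal, cudaStream_t stream) {
    dim3 threads(16, 16);
    dim3 blocks((W + 15) / 16, (H + 15) / 16, N);
    if (vertical && horizontal)
        flip1c_kern<true, true><<<blocks, threads, 0, stream>>>(src, dst, H, W);
    else if (vertical)
        flip1c_kern<true, false><<<blocks, threads, 0, stream>>>(src, dst, H, W);
    else if (horizontal)
        flip1c_kern<false, true><<<blocks, threads, 0, stream>>>(src, dst, H, W);
    else
        flip1c_kern<false, false><<<blocks, threads, 0, stream>>>(src, dst, H, W);
}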
+ */ + +#include "./flip.cuh" +#include "./opr_impl.h" + +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" +#include "src/common/utils.h" +#include + +namespace megdnn { +namespace cuda { + +namespace flip_intl { + +template +void flip_exec(const ctype *src, ctype *dst, size_t N, size_t IH, size_t IW, + size_t IC, size_t stride1, size_t stride2, size_t stride3, + bool vertical, bool horizontal, + cudaStream_t stream) { + if (vertical) { + if (horizontal) { + flip::flip(src, dst, N, IH, IW, IC, stride1, + stride2, stride3, stream); + } else { + flip::flip(src, dst, N, IH, IW, IC, stride1, + stride2, stride3, stream); + } + } else { + if (horizontal) { + flip::flip(src, dst, N, IH, IW, IC, stride1, + stride2, stride3, stream); + } else { + flip::flip(src, dst, N, IH, IW, IC, stride1, + stride2, stride3, stream); + } + } +} + +} // namespace flip_intl + +void FlipImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, + _megdnn_workspace workspace) { + check_exec(src.layout, dst.layout, workspace.size); + auto stream = cuda_stream(handle()); + //! src layout is the same as dst layout + size_t N = src.layout.shape[0]; + size_t batch_size = 0; + +#define cb(DType) \ + if (src.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + ctype* src_ptr = src.ptr() + curr_batch * src.layout.stride[0]; \ + ctype* dst_ptr = dst.ptr() + curr_batch * src.layout.stride[0]; \ + batch_size = std::min(N - curr_batch, max_batch); \ + flip_intl::flip_exec(src_ptr, dst_ptr, batch_size, \ + src.layout.shape[1], src.layout.shape[2], \ + src.layout.shape[3], src.layout.stride[0], \ + src.layout.stride[1], \ + src.layout.stride[2], param().vertical, \ + param().horizontal, stream); \ + } + + size_t curr_batch = 0; + size_t max_batch = max_batch_x_channel_size(); + if (N <= max_batch) { + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + } else { + while (curr_batch < N) { + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + + curr_batch += max_batch; + } + } +#undef cb +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/flip/opr_impl.h b/dnn/src/cuda/flip/opr_impl.h new file mode 100644 index 00000000..ee659c4a --- /dev/null +++ b/dnn/src/cuda/flip/opr_impl.h @@ -0,0 +1,33 @@ +/** + * \file dnn/src/cuda/flip/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class FlipImpl : public Flip { + public: + using Flip::Flip; + + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &) override { + return 0; + } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/fp16_help.cuh b/dnn/src/cuda/fp16_help.cuh new file mode 100644 index 00000000..29c2bc0a --- /dev/null +++ b/dnn/src/cuda/fp16_help.cuh @@ -0,0 +1,58 @@ +/** + * \file dnn/src/cuda/fp16_help.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
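FlipImpl::exec above, like the GaussianBlur implementation later in this diff, never hands more than max_batch_x_channel_size() images to a single launch: it walks the batch in chunks and offsets the data pointers by curr_batch * stride[0] each round, presumably because one image maps to one blockIdx.z and the grid z-dimension is limited. A sketch of that chunking loop with a hypothetical copy kernel and an assumed limit of 65535:

#include <algorithm>
#include <cstddef>
#include <cuda_runtime.h>

// Trivial per-image kernel standing in for the real work; one image per
// blockIdx.z, so the z extent of the grid bounds the chunk size.
__global__ void copy_kern(const float* src, float* dst, size_t H, size_t W) {
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < W && y < H) {
        const float* img = src + blockIdx.z * H * W;
        float* out = dst + blockIdx.z * H * W;
        out[y * W + x] = img[y * W + x];
    }
}

void launch_chunk(const float* src, float* dst, size_t batch, size_t H, size_t W,
                  cudaStream_t stream) {
    dim3 threads(16, 16);
    dim3 blocks((W + 15) / 16, (H + 15) / 16, batch);
    copy_kern<<<blocks, threads, 0, stream>>>(src, dst, H, W);
}

// Walk the full batch in grid-z-sized chunks, as FlipImpl::exec does.
void launch_batched(const float* src, float* dst, size_t N, size_t H, size_t W,
                    cudaStream_t stream) {
    const size_t max_batch = 65535;     // assumed grid z-dimension limit
    const size_t image_stride = H * W;  // contiguous NHW layout assumed
    for (size_t curr = 0; curr < N; curr += max_batch) {
        size_t batch = std::min(N - curr, max_batch);
        launch_chunk(src + curr * image_stride, dst + curr * image_stride,
                     batch, H, W, stream);
    }
}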
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include +#include "cuda.h" +#include "cuda_fp16.h" + +namespace megdnn { +namespace cuda { + +__device__ __forceinline__ float fma(const float a, const float b, + const float c) { + return a * b + c; +} + +__device__ __forceinline__ float2 fma2(const float2 a, const float2 b, + const float2 c) { + return {a.x * b.x + c.x, a.y * b.y + c.y}; +} + +#if CUDA_VERSION >= 9000 + +__device__ __forceinline__ __half fma(const __half a, const __half b, + const __half c) { +#if __CUDA_ARCH__ >= 530 + return __hfma(a, b, c); +#else + return __float2half(__half2float(a) * __half2float(b) + __half2float(c)); +#endif +} + +__device__ __forceinline__ __half2 fma2(const __half2 a, const __half2 b, + const __half2 c) { +#if __CUDA_ARCH__ >= 530 + return __hfma2(a, b, c); +#else + return {__float2half(__half2float(a.x) * __half2float(b.x) + + __half2float(c.x)), + __float2half(__half2float(a.y) * __half2float(b.y) + + __half2float(c.y))}; +#endif +} + +#endif // CUDA_VERSION >= 9000 + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/gaussian_blur/gaussian_blur.cu b/dnn/src/cuda/gaussian_blur/gaussian_blur.cu new file mode 100644 index 00000000..69e0a67f --- /dev/null +++ b/dnn/src/cuda/gaussian_blur/gaussian_blur.cu @@ -0,0 +1,308 @@ +/** + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. + * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. + * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + * Copyright (C) 2019-2020, Xperience AI, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
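fp16_help.cuh above provides fma/fma2 overloads that use the native __hfma/__hfma2 instructions on sm_53 and newer and fall back to float arithmetic otherwise, so callers keep a single code path. A small usage sketch of the same pattern applied to an a*x + b kernel over __half2 pairs (axpb_half2 is a hypothetical example, not part of the header):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// a*x + b over __half2 pairs: two fp16 values share a 32-bit register, so one
// __hfma2 performs two multiply-adds; this is what the fma2 overload wraps.
__global__ void axpb_half2(const __half2* x, __half2* y, __half2 a, __half2 b,
                           int n2) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n2) {
#if __CUDA_ARCH__ >= 530
        y[i] = __hfma2(a, x[i], b);            // native half2 FMA
#else
        // Fallback for older GPUs: widen to float2, compute, narrow back.
        float2 xf = __half22float2(x[i]);
        float2 af = __half22float2(a);
        float2 bf = __half22float2(b);
        y[i] = __floats2half2_rn(af.x * xf.x + bf.x, af.y * xf.y + bf.y);
#endif
    }
}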
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + * + * --------------------------------------------------------------------------- + * \file dnn/src/cuda/gaussian_blur/gaussian_blur.cu + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * + * --------------------------------------------------------------------------- + */ +#include "./gaussian_blur.cuh" + +#include "megdnn/dtype.h" +#include "src/cuda/cv/kernel_common.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +namespace { + +static const uint8_t BITS = 8; + +#define rep(i, n) for (size_t i = 0; i < (n); ++i) + +template +__global__ void prepare_kernel(uint8_t* kernel_ptr, size_t kernel_height, + size_t kernel_width, double sigma_x, + double sigma_y); + +template <> +__global__ void prepare_kernel(uint8_t* _kernel_ptr, + size_t kernel_height, size_t kernel_width, + double sigma_x, double sigma_y) { + float* kernel_ptr = reinterpret_cast(_kernel_ptr); + const int kSmallGaussianSize = 7; + const float small_gaussian_table[4][kSmallGaussianSize] = { + {1.f}, + {0.25f, 0.5f, 0.25f}, + {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f}, + {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, + 0.03125f}}; + + const float* fixed_kernel_x = + (kernel_width % 2 == 1 && kernel_width <= kSmallGaussianSize && + sigma_x <= 0) + ? small_gaussian_table[kernel_width >> 1] + : NULL; + const float* fixed_kernel_y = + (kernel_height % 2 == 1 && kernel_height <= kSmallGaussianSize && + sigma_y <= 0) + ? small_gaussian_table[kernel_height >> 1] + : NULL; + sigma_x = + sigma_x > 0 ? sigma_x : ((kernel_width - 1) * 0.5 - 1) * 0.3 + 0.8; + double scale_2x = -0.5 / (sigma_x * sigma_x); + sigma_y = + sigma_y > 0 ? sigma_y : ((kernel_height - 1) * 0.5 - 1) * 0.3 + 0.8; + double scale_2y = -0.5 / (sigma_y * sigma_y); + + //! calc gaussian kernel + double sum = 0; + rep(iy, kernel_height) { + double y = iy - (kernel_height - 1) * 0.5; + double ky = fixed_kernel_y ? static_cast(fixed_kernel_y[iy]) + : std::exp(scale_2y * y * y); + rep(ix, kernel_width) { + double x = ix - (kernel_width - 1) * 0.5; + double kx = fixed_kernel_x ? static_cast(fixed_kernel_x[ix]) + : std::exp(scale_2x * x * x); + + float kxy = static_cast(kx * ky); + kernel_ptr[iy * kernel_width + ix] = kxy; + sum += kxy; + } + } + + //! normalize + sum = 1. 
/ sum; + rep(i, kernel_width * kernel_height) { + kernel_ptr[i] = static_cast(sum * kernel_ptr[i]); + } +} + +template <> +__global__ void prepare_kernel(uint8_t* _kernel_ptr, + size_t kernel_height, + size_t kernel_width, double sigma_x, + double sigma_y) { + int32_t* kernel_ptr = reinterpret_cast(_kernel_ptr); + const int kSmallGaussianSize = 7; + const float small_gaussian_table[4][kSmallGaussianSize] = { + {1.f}, + {0.25f, 0.5f, 0.25f}, + {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f}, + {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, + 0.03125f}}; + + const float* fixed_kernel_x = + (kernel_width % 2 == 1 && kernel_width <= kSmallGaussianSize && + sigma_x <= 0) + ? small_gaussian_table[kernel_width >> 1] + : NULL; + const float* fixed_kernel_y = + (kernel_height % 2 == 1 && kernel_height <= kSmallGaussianSize && + sigma_y <= 0) + ? small_gaussian_table[kernel_height >> 1] + : NULL; + sigma_x = + sigma_x > 0 ? sigma_x : ((kernel_width - 1) * 0.5 - 1) * 0.3 + 0.8; + double scale_2x = -0.5 / (sigma_x * sigma_x); + sigma_y = + sigma_y > 0 ? sigma_y : ((kernel_height - 1) * 0.5 - 1) * 0.3 + 0.8; + double scale_2y = -0.5 / (sigma_y * sigma_y); + + size_t kernel_size = kernel_width * kernel_height; + + //! calc the sum of horizontal kernel filter + double sum_y = 0; + float* ky_ptr = reinterpret_cast(kernel_ptr + kernel_size); + rep(iy, kernel_height) { + double y = iy - (kernel_height - 1) * 0.5; + double ky = fixed_kernel_y ? static_cast(fixed_kernel_y[iy]) + : std::exp(scale_2y * y * y); + sum_y += ky; + ky_ptr[iy] = static_cast(ky); + } + sum_y = 1 / sum_y; + + //! calc the sum of vertical kernel filter + double sum_x = 0; + float* kx_ptr = + reinterpret_cast(kernel_ptr + kernel_size) + kernel_height; + rep(ix, kernel_width) { + double x = ix - (kernel_width - 1) * 0.5; + double kx = fixed_kernel_x ? static_cast(fixed_kernel_x[ix]) + : std::exp(scale_2x * x * x); + sum_x += kx; + kx_ptr[ix] = static_cast(kx); + } + sum_x = 1 / sum_x; + + rep(iy, kernel_height) { + float ky = ky_ptr[iy]; + int ky_int = (ky * sum_y * (1 << BITS)); + rep(ix, kernel_width) { + float kx = kx_ptr[ix]; + + int kx_int = (kx * sum_x * (1 << BITS)); + kernel_ptr[iy * kernel_width + ix] = kx_int * ky_int; + } + } +} + +template +__global__ void gaussian_blur_kern(const T* src, T* dst, size_t N, size_t H, + size_t W, size_t stride0, size_t stride1, + size_t stride2, size_t stride3, + uint8_t* kernel_ptr, size_t kernel_height, + size_t kernel_width) { + int iw = blockIdx.x * blockDim.x + threadIdx.x; + int ih = blockIdx.y * blockDim.y + threadIdx.y; + if (iw < W && ih < H) { +#pragma unroll + rep(c, CH) { + double val = 0; + rep(iy, kernel_height) { + int y = megcv::border_interpolate( + ih + iy - kernel_height / 2, H); + rep(ix, kernel_width) { + int x = megcv::border_interpolate( + iw + ix - kernel_width / 2, W); + + //! BORDER_CONSTANT or BORDER_TRANSPARENT + if (x != -1 && y != -1) { + if (is_same::value) { + val += static_cast(reinterpret_cast( + kernel_ptr)[iy * kernel_width + + ix]) * + src[blockIdx.z * stride0 + y * stride1 + + x * stride2 + c * stride3]; + } else { + val += static_cast(reinterpret_cast( + kernel_ptr)[iy * kernel_width + + ix]) * + src[blockIdx.z * stride0 + y * stride1 + + x * stride2 + c * stride3]; + } + } + } + } + + if (is_same::value) { + dst[blockIdx.z * stride0 + ih * stride1 + iw * stride2 + + c * stride3] = + static_cast(static_cast(val) >> (2 * BITS)); + } else { + //! 
float32 + dst[blockIdx.z * stride0 + ih * stride1 + iw * stride2 + + c * stride3] = static_cast(val); + } + } + } +} + +#undef rep +} // namespace + +namespace gaussian_blur { + +template +void gaussian_blur(const T* src, T* dst, size_t N, size_t H, size_t W, + size_t stride0, size_t stride1, size_t stride2, + size_t stride3, uint8_t* kernel_ptr, size_t kernel_height, + size_t kernel_width, double sigma_x, double sigma_y, + cudaStream_t stream) { + //! calc gaussian kernel + prepare_kernel<<<1, 1, 0, stream>>>(kernel_ptr, kernel_height, + kernel_width, sigma_x, sigma_y); + cuda_check(cudaStreamSynchronize(stream)); + + static const int BX = 16; + static const int BY = 16; + dim3 threads(BX, BY); + dim3 blocks(DIVUP(W, BX), DIVUP(H, BY), N); + gaussian_blur_kern<<>>( + src, dst, N, H, W, stride0, stride1, stride2, stride3, kernel_ptr, + kernel_height, kernel_width); + after_kernel_launch(); +} + +#define INST(T, CH, bmode) \ + template void gaussian_blur( \ + const T* src, T* dst, size_t N, size_t H, size_t W, \ + size_t stride0, size_t stride1, size_t stride2, size_t stride3, \ + uint8_t*, size_t, size_t, double, double, cudaStream_t); + +#define cb(DType) \ + INST(typename DTypeTrait::ctype, 1, BORDER_REPLICATE) \ + INST(typename DTypeTrait::ctype, 3, BORDER_REPLICATE) \ + INST(typename DTypeTrait::ctype, 1, BORDER_REFLECT) \ + INST(typename DTypeTrait::ctype, 3, BORDER_REFLECT) \ + INST(typename DTypeTrait::ctype, 1, BORDER_REFLECT_101) \ + INST(typename DTypeTrait::ctype, 3, BORDER_REFLECT_101) \ + INST(typename DTypeTrait::ctype, 1, BORDER_CONSTANT) \ + INST(typename DTypeTrait::ctype, 3, BORDER_CONSTANT) + +cb(dtype::Uint8); +cb(dtype::Float32); + +#undef cb +#undef INST + +} // namespace gaussian_blur +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/gaussian_blur/gaussian_blur.cuh b/dnn/src/cuda/gaussian_blur/gaussian_blur.cuh new file mode 100644 index 00000000..116ce46e --- /dev/null +++ b/dnn/src/cuda/gaussian_blur/gaussian_blur.cuh @@ -0,0 +1,33 @@ +/** + * \file dnn/src/cuda/gaussian_blur/gaussian_blur.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include +#include +#include "src/common/cv/enums.h" + +#include + +namespace megdnn { +namespace cuda { +namespace gaussian_blur { + +template +void gaussian_blur(const T* src, T* dst, size_t N, size_t H, size_t W, + size_t stride0, size_t stride1, size_t stride2, + size_t stride3, uint8_t* kernel_ptr, size_t kernel_height, + size_t kernel_width, double sigma_x, double sigma_y, + cudaStream_t stream); + +} // namespace gaussian_blur +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/gaussian_blur/opr_impl.cpp b/dnn/src/cuda/gaussian_blur/opr_impl.cpp new file mode 100644 index 00000000..49d7501a --- /dev/null +++ b/dnn/src/cuda/gaussian_blur/opr_impl.cpp @@ -0,0 +1,115 @@ +/** + * \file dnn/src/cuda/gaussian_blur/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
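When sigma is not positive, prepare_kernel above derives it from the kernel size as ((ksize - 1) * 0.5 - 1) * 0.3 + 0.8, evaluates exp(-x*x / (2*sigma*sigma)) per tap around the kernel center, and normalizes so the taps sum to one; the uint8 path additionally converts the separable factors to fixed point with a 1 << BITS scale. A host-side sketch of the one-dimensional coefficient computation under those formulas (gaussian_taps is a hypothetical helper, float output only):

#include <cmath>
#include <cstdio>
#include <vector>

// Build a normalized 1-D Gaussian of odd length `ksize`; if sigma <= 0 it is
// derived from ksize with the same heuristic used in prepare_kernel above.
std::vector<float> gaussian_taps(int ksize, double sigma) {
    if (sigma <= 0)
        sigma = ((ksize - 1) * 0.5 - 1) * 0.3 + 0.8;
    const double scale = -0.5 / (sigma * sigma);
    std::vector<float> taps(ksize);
    double sum = 0;
    for (int i = 0; i < ksize; ++i) {
        double x = i - (ksize - 1) * 0.5;   // distance from the kernel center
        taps[i] = static_cast<float>(std::exp(scale * x * x));
        sum += taps[i];
    }
    for (int i = 0; i < ksize; ++i)         // normalize so the taps sum to 1
        taps[i] = static_cast<float>(taps[i] / sum);
    return taps;
}

int main() {
    for (float t : gaussian_taps(5, /*sigma=*/0))  // sigma derived from size
        std::printf("%.4f ", t);
    std::printf("\n");
    return 0;
}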
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./opr_impl.h" +#include "./gaussian_blur.cuh" + +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" +#include "src/common/cv/common.h" +#include "src/common/cv/enums.h" +#include "src/common/cv/filter.h" +#include "src/common/utils.h" + +#include + +namespace megdnn { +namespace cuda { + +namespace intl { + +template +void gaussian_blur_exec(const ctype* src, ctype* dst, size_t N, size_t IH, + size_t IW, size_t IC, size_t stride0, size_t stride1, + size_t stride2, size_t stride3, + uint8_t* kernel_ptr, size_t kernel_height, + size_t kernel_width, double sigma_x, double sigma_y, + param::GaussianBlur::BorderMode bmode, + cudaStream_t stream) { + megdnn_assert(IC == 1_z || IC == 3_z); +#define INIT_KERN(bmode) \ + if (IC == 1) { \ + gaussian_blur::gaussian_blur( \ + src, dst, N, IH, IW, stride0, stride1, stride2, stride3, \ + kernel_ptr, kernel_height, kernel_width, sigma_x, sigma_y, \ + stream); \ + } else { \ + gaussian_blur::gaussian_blur( \ + src, dst, N, IH, IW, stride0, stride1, stride2, stride3, \ + kernel_ptr, kernel_height, kernel_width, sigma_x, sigma_y, \ + stream); \ + } + + switch (bmode) { + case param::GaussianBlur::BorderMode::BORDER_REPLICATE: + INIT_KERN(BORDER_REPLICATE); + break; + case param::GaussianBlur::BorderMode::BORDER_REFLECT: + INIT_KERN(::BorderMode::BORDER_REFLECT); + break; + case param::GaussianBlur::BorderMode::BORDER_REFLECT_101: + INIT_KERN(::BorderMode::BORDER_REFLECT_101); + break; + case param::GaussianBlur::BorderMode::BORDER_CONSTANT: + INIT_KERN(::BorderMode::BORDER_CONSTANT); + break; + default: + MegCVException("Unsupport Bordermode in GaussianBlur\n"); + } +} + +} // namespace intl + +void GaussianBlurImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst, + _megdnn_workspace workspace) { + megdnn_assert(src.layout.dtype == dtype::Uint8() || + src.layout.dtype == dtype::Float32()); + check_exec(src.layout, dst.layout, workspace.size); + + auto stream = cuda_stream(handle()); + //! 
src layout is the same as dst layout + size_t N = src.layout.shape[0]; + size_t batch_size = 0; +#define cb(DType) \ + if (src.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + ctype* src_ptr = src.ptr() + curr_batch * src.layout.stride[0]; \ + ctype* dst_ptr = dst.ptr() + curr_batch * src.layout.stride[0]; \ + batch_size = std::min(N - curr_batch, max_batch_x_channel); \ + intl::gaussian_blur_exec( \ + src_ptr, dst_ptr, batch_size, src.layout.shape[1], \ + src.layout.shape[2], src.layout.shape[3], \ + src.layout.stride[0], src.layout.stride[1], \ + src.layout.stride[2], src.layout.stride[3], \ + workspace.ptr(), m_kernel_height, m_kernel_width, \ + m_sigma_x, m_sigma_y, param().border_mode, stream); \ + } + + size_t max_batch_x_channel = max_batch_x_channel_size(); + size_t curr_batch = 0; + if (N <= max_batch_x_channel) { + cb(dtype::Uint8); + cb(dtype::Float32); + } else { + while (curr_batch < N) { + cb(dtype::Uint8); + cb(dtype::Float32); + + curr_batch += max_batch_x_channel; + } + } +#undef cb +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/gaussian_blur/opr_impl.h b/dnn/src/cuda/gaussian_blur/opr_impl.h new file mode 100644 index 00000000..df472eaa --- /dev/null +++ b/dnn/src/cuda/gaussian_blur/opr_impl.h @@ -0,0 +1,92 @@ +/** + * \file dnn/src/cuda/gaussian_blur/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +#include "src/common/cv/common.h" +#include + +namespace megdnn { +namespace cuda { + +class GaussianBlurImpl : public GaussianBlur { + public: + using GaussianBlur::GaussianBlur; + + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout&) override { + //! current only support float and uint8 + megdnn_assert(src.dtype == dtype::Float32() || + src.dtype == dtype::Uint8()); + + //! Calc gaussian kernel real size + double sigma_x = param().sigma_x; + double sigma_y = param().sigma_y; + uint32_t kernel_height = param().kernel_height; + uint32_t kernel_width = param().kernel_width; + + if (sigma_y <= 0) + sigma_y = sigma_x; + + auto get_size = [&src](double sigma) { + double num = 0; + if (src.dtype == dtype::Uint8()) { + num = sigma * 3 * 2 + 1; + } else { + num = sigma * 4 * 2 + 1; + } + return static_cast(num + (num >= 0 ? 0.5 : -0.5)) | 1; + }; + + if (kernel_width <= 0 && sigma_x > 0) { + m_kernel_width = get_size(sigma_x); + } else { + m_kernel_width = kernel_width; + } + if (kernel_height <= 0 && sigma_y > 0) { + m_kernel_height = get_size(sigma_y); + } else { + m_kernel_height = kernel_height; + } + megdnn_assert(m_kernel_width > 0 && m_kernel_width % 2 == 1 && + m_kernel_height > 0 && m_kernel_height % 2 == 1); + + m_sigma_x = std::max(sigma_x, 0.); + m_sigma_y = std::max(sigma_y, 0.); + + if (src.dtype == dtype::Uint8()) { + //! element [0, m_kernel_width * m_kernel_height - 1] store the + //! filter matrix of type int32_t, others store float value + //! kernel_x and kernel_y. 
+ return m_kernel_width * m_kernel_height * sizeof(int32_t) + + (m_kernel_width + m_kernel_height) * sizeof(float); + } else { + //! float32 + return m_kernel_width * m_kernel_height * sizeof(float); + } + } + + private: + uint32_t m_kernel_height; + uint32_t m_kernel_width; + double m_sigma_x; + double m_sigma_y; + +}; // class GaussianBlurImpl + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/group_local/bwd_data.cpp b/dnn/src/cuda/group_local/bwd_data.cpp new file mode 100644 index 00000000..17646f55 --- /dev/null +++ b/dnn/src/cuda/group_local/bwd_data.cpp @@ -0,0 +1,96 @@ +/** + * \file dnn/src/cuda/group_local/bwd_data.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/group_local/opr_impl.h" + +#include "src/common/utils.h" +#include "src/cuda/local/local.cuh" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +void GroupLocalBackwardDataImpl::exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) +{ + check_exec(filter.layout, diff.layout, grad.layout, workspace.size); + + auto G = filter.layout[0]; + auto N = grad.layout.shape[0], IC = grad.layout.shape[1]/G, + IH = grad.layout.shape[2], IW = grad.layout.shape[3], + OC = diff.layout.shape[1]/G, + OH = diff.layout.shape[2], OW = diff.layout.shape[3]; + auto FH = filter.layout.shape[4], FW = filter.layout.shape[5]; + auto PH = param().pad_h, PW = param().pad_w; + auto SH = param().stride_h, SW = param().stride_w; + float *sptr = grad.ptr(); + const float *fptr = filter.ptr(); + const float *dptr = diff.ptr(); + float *wptr = workspace.ptr(); + auto handle = concrete_handle(this->handle()); + auto stream = cuda_stream(this->handle()); + auto cublas = cublas_handle(this->handle()); + auto one = handle->one_device(); + auto zero = handle->zero_device(); + megdnn_assert(local::can_backward_data_proxy_convnet(N, IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW), + "Cannot do Group Local bwd data."); + for (size_t g = 0; g < G; ++g) { + local::backward_data_proxy_convnet(fptr + g*OH*OW*IC*FH*FW*OC, + dptr + g*OC*OH*OW, + sptr + g*IC*IH*IW, + wptr, + N, IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW, + cublas, stream, one, zero); + } +} + +GroupLocalBackwardDataImpl::GroupLocalBackwardDataImpl(Handle *handle): + GroupLocalBackwardData(handle) +{ +} + +size_t GroupLocalBackwardDataImpl::get_workspace_in_bytes( + const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad) +{ + auto G = filter[0]; + auto N = grad.shape[0], IC = grad.shape[1]/G, + IH = grad.shape[2], IW = grad.shape[3], + OC = diff.shape[1]/G, + OH = diff.shape[2], OW = diff.shape[3]; + auto FH = filter.shape[4], FW = filter.shape[5]; + auto PH = param().pad_h, PW = param().pad_w; + auto SH = param().stride_h, SW = param().stride_w; + auto res = local::get_workspace_in_floats_backward_data_proxy_convnet(N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW) * sizeof(float); + return res; +} + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git 
a/dnn/src/cuda/group_local/bwd_filter.cpp b/dnn/src/cuda/group_local/bwd_filter.cpp new file mode 100644 index 00000000..a2c69d74 --- /dev/null +++ b/dnn/src/cuda/group_local/bwd_filter.cpp @@ -0,0 +1,99 @@ +/** + * \file dnn/src/cuda/group_local/bwd_filter.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/group_local/opr_impl.h" + +#include "src/common/utils.h" + +#include "src/common/utils.h" +#include "src/cuda/local/local.cuh" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +void GroupLocalBackwardFilterImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) +{ + check_exec(src.layout, diff.layout, grad.layout, workspace.size); + + auto G = grad.layout[0]; + auto N = src.layout.shape[0], IC = src.layout.shape[1]/G, + IH = src.layout.shape[2], IW = src.layout.shape[3], + OC = diff.layout.shape[1]/G, + OH = diff.layout.shape[2], OW = diff.layout.shape[3]; + auto FH = grad.layout.shape[4], FW = grad.layout.shape[5]; + auto PH = param().pad_h, PW = param().pad_w; + auto SH = param().stride_h, SW = param().stride_w; + const float *sptr = src.ptr(); + float *fptr = grad.ptr(); + const float *dptr = diff.ptr(); + float *wptr = workspace.ptr(); + auto handle = concrete_handle(this->handle()); + auto stream = cuda_stream(this->handle()); + auto cublas = cublas_handle(this->handle()); + auto one = handle->one_device(); + auto zero = handle->zero_device(); + megdnn_assert(local::can_backward_filter_proxy_convnet(N, IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW), + "Cannot do Group Local bwd filter."); + for (size_t g = 0; g < G; ++g) { + local::backward_filter_proxy_convnet(sptr + g*IC*IH*IW, + dptr + g*OC*OH*OW, + fptr + g*OH*OW*IC*FH*FW*OC, + wptr, + N, IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW, + cublas, stream, one, zero); + } +} + +GroupLocalBackwardFilterImpl::GroupLocalBackwardFilterImpl(Handle *handle): + GroupLocalBackwardFilter(handle) +{ +} + +size_t GroupLocalBackwardFilterImpl::get_workspace_in_bytes( + const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad) +{ + auto G = grad[0]; + auto N = src.shape[0], IC = src.shape[1]/G, + IH = src.shape[2], IW = src.shape[3], + OC = diff.shape[1]/G, + OH = diff.shape[2], OW = diff.shape[3]; + auto FH = grad.shape[4], FW = grad.shape[5]; + auto PH = param().pad_h, PW = param().pad_w; + auto SH = param().stride_h, SW = param().stride_w; + auto res = local::get_workspace_in_floats_backward_filter_proxy_convnet(N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW) * sizeof(float); + return res; +} + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/group_local/cuda_interface.cu b/dnn/src/cuda/group_local/cuda_interface.cu new file mode 100644 index 00000000..301b78a3 --- /dev/null +++ b/dnn/src/cuda/group_local/cuda_interface.cu @@ -0,0 +1,145 @@ +/** + * \file dnn/src/cuda/group_local/cuda_interface.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./cuda_interface.h" + +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +// src layout is (N, G, IC, IH, IW) +// filter layout is (G, OH, OW, IC, FH, FW, OC) +// dst layout is (N, G, OC, OH, OW) +// NR_THREADS is 256 +// gridDim.z is G +// gridDim.y is OC*OH*OW/NR_THREADS +// gridDim.x is N/NB +// blockDim.x is NR_THREADS + +// INs and ONs are the stride on the src/dst batch size dim +// IC and OC are nr. channels per group + +// Each thread tackles with NB (actually NB_cur if non-multiple-of-NB N is considered). +// Let oid = blockIdx.y*NR_THREADS + threadIdx.x (global thread ID along block +// axis y), and we flatten (OC, OH, OW) into one dimension, then each thread +// calculates the answer at dst position (n, blockIdx.z, oid), where n ranges +// from blockDim.x*NB + 0 to blockDim.x*NB + (NB-1). +// IC is processed at stride of ICB. On entrance of each iteration of the loop, +// NB * ICB spatial src planes are loaded into shared memory (presumably src +// spatial size is small). +template +__global__ void forward_kernel(const float * __restrict__ src, + const float * __restrict__ filter, + float * __restrict__ dst, + uint32_t N, + uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t OC, uint32_t OH, uint32_t OW, + uint32_t FH, uint32_t FW, + uint32_t G, + uint32_t INs, uint32_t ONs, + uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW) +{ + // NB * ICB * sizeof(float) * IH * IW + extern __shared__ float shared_mem[]; + float *src_cache = shared_mem; + uint32_t tid = threadIdx.x; + uint32_t tstride = blockDim.x; + uint32_t oid = tid + blockIdx.y * tstride; + src += blockIdx.x*NB * INs + blockIdx.z*IC*IH*IW; + dst += blockIdx.x*NB * ONs + blockIdx.z*OC*OH*OW; + filter += blockIdx.z*OH*OW*IC*FH*FW*OC; + uint32_t op = oid / OC; + uint32_t oc = oid % OC; + uint32_t oh = op / OW; + uint32_t ow = op % OW; + float dst_reg[NB]; + for (uint32_t nb = 0; nb < NB; ++nb) dst_reg[nb] = 0.0f; + uint32_t NB_cur = min(N-blockIdx.x*NB, NB); + for (uint32_t ic = 0; ic < IC; ic += ICB) { + // read ICB-channel src + // (NB, ICB, IHs, IWs) + uint32_t ICB_cur = min(ICB, IC-ic); + for (uint32_t i = tid; i < NB_cur*ICB*IH*IW; i += tstride) { + uint32_t ip = i % (IH*IW); + uint32_t icb = i / (IH*IW) % ICB; + uint32_t nb = i / (IH*IW) / ICB; + src_cache[i] = + (icb < ICB_cur) * src[nb*INs + min(IC-1, (ic+icb))*IH*IW + ip]; + } + __syncthreads(); + if (oid < OC*OH*OW) + for (uint32_t fh = 0; fh < FH; ++fh) + { + uint32_t ih; + if (is_xcorr) ih = oh*SH + fh - PH; else ih = oh*SH + (FH-fh-1) - PH; + if (ih < IH) + for (uint32_t fw = 0; fw < FW; ++fw) + { + uint32_t iw; + if (is_xcorr) iw = ow*SW + fw - PW; else iw = ow*SW + (FW-fw-1) - PW; + if (iw < IW) + for (uint32_t icb = 0; icb < ICB_cur; ++icb) { + uint32_t fid = op*IC*FH*FW*OC + (ic+icb)*FH*FW*OC + + fh*FW*OC + fw*OC + oc; + float fval = filter[fid]; + float src_reg[NB]; +#pragma unroll + for (uint32_t nb = 0; nb < NB; ++nb) { + src_reg[nb] = src_cache[nb*ICB*IH*IW + icb*IH*IW + ih*IW + iw]; + } +#pragma unroll + for (uint32_t nb = 0; nb < NB; ++nb) { + dst_reg[nb] += src_reg[nb]*fval; + } + } + } + } + __syncthreads(); + } + if (oid < OC*OH*OW) { + for (uint32_t nb = 0; nb < NB_cur; ++nb) { + dst[nb*ONs + oc*OH*OW + op] = dst_reg[nb]; + } + } +} + +void run_inference_kernel(const float *src, const float 
*filter, float *dst, + float *wptr, + uint32_t N, uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t OC, uint32_t OH, uint32_t OW, + uint32_t FH, uint32_t FW, + uint32_t G, + uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, + cudaStream_t stream) +{ + MEGDNN_MARK_USED_VAR(wptr); + size_t threads = 256; + const size_t NB = 4, ICB = 4; + dim3 blocks = dim3(DIVUP(N, NB), DIVUP(OC*OH*OW, threads), G); + uint32_t INs = G*IC*IH*IW, ONs = G*OC*OH*OW; + forward_kernel<<>>(src, filter, dst, + N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + G, + INs, ONs, + PH, PW, + SH, SW); + after_kernel_launch(); +} + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/group_local/cuda_interface.h b/dnn/src/cuda/group_local/cuda_interface.h new file mode 100644 index 00000000..bcd0c5f2 --- /dev/null +++ b/dnn/src/cuda/group_local/cuda_interface.h @@ -0,0 +1,31 @@ +/** + * \file dnn/src/cuda/group_local/cuda_interface.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include +#include + +namespace megdnn { +namespace cuda { + +void run_inference_kernel(const float *src, const float *filter, float *dst, + float *wptr, + uint32_t N, uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t OC, uint32_t OH, uint32_t OW, + uint32_t FH, uint32_t FW, + uint32_t G, + uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, + cudaStream_t stream); + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/group_local/fwd.cpp b/dnn/src/cuda/group_local/fwd.cpp new file mode 100644 index 00000000..f418be9a --- /dev/null +++ b/dnn/src/cuda/group_local/fwd.cpp @@ -0,0 +1,151 @@ +/** + * \file dnn/src/cuda/group_local/fwd.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
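// Worked launch geometry for the forward_kernel/run_inference_kernel pair
// above (the numbers are only an example): with N = 8, G = 2, IC = OC = 8,
// IH = IW = OH = OW = 7 and NB = ICB = 4, threads = 256, the grid is
// dim3(DIVUP(8, 4), DIVUP(8 * 7 * 7, 256), 2) = dim3(2, 2, 2); each block
// caches NB * ICB * IH * IW floats = 4 * 4 * 49 * 4 bytes = 3136 bytes of
// dynamic shared memory, and every thread with oid < OC * OH * OW = 392
// accumulates NB = 4 batch results in registers before writing them out.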
+ */ +#include "src/cuda/group_local/opr_impl.h" + +#include "src/common/utils.h" +#include "src/cuda/local/local.cuh" +#include "src/cuda/utils.h" + +#include "./cuda_interface.h" + +namespace megdnn { +namespace cuda { + +void GroupLocalForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) +{ + megdnn_assert(src.layout.dtype == dtype::Float32(), + "cuda do not support fp16 group local operator"); + check_exec(src.layout, filter.layout, dst.layout, workspace.size); + + auto G = filter.layout[0]; + auto N = src.layout.shape[0], IC = src.layout.shape[1]/G, + IH = src.layout.shape[2], IW = src.layout.shape[3], + OC = dst.layout.shape[1]/G, + OH = dst.layout.shape[2], OW = dst.layout.shape[3]; + auto FH = filter.layout.shape[4], FW = filter.layout.shape[5]; + auto PH = param().pad_h, PW = param().pad_w; + auto SH = param().stride_h, SW = param().stride_w; + const float *sptr = src.ptr(); + const float *fptr = filter.ptr(); + float *dptr = dst.ptr(); + float *wptr = workspace.ptr(); + auto handle = concrete_handle(this->handle()); + auto stream = cuda_stream(this->handle()); + auto cublas = cublas_handle(this->handle()); + auto one = handle->one_device(); + auto zero = handle->zero_device(); + if (prefer_inference_kernel(src.layout, filter.layout, dst.layout)) { + run_inference_kernel(sptr, fptr, dptr, wptr, + N, IC, IH, IW, + OC, OH, OW, + FH, FW, + G, + PH, PW, + SH, SW, + stream + ); + } else if (local::can_forward_proxy_convnet(N, IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW)) + { + // use convnet + for (size_t g = 0; g < G; ++g) { + local::forward_proxy_convnet(sptr + g*IC*IH*IW, + fptr + g*OH*OW*IC*FH*FW*OC, + dptr + g*OC*OH*OW, + wptr, + N, IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW, + cublas, stream, one, zero); + } + } else { + local::check_input(N, IC, IH, IW, OC, OH, OW, FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW, + true); + // do not use convnet + for (size_t g = 0; g < G; ++g) { + local::forward_proxy_weiming(sptr + g*IC*IH*IW, + fptr + g*OH*OW*IC*FH*FW*OC, + dptr + g*OC*OH*OW, + N, IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW, + true, stream); + } + } +} + +GroupLocalForwardImpl::GroupLocalForwardImpl(Handle *handle): + GroupLocalForward(handle) +{ +} + +size_t GroupLocalForwardImpl::get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) +{ + auto G = filter[0]; + auto N = src.shape[0], IC = src.shape[1]/G, + IH = src.shape[2], IW = src.shape[3], + OC = dst.shape[1]/G, + OH = dst.shape[2], OW = dst.shape[3]; + auto FH = filter.shape[4], FW = filter.shape[5]; + auto PH = param().pad_h, PW = param().pad_w; + auto SH = param().stride_h, SW = param().stride_w; + if (prefer_inference_kernel(src, filter, dst)) { + return 0; + } else if (local::can_forward_proxy_convnet(N, IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW)) + { + auto res = local::get_workspace_in_floats_forward_proxy_convnet(N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + G*IC*IH*IW, G*OC*OH*OW, + PH, PW, + SH, SW) * sizeof(float); + return res; + } else { + return 0; + } +} + +bool GroupLocalForwardImpl::prefer_inference_kernel(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) +{ + megdnn_ignore(filter); + megdnn_ignore(dst); + return src.shape[0] <= 8; +} + +} // namespace cuda +} // namespace megdnn +// vim: 
syntax=cpp.doxygen diff --git a/dnn/src/cuda/group_local/opr_impl.h b/dnn/src/cuda/group_local/opr_impl.h new file mode 100644 index 00000000..33dacf68 --- /dev/null +++ b/dnn/src/cuda/group_local/opr_impl.h @@ -0,0 +1,59 @@ +/** + * \file dnn/src/cuda/group_local/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs/nn.h" + +namespace megdnn { +namespace cuda { + +class GroupLocalForwardImpl: public GroupLocalForward { + public: + GroupLocalForwardImpl(Handle *handle); + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) override; + private: + bool prefer_inference_kernel(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst); +}; + +class GroupLocalBackwardDataImpl: public GroupLocalBackwardData { + public: + GroupLocalBackwardDataImpl(Handle *handle); + void exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad) override; +}; + +class GroupLocalBackwardFilterImpl: public GroupLocalBackwardFilter { + public: + GroupLocalBackwardFilterImpl(Handle *handle); + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad) override; +}; + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/handle.cpp b/dnn/src/cuda/handle.cpp new file mode 100644 index 00000000..bc909c95 --- /dev/null +++ b/dnn/src/cuda/handle.cpp @@ -0,0 +1,132 @@ +/** + * \file dnn/src/cuda/handle.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/handle_impl.h" +#include "src/common/version_symbol.h" + +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +#include +#include + +#define STR_HELPER(x) #x +#define STR(x) STR_HELPER(x) + +#define CUDNN_VERSION_STR STR(CUDNN_MAJOR) "." STR(CUDNN_MINOR) "." STR(CUDNN_PATCHLEVEL) + +#pragma message "compile with cuDNN " CUDNN_VERSION_STR " " + +static_assert(!(CUDNN_MAJOR == 5 && CUDNN_MINOR == 1), + "cuDNN 5.1.x series has bugs. 
Use 5.0.x instead."); + +#undef STR +#undef STR_HELPER + +namespace megdnn { +namespace cuda { + +HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle): + HandleImplHelper(comp_handle, HandleType::CUDA) +{ + // Get megcore device handle + megcoreDeviceHandle_t dev_handle; + megcoreGetDeviceHandle(comp_handle, &dev_handle); + int dev_id; + megcoreGetDeviceID(dev_handle, &dev_id); + if (dev_id < 0) { + cuda_check(cudaGetDevice(&dev_id)); + } + m_device_id = dev_id; + cuda_check(cudaGetDeviceProperties(&m_device_prop, dev_id)); + // Get stream from MegCore computing handle. + megdnn_assert(CUDNN_VERSION == cudnnGetVersion(), + "cudnn version mismatch: compiled with %d; detected %zu at runtime", + CUDNN_VERSION, cudnnGetVersion()); +#if CUDA_VERSION >= 10010 + megdnn_assert(cublasLtGetVersion() >= 10010, + "cuda library version is too low to run cublasLt"); +#endif + cudnn_check(cudnnCreate(&m_cudnn_handle)); + cublas_check(cublasCreate(&m_cublas_handle)); +#if CUDA_VERSION >= 10010 + cublas_check(cublasLtCreate(&m_cublasLt_handle)); +#endif + megcore::getCUDAContext(comp_handle, &m_megcore_context); + + // Set stream for cuDNN and cublas handles. + cudnn_check(cudnnSetStream(m_cudnn_handle, stream())); + cublas_check(cublasSetStream(m_cublas_handle, stream())); + + // Note that all cublas scalars (alpha, beta) and scalar results such as dot + // output resides at device side. + cublas_check(cublasSetPointerMode(m_cublas_handle, + CUBLAS_POINTER_MODE_DEVICE)); + + // init const scalars + cuda_check(cudaMalloc(&m_const_scalars, sizeof(ConstScalars))); + ConstScalars const_scalars_val; + const_scalars_val.init(); + cuda_check(cudaMemcpyAsync(m_const_scalars, &const_scalars_val, + sizeof(ConstScalars), cudaMemcpyHostToDevice, stream())); + cuda_check(cudaStreamSynchronize(stream())); + + // check tk1 + m_is_tegra_k1 = (strcmp(m_device_prop.name, "GK20A") == 0); + m_cusolver_handle = nullptr; +} + +HandleImpl::~HandleImpl() noexcept { + cudnn_check(cudnnDestroy(m_cudnn_handle)); + cublas_check(cublasDestroy(m_cublas_handle)); +#if CUDA_VERSION >= 10010 + cublas_check(cublasLtDestroy(m_cublasLt_handle)); +#endif + if (m_cusolver_handle) { + cusolver_check(cusolverDnDestroy(m_cusolver_handle)); + } + cuda_check(cudaFree(m_const_scalars)); +} + +void HandleImpl::ConstScalars::init() { + f16[0].megdnn_x = 0; f16[1].megdnn_x = 1; + f32[0] = 0; f32[1] = 1; + i32[0] = 0; i32[1] = 1; +} + +size_t HandleImpl::alignment_requirement() const { + auto &&prop = m_device_prop; + return std::max(prop.textureAlignment, prop.texturePitchAlignment); +} + +bool HandleImpl::check_cross_dev_copy_constraint(const TensorLayout& src) { + // is contiguous or can be hold by + // relayout::param::try_copy_2d/try_copy_last_contig + return src.is_contiguous() || src.stride[src.ndim - 1] == 1; +} + +void HandleImpl::initialize_cusolver() { + cusolver_check(cusolverDnCreate(&m_cusolver_handle)); + cusolver_check(cusolverDnSetStream(m_cusolver_handle, stream())); +} + +size_t HandleImpl::image2d_pitch_alignment() const { + size_t align = device_prop().texturePitchAlignment; + return align; +} + +} // namespace cuda +} // namespace megdnn + +MEGDNN_VERSION_SYMBOL(CUDA, CUDA_VERSION); +MEGDNN_VERSION_SYMBOL3(CUDNN, CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/handle.h b/dnn/src/cuda/handle.h new file mode 100644 index 00000000..9aa6fdfb --- /dev/null +++ b/dnn/src/cuda/handle.h @@ -0,0 +1,164 @@ +/** + * \file dnn/src/cuda/handle.h + * MegEngine is Licensed under the 
Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megcore_cuda.h" +#include "megdnn/basic_types.h" +#include "megdnn/handle.h" +#include "megdnn/oprs/general.h" + +#include "src/common/utils.h" +#include "src/common/handle_impl.h" +#include "src/cuda/cudnn_with_check.h" + +#include +#include +#include +#include +#include + +#include +#if CUDA_VERSION >= 10010 +#include +#endif + +namespace megdnn { +namespace cuda { + +class HandleImpl: public HandleImplHelper { + public: + HandleImpl(megcoreComputingHandle_t computing_handle); + ~HandleImpl() noexcept; + + size_t alignment_requirement() const override; + + bool check_cross_dev_copy_constraint(const TensorLayout &src) override; + + const cudaDeviceProp& device_prop() const { + return m_device_prop; + } + + template + std::unique_ptr create_operator(); + + const megcore::CudaContext& megcore_context() const { + return m_megcore_context; + } + + int device_id() const { return m_device_id; } + + cudaStream_t stream() const { + return megcore_context().stream; + } + cudnnHandle_t cudnn_handle() { + return m_cudnn_handle; + } + cublasHandle_t cublas_handle() { + return m_cublas_handle; + } +#if CUDA_VERSION >= 10010 + cublasLtHandle_t cublasLt_handle() { + return m_cublasLt_handle; + } +#endif + cusolverDnHandle_t cusolver_handle() { + std::call_once(m_cusolver_initialized, + [this] { initialize_cusolver(); }); + return m_cusolver_handle; + } + dt_float32 *zero_device() { + return &m_const_scalars->f32[0]; + } + dt_float32 *one_device() { + return &m_const_scalars->f32[1]; + } + __half* zero_device_h() { + return &m_const_scalars->f16[0].cuda_x; + } + __half* one_device_h() { + return &m_const_scalars->f16[1].cuda_x; + } + dt_int32 *zero_device_i32() { + return &m_const_scalars->i32[0]; + } + dt_int32 *one_device_i32() { + return &m_const_scalars->i32[1]; + } + + bool is_tegra_k1() const { + return m_is_tegra_k1; + } + + //! global matmul opr + MatrixMul* matmul_opr() override final { + return get_helper_opr(this); + } + + //! global matmul opr with first operand transposed + MatrixMul* matmul_aT_opr() override final { + return get_helper_opr(this, {true, false}); + } + + //! global matmul opr with second operand transposed + MatrixMul* matmul_bT_opr() override final { + return get_helper_opr(this, {false, true}); + } + + //! global relayout opr + Relayout* relayout_opr() override final { + return get_helper_opr(this); + } + + BatchedMatrixMulForward* batched_matrix_mul() { + return get_helper_opr(this); + } + + TypeCvt* typecvt_opr() { return get_helper_opr(this); } + + size_t image2d_pitch_alignment() const override; + private: + bool m_is_tegra_k1; + int m_device_id; + //! MegDNN handle does not manage the lifetime of CUDA stream. 
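        // The zero_device()/one_device() family returns *device* pointers into
        // m_const_scalars because the cuBLAS handle is created with
        // CUBLAS_POINTER_MODE_DEVICE (see handle.cpp above): alpha/beta and
        // scalar results live in device memory, so callers pass e.g.
        // one_device()/zero_device() as GEMM scaling factors instead of
        // host-side 1.0f/0.0f literals.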
+ megcore::CudaContext m_megcore_context; + + cudnnHandle_t m_cudnn_handle; + cublasHandle_t m_cublas_handle; +#if CUDA_VERSION >= 10010 + cublasLtHandle_t m_cublasLt_handle; +#endif + cusolverDnHandle_t m_cusolver_handle; + std::once_flag m_cusolver_initialized; + + cudaDeviceProp m_device_prop; + + struct ConstScalars { + union FP16 { + __half cuda_x; + dt_float16 megdnn_x; + FP16() {} + }; + static_assert(sizeof(FP16) == 2, "bad FP16 size"); + FP16 f16[2]; + dt_float32 f32[2]; + dt_int32 i32[2]; + void init(); + }; + + //! device ptr to const scalars + ConstScalars* m_const_scalars; + + void initialize_cusolver(); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/handle_create.cpp b/dnn/src/cuda/handle_create.cpp new file mode 100644 index 00000000..890a21ce --- /dev/null +++ b/dnn/src/cuda/handle_create.cpp @@ -0,0 +1,86 @@ +/** + * \file dnn/src/cuda/handle_create.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/handle_impl.h" + +#include "src/cuda/add_update/opr_impl.h" +#include "src/cuda/argmxx/opr_impl.h" +#include "src/cuda/argsort/opr_impl.h" +#include "src/cuda/batch_normalization/opr_impl.h" +#include "src/cuda/batched_matrix_mul/opr_impl.h" +#include "src/cuda/checksum/opr_impl.h" +#include "src/cuda/concat/opr_impl.h" +#include "src/cuda/cond_take/opr_impl.h" +#include "src/cuda/conv_bias/opr_impl.h" +#include "src/cuda/convolution/opr_impl.h" +#include "src/cuda/convolution3d/opr_impl.h" +#include "src/cuda/convpooling/opr_impl.h" +#include "src/cuda/cumsum/opr_impl.h" +#include "src/cuda/cvt_color/opr_impl.h" +#include "src/cuda/deformable_conv/opr_impl.h" +#include "src/cuda/deformable_ps_roi_pooling/opr_impl.h" +#include "src/cuda/dot/opr_impl.h" +#include "src/cuda/elemwise/opr_impl.h" +#include "src/cuda/elemwise_multi_type/opr_impl.h" +#include "src/cuda/eye/opr_impl.h" +#include "src/cuda/flip/opr_impl.h" +#include "src/cuda/gaussian_blur/opr_impl.h" +#include "src/cuda/group_local/opr_impl.h" +#include "src/cuda/images2neibs/opr_impl.h" +#include "src/cuda/indexing_multi_axis_vec/opr_impl.h" +#include "src/cuda/indexing_one_hot/opr_impl.h" +#include "src/cuda/linspace/opr_impl.h" +#include "src/cuda/local/opr_impl.h" +#include "src/cuda/local_share/opr_impl.h" +#include "src/cuda/lrn/opr_impl.h" +#include "src/cuda/mask_conv/opr_impl.h" +#include "src/cuda/matrix_inverse/opr_impl.h" +#include "src/cuda/matrix_mul/opr_impl.h" +#include "src/cuda/max_tensor_diff/opr_impl.h" +#include "src/cuda/mesh_indexing/opr_impl.h" +#include "src/cuda/param_pack/opr_impl.h" +#include "src/cuda/pooling/opr_impl.h" +#include "src/cuda/powc/opr_impl.h" +#include "src/cuda/reduce/opr_impl.h" +#include "src/cuda/relayout/opr_impl.h" +#include "src/cuda/relayout_format/opr_impl.h" +#include "src/cuda/repeat/opr_impl.h" +#include "src/cuda/resize/opr_impl.h" +#include "src/cuda/rng/opr_impl.h" +#include "src/cuda/roi_copy/opr_impl.h" +#include "src/cuda/roi_pooling/opr_impl.h" +#include "src/cuda/rotate/opr_impl.h" +#include "src/cuda/separable_conv/opr_impl.h" +#include "src/cuda/separable_filter/opr_impl.h" +#include "src/cuda/sleep/opr_impl.h" +#include "src/cuda/split/opr_impl.h" 
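// Each header included here provides the <Opr>Impl class that
// MEGDNN_FOREACH_OPR_CLASS(MEGDNN_SPECIALIZE_CREATE_OPERATOR) below turns into
// a create_operator<Opr>() specialization on HandleImpl, conceptually:
//     template <>
//     std::unique_ptr<ConvolutionForward>
//     HandleImpl::create_operator<ConvolutionForward>() {
//         return std::make_unique<ConvolutionForwardImpl>(this);
//     }
// (conceptual sketch only; the actual macro comes from the common handle
// implementation header included above).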
+#include "src/cuda/svd/opr_impl.h" +#include "src/cuda/tensor_remap/opr_impl.h" +#include "src/cuda/tile/opr_impl.h" +#include "src/cuda/topk/opr_impl.h" +#include "src/cuda/transpose/opr_impl.h" +#include "src/cuda/type_cvt/opr_impl.h" +#include "src/cuda/warp_affine/opr_impl.h" +#include "src/cuda/warp_perspective/opr_impl.h" +#include "src/cuda/winograd_filter_preprocess/opr_impl.h" +#include "src/cuda/local_share/opr_impl.h" +#include "src/cuda/roi_align/opr_impl.h" +#include "src/cuda/batch_conv_bias/opr_impl.h" + +namespace megdnn { +namespace cuda { + +MEGDNN_FOREACH_OPR_CLASS(MEGDNN_SPECIALIZE_CREATE_OPERATOR) + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/images2neibs/kernel.cu b/dnn/src/cuda/images2neibs/kernel.cu new file mode 100644 index 00000000..0cc8f1f7 --- /dev/null +++ b/dnn/src/cuda/images2neibs/kernel.cu @@ -0,0 +1,130 @@ +/** + * \file dnn/src/cuda/images2neibs/kernel.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/images2neibs/kernel.cuh" + +#include "megdnn/dtype.h" +#include "src/cuda/utils.cuh" +#include + +namespace megdnn { +namespace cuda { +namespace images2neibs { + + +#define grid_y_max 512 + +template +__global__ void forward_kernel(const T *src, T *dst, + int N, int C, int IH, int IW, int OH, int OW, + int ph, int pw, int sh, int sw, int WH, int WW) +{ + int NC = N * C; + int WP = WH*WW; + for (int wp = threadIdx.x; wp < WP; wp += blockDim.x) { + int nc = blockIdx.y; + while (nc < NC) { + int wh = wp / WW; + int ww = wp % WW; + int op = threadIdx.y + blockIdx.x * blockDim.y; + if (op < OH * OW) { + int oh = op / OW; + int ow = op % OW; + int ih = -ph + sh * oh + wh; + int iw = -pw + sw * ow + ww; + int dst_pos = nc * OH * OW * WH * WW + op * WH * WW + wp; + int src_pos = nc * IH * IW + ih * IW + iw; + dst[dst_pos] = (ih >= 0 && ih < IH && iw >= 0 && iw < IW) + ? 
src[src_pos] + : 0.0f; + } + nc += grid_y_max; + } + } +} + +template +void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW, + int ph, int pw, int sh, int sw, int wh, int ww, + cudaStream_t stream) { + int spatial_size = OH * OW; + int kernel_size = wh * ww; + int tx = min(NR_THREADS, kernel_size); + int ty = NR_THREADS / tx; + megdnn_assert(ty > 0); + int bx = DIVUP(spatial_size, ty); + int by = N * C; + + forward_kernel<<>>(src, dst, N, C, IH, IW, OH, OW, ph, pw, sh, sw, + wh, ww); + after_kernel_launch(); +} + +#undef grid_y_max + +template +__global__ void backward_kernel(const T *diff, T *grad, + int N, int C, int IH, int IW, int OH, int OW, + int ph, int pw, int sh, int sw, int WH, int WW) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id < N*C*IH*IW) { + int nc = id / (IH*IW); + int ih = id % (IH*IW) / IW; + int iw = id % (IH*IW) % IW; + grad[nc*IH*IW + ih*IW + iw] = 0.0f; + int oh_max = min((ih+ph) / sh, OH-1); + int oh_min = max((ih+ph-(WH-1)+sh-1) / sh, 0); + int ow_max = min((iw+pw) / sw, OW-1); + int ow_min = max((iw+pw-(WW-1)+sw-1) / sw, 0); + for (int oh = oh_min; oh <= oh_max; ++oh) + for (int ow = ow_min; ow <= ow_max; ++ow) + { + int wh = ih+ph - sh*oh; + int ww = iw+pw - sw*ow; + grad[nc*IH*IW + ih*IW + iw] += + diff[nc*OH*OW*WH*WW + oh*OW*WH*WW + ow*WH*WW + + wh*WW + ww]; + } + } +} + +template +void backward(const T *diff, T *grad, + int N, int C, int IH, int IW, int OH, int OW, + int ph, int pw, int sh, int sw, int wh, int ww, + cudaStream_t stream) +{ + int threads = NR_THREADS; + int blocks = DIVUP(N*C*IH*IW, threads); + backward_kernel<<>>(diff, grad, + N, C, IH, IW, OH, OW, + ph, pw, sh, sw, wh, ww); + after_kernel_launch(); +} + +#define INST(T) \ + template void forward(const T *, T *, int, int, int, int, int, int, \ + int, int, int, int, int, int, \ + cudaStream_t); \ + template void backward(const T *, T *, int, int, int, int, int, int, \ + int, int, int, int, int, int, \ + cudaStream_t); +#define cb(DType) \ + INST(DTypeTrait::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +} // namespace images2neibs +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/images2neibs/kernel.cuh b/dnn/src/cuda/images2neibs/kernel.cuh new file mode 100644 index 00000000..7d2c614e --- /dev/null +++ b/dnn/src/cuda/images2neibs/kernel.cuh @@ -0,0 +1,34 @@ +/** + * \file dnn/src/cuda/images2neibs/kernel.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
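// For backward_kernel in kernel.cu above: an input pixel (ih, iw) is covered
// by output window (oh, ow) exactly when 0 <= ih + ph - sh * oh < WH and
// 0 <= iw + pw - sw * ow < WW, which rearranges to the clamped ranges
//     oh in [ceil((ih + ph - WH + 1) / sh), (ih + ph) / sh]  intersect [0, OH - 1]
//     ow in [ceil((iw + pw - WW + 1) / sw), (iw + pw) / sw]  intersect [0, OW - 1]
// computed there as oh_min/oh_max and ow_min/ow_max, so the gradient at
// (ih, iw) is the sum of diff over just those windows.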
+ */ +#pragma once +#include + +namespace megdnn { +namespace cuda { +namespace images2neibs { + +template +void forward(const T *src, T *dst, + int N, int C, int IH, int IW, int OH, int OW, + int ph, int pw, int sh, int sw, int wh, int ww, + cudaStream_t stream); + +template +void backward(const T *diff, T *grad, + int N, int C, int IH, int IW, int OH, int OW, + int ph, int pw, int sh, int sw, int wh, int ww, + cudaStream_t stream); + +} // namespace images2neibs +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/images2neibs/opr_impl.cpp b/dnn/src/cuda/images2neibs/opr_impl.cpp new file mode 100644 index 00000000..a8dee41b --- /dev/null +++ b/dnn/src/cuda/images2neibs/opr_impl.cpp @@ -0,0 +1,74 @@ +/** + * \file dnn/src/cuda/images2neibs/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/images2neibs/opr_impl.h" + +#include "src/cuda/utils.h" +#include "src/cuda/images2neibs/kernel.cuh" + +namespace megdnn { +namespace cuda { + +void Images2NeibsForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) +{ + check_exec(src.layout, dst.layout, workspace.size); + auto stream = cuda_stream(handle()); + int N = src.layout[0], C = src.layout[1], + IH = src.layout[2], IW = src.layout[3]; + int OH = dst.layout[2], OW = dst.layout[3]; + int ph = param().pad_h, pw = param().pad_w; + int sh = param().stride_h, sw = param().stride_w; + int wh = param().window_h, ww = param().window_w; +#define cb(DType) \ + if (src.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using T = DTypeTrait::ctype; \ + images2neibs::forward(src.ptr(), dst.ptr(), \ + N, C, IH, IW, OH, OW, \ + ph, pw, sh, sw, wh, ww, \ + stream); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb); +#undef cb + megdnn_assert_internal(0); +} + +void Images2NeibsBackwardImpl::exec(_megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) +{ + check_exec(diff.layout, grad.layout, workspace.size); + auto stream = cuda_stream(handle()); + int N = grad.layout[0], C = grad.layout[1], + IH = grad.layout[2], IW = grad.layout[3]; + int OH = diff.layout[2], OW = diff.layout[3]; + int ph = param().pad_h, pw = param().pad_w; + int sh = param().stride_h, sw = param().stride_w; + int wh = param().window_h, ww = param().window_w; +#define cb(DType) \ + if (diff.layout.dtype == DType()) { \ + using T = DTypeTrait::ctype; \ + images2neibs::backward(diff.ptr(), grad.ptr(), \ + N, C, IH, IW, OH, OW, \ + ph, pw, sh, sw, wh, ww, \ + stream); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb); +#undef cb + megdnn_assert_internal(0); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/images2neibs/opr_impl.h b/dnn/src/cuda/images2neibs/opr_impl.h new file mode 100644 index 00000000..beefbd19 --- /dev/null +++ b/dnn/src/cuda/images2neibs/opr_impl.h @@ -0,0 +1,45 @@ +/** + * \file dnn/src/cuda/images2neibs/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +#include + +namespace megdnn { +namespace cuda { + +class Images2NeibsForwardImpl: public Images2NeibsForward { + public: + using Images2NeibsForward::Images2NeibsForward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &) override { + return 0; + } +}; + +class Images2NeibsBackwardImpl: public Images2NeibsBackward { + public: + using Images2NeibsBackward::Images2NeibsBackward; + void exec(_megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &) override { + return 0; + } +}; + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/indexing_multi_axis_vec/kern.cuh b/dnn/src/cuda/indexing_multi_axis_vec/kern.cuh new file mode 100644 index 00000000..4da3319c --- /dev/null +++ b/dnn/src/cuda/indexing_multi_axis_vec/kern.cuh @@ -0,0 +1,98 @@ +/** + * \file dnn/src/cuda/indexing_multi_axis_vec/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/arch.h" +#include "src/cuda/utils.cuh" +#include "src/cuda/int_fastdiv.cuh" +#include "src/cuda/error_info.cuh" + +namespace megdnn { +namespace cuda { +namespace indexing_multi_axis_vec { + + //! AxisIndexer equiv in kernel + struct KAxisIndexer { + int stride; + const int *ptr; + }; + + //! param for gen_offset_base + template + struct GenOffsetBaseParam { + uint32_t size; //!< number of outputs; also size of each index + int *output; //!< output ptr + KAxisIndexer indexer[nidx]; + uint32_t data_shape[nidx]; + int data_stride[nidx]; + + void* error_tracker; + megcore::AsyncErrorInfo* error_info; + }; + + //! tensor layout for fast offset computing + template + struct FastLayout { + int stride[ndim]; +#ifdef WIN32 + Uint32Fastdiv shape[ndim]; +#else + Uint32Fastdiv shape[ndim - 1]; +#endif + }; + + //! param for apply_opr + template + struct ApplyOprParam { + uint32_t tot_size; //!< total output size + + //! offset array generated by gen_offset_base for first output axis + const int *offset_base; + ctype *data, *value; + + int idx_axis; + + int value_stride; + + //! iterate on value, with strides from corresponding axes on data + FastLayout value_ly_on_data; + }; + + //! generate offset bases for first axis in the output + template + void gen_offset_base(const GenOffsetBaseParam ¶m, + cudaStream_t stream); + + struct OprAtomicIncr { +#if MEGDNN_CC_CUDA + template + __device__ static void apply(ctype &data, ctype value) { + atomicAdd(&data, value); + } +#endif + }; + + /*! 
+ * \brief forward kernel: copy data to value + * \tparam ndim numer of axes except axis_0 in data, + * range from 0 to max_ndim - 1 + */ + template + void apply_opr(const ApplyOprParam ¶m, + cudaStream_t stream); + +} // namespace indexing_multi_axis_vec +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_fwd.cu b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_fwd.cu new file mode 100644 index 00000000..2a17cfcf --- /dev/null +++ b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_fwd.cu @@ -0,0 +1,18 @@ +/** + * \file dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_fwd.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + + +#include "src/common/indexing_multi_axis_vec_kdef.h" +#define KERN_APPLY_OPR_OPR ::megdnn::indexing_multi_axis_vec_kdef::OprFwd +#include "./kern_apply_opr_impl.cuinl" + +// vim: ft=cuda syntax=cpp.doxygen + diff --git a/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_impl.cuinl b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_impl.cuinl new file mode 100644 index 00000000..a640d865 --- /dev/null +++ b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_impl.cuinl @@ -0,0 +1,83 @@ +/** + * \file dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_impl.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#ifndef KERN_APPLY_OPR_OPR +#error "must define KERN_APPLY_OPR_OPR" +#endif + +#include "./kern.cuh" +#include "megdnn/internal/defs.h" +#include "megdnn/dtype.h" +#include "src/cuda/query_blocksize.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace indexing_multi_axis_vec; + +namespace { + template + __global__ void kapply_opr(ApplyOprParam param) { + + uint32_t oidx = threadIdx.x + blockDim.x * blockIdx.x; + if (oidx < param.tot_size) { + int offset = 0, coidx = oidx; + int all_ax_idx[ndim]; +#pragma unroll + for (int i = ndim - 1; i >= 0; -- i) { + int next_coidx, ax_idx; + if (i) { + next_coidx = coidx / param.value_ly_on_data.shape[i - 1]; + ax_idx = + coidx - + (next_coidx * + param.value_ly_on_data.shape[i - 1].divisor()); + coidx = next_coidx; + } else { + ax_idx = coidx; + } + offset += param.value_ly_on_data.stride[i] * ax_idx; + all_ax_idx[i] = ax_idx; + } + offset += param.offset_base[all_ax_idx[param.idx_axis]]; + Opr::apply( + param.data[offset], + param.value[oidx * param.value_stride]); + } + } +} + +template +void indexing_multi_axis_vec::apply_opr( + const ApplyOprParam ¶m, cudaStream_t stream) { + void (*kptr)(ApplyOprParam) = kapply_opr; + int bsize = query_blocksize_for_kernel(kptr); + (*kptr) <<>> (param); +} + +namespace megdnn { +namespace cuda { +namespace indexing_multi_axis_vec { + +#define INST(_ndim, _ctype) \ + template void apply_opr<_ctype, _ndim, KERN_APPLY_OPR_OPR> \ + (const ApplyOprParam<_ctype, _ndim>&, cudaStream_t); +#define cb0(_dtype) \ + MEGDNN_FOREACH_TENSOR_NDIM(INST, DTypeTrait<_dtype>::ctype) + MEGDNN_FOREACH_COMPUTING_DTYPE(cb0) +#undef cb0 +#undef INST + +} // namespace indexing_multi_axis_vec +} // namespace cuda +} // namespace megdnn + +// vim: ft=cuda syntax=cpp.doxygen + diff --git a/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_incr.cu b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_incr.cu new file mode 100644 index 00000000..02ba2927 --- /dev/null +++ b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_incr.cu @@ -0,0 +1,42 @@ +/** + * \file dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_incr.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + + +#include "megdnn/dtype.h" + +#if !MEGDNN_DISABLE_FLOAT16 +__device__ void atomicAdd(megdnn::dt_float16 *, megdnn::dt_float16) { + __trap(); + ((int*)0)[0] = 1; +} +#endif + +__device__ void atomicAdd(megdnn::dt_int8 *, megdnn::dt_int8) { + __trap(); + ((int*)0)[0] = 1; +} + +__device__ void atomicAdd(megdnn::dt_uint8 *, megdnn::dt_uint8) { + __trap(); + ((int*)0)[0] = 1; +} + +__device__ void atomicAdd(megdnn::dt_int16 *, megdnn::dt_int16) { + __trap(); + ((int*)0)[0] = 1; +} + +#define KERN_APPLY_OPR_OPR \ + ::megdnn::cuda::indexing_multi_axis_vec::OprAtomicIncr +#include "./kern_apply_opr_impl.cuinl" + +// vim: ft=cuda syntax=cpp.doxygen + diff --git a/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_set.cu b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_set.cu new file mode 100644 index 00000000..a004c829 --- /dev/null +++ b/dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_set.cu @@ -0,0 +1,18 @@ +/** + * \file dnn/src/cuda/indexing_multi_axis_vec/kern_apply_opr_set.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + + +#include "src/common/indexing_multi_axis_vec_kdef.h" +#define KERN_APPLY_OPR_OPR ::megdnn::indexing_multi_axis_vec_kdef::OprSet +#include "./kern_apply_opr_impl.cuinl" + +// vim: ft=cuda syntax=cpp.doxygen + diff --git a/dnn/src/cuda/indexing_multi_axis_vec/kern_gen_offset_base.cu b/dnn/src/cuda/indexing_multi_axis_vec/kern_gen_offset_base.cu new file mode 100644 index 00000000..46db387a --- /dev/null +++ b/dnn/src/cuda/indexing_multi_axis_vec/kern_gen_offset_base.cu @@ -0,0 +1,69 @@ +/** + * \file dnn/src/cuda/indexing_multi_axis_vec/kern_gen_offset_base.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./kern.cuh" +#include "megdnn/internal/defs.h" +#include "src/cuda/query_blocksize.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace indexing_multi_axis_vec; + +namespace { + template + __global__ void kgen_offset_base(GenOffsetBaseParam param) { + int oidx = threadIdx.x + blockDim.x * blockIdx.x; + if (oidx < param.size) { + int offset = 0; +#pragma unroll + for (int i = 0; i < nidx; ++ i) { + int data_idx = param.indexer[i].ptr[ + param.indexer[i].stride * oidx]; + data_idx += (data_idx < 0 ? 
param.data_shape[i] : 0); + if (static_cast(data_idx) >= param.data_shape[i]) { + // cast to uint32 to handle both negative and overflow + set_async_error_info(param.error_info, param.error_tracker, + "invalid advanced indexing: " + "indexer=%d idx=%d shape=%d", + i, data_idx, param.data_shape[i]); + data_idx = 0; + } + offset += data_idx * param.data_stride[i]; + } + param.output[oidx] = offset; + } + } +} + +template +void indexing_multi_axis_vec::gen_offset_base( + const GenOffsetBaseParam ¶m, cudaStream_t stream) { + void (*kptr)(GenOffsetBaseParam) = kgen_offset_base; + int bsize = query_blocksize_for_kernel(kptr); + (*kptr) <<>> (param); +} + +namespace megdnn { +namespace cuda { +namespace indexing_multi_axis_vec { + +#define INST(_n) \ + template void gen_offset_base( \ + const GenOffsetBaseParam<_n> &, cudaStream_t); + MEGDNN_FOREACH_TENSOR_NDIM(INST) +#undef INST + +} // namespace indexing_multi_axis_vec +} // namespace cuda +} // namespace megdnn + +// vim: ft=cuda syntax=cpp.doxygen + diff --git a/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.cpp b/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.cpp new file mode 100644 index 00000000..4e864905 --- /dev/null +++ b/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.cpp @@ -0,0 +1,212 @@ +/** + * \file dnn/src/cuda/indexing_multi_axis_vec/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./opr_impl.h" +#include "./kern.cuh" + +#include "src/cuda/utils.h" +#include "src/common/indexing_multi_axis_vec_kdef.h" + +using namespace megdnn; +using namespace cuda; +using namespace indexing_multi_axis_vec; + +namespace { + class ExecImplHelper { + template + void dispatch_gen_offset_base_nidx(); + + void dispatch_gen_offset_base(); + protected: + using IndexDesc = IndexingMultiAxisVec::IndexDesc; + using ExecInfo = IndexingMultiAxisVec::ExecInfo; + + cudaStream_t m_stream; + const TensorND * const m_data; + const TensorND * const m_value; + const IndexDesc * const m_index; + const ExecInfo* const m_exec_info; + int * const m_offset_base; + TensorLayout m_value_layout_on_data; + size_t m_idx_axis; + int m_value_stride; + + public: + ExecImplHelper(const TensorND &data, const TensorND &value, + const IndexDesc &index, const Workspace &workspace, + const ExecInfo &exec_info, cudaStream_t stream); + }; + + template + class ExecImpl : public ExecImplHelper { + + void dispatch_exec(); + + template + void dispatch_exec_ctype(); + + template + void dispatch_exec_ctype_ndim(); + + public: + using ExecImplHelper::ExecImplHelper; + + void operator() () { + dispatch_exec(); + after_kernel_launch(); + } + }; +} // anonymous namespace + +ExecImplHelper::ExecImplHelper(const TensorND &data, const TensorND &value, + const IndexDesc &index, const Workspace &workspace, + const ExecInfo &exec_info, cudaStream_t stream): + m_stream{stream}, m_data{&data}, m_value{&value}, m_index{&index}, + m_exec_info{&exec_info}, m_offset_base{workspace.ptr()} +{ + safe_size_in_kern(data.layout.total_nr_elems()); + dispatch_gen_offset_base(); + + std::tie(m_value_layout_on_data, m_idx_axis) = + IndexingMultiAxisVec::get_value_iter_optimized_layout( + data.layout, value.layout, index, exec_info.idx_axis); + m_value_stride = 
exec_info.value_stride; +} + +template +void ExecImplHelper::dispatch_gen_offset_base_nidx() { + + GenOffsetBaseParam param; + param.size = m_value->layout.shape[m_exec_info->idx_axis]; + param.output = m_offset_base; + param.error_tracker = m_exec_info->error_tracker; + param.error_info = m_exec_info->error_info; + for (int i = 0; i < nidx; ++ i) { + auto &&dst = param.indexer[i]; + auto &&src = m_index->operator[](i); + megdnn_assert(src.vec.layout.ndim == 1); + dst.stride = src.vec.layout.stride[0]; + if (src.vec.layout.shape[0] == 1) { + dst.stride = 0; + } + dst.ptr = src.vec.ptr(); + param.data_shape[i] = m_data->layout.shape[src.axis]; + param.data_stride[i] = m_data->layout.stride[src.axis]; + } + gen_offset_base(param, m_stream); +} + +void ExecImplHelper::dispatch_gen_offset_base() { + switch(m_index->size()) { +#define cb(_n) case _n: return dispatch_gen_offset_base_nidx<_n>(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) +#undef cb + } + megdnn_throw("bad index size"); +} + +template +void ExecImpl::dispatch_exec() { + switch (m_data->layout.dtype.enumv()) { +#define cb(_dtype) \ + case DTypeTrait<_dtype>::enumv: \ + return dispatch_exec_ctype::ctype>(); + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + default: + megdnn_throw("bad dtype"); + } +} + +template +template +void ExecImpl::dispatch_exec_ctype() { + switch (m_value_layout_on_data.ndim) { +#define cb(_n) \ + case _n: return dispatch_exec_ctype_ndim(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) +#undef cb + default: + megdnn_throw("bad data ndim"); + } +} + +template +template +void ExecImpl::dispatch_exec_ctype_ndim() { + ApplyOprParam param; + param.tot_size = safe_size_in_kern(m_value->layout.total_nr_elems()); + param.offset_base = m_offset_base; + param.data = m_data->ptr(); + param.value = m_value->ptr(); + param.idx_axis = m_idx_axis; + param.value_stride = m_value_stride; + for (int i = 0; i < ndim; ++ i) { + param.value_ly_on_data.stride[i] = m_value_layout_on_data.stride[i]; + if (i) { + param.value_ly_on_data.shape[i - 1] = + m_value_layout_on_data.shape[i]; + } + } + apply_opr(param, m_stream); +} + + +size_t IndexingMultiAxisVecImpl::get_workspace_in_bytes(size_t dst_idx_size) { + return dst_idx_size * sizeof(int); +} + +void IndexingMultiAxisVecImpl::exec( + _megdnn_tensor_in src, const IndexDesc &index, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + auto info = check_exec(src.layout, index, dst.layout, workspace.size); + info.error_tracker = m_error_tracker; + info.error_info = async_error_info(handle()); + ExecImpl{ + src, dst, index, workspace, info, cuda_stream(handle())}(); +} + +size_t IndexingSetMultiAxisVecImpl::get_workspace_in_bytes( + size_t value_idx_size) { + return value_idx_size * sizeof(int); +} + +void IndexingSetMultiAxisVecImpl::exec( + _megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc &index, _megdnn_workspace workspace) { + auto info = check_exec(data.layout, value.layout, index, workspace.size); + info.error_tracker = m_error_tracker; + info.error_info = async_error_info(handle()); + ExecImpl{ + data, value, index, workspace, info, cuda_stream(handle())}(); +} + +size_t IndexingIncrMultiAxisVecImpl::get_workspace_in_bytes( + size_t value_idx_size) { + return value_idx_size * sizeof(int); +} + +void IndexingIncrMultiAxisVecImpl::exec( + _megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc &index, _megdnn_workspace workspace) { + MEGDNN_INC_FLOAT16( + megdnn_assert(data.layout.dtype != dtype::Float16(), + "float16 incr on cuda currently not supported")); + 
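    // The incr path applies OprAtomicIncr, i.e. atomicAdd on the destination
    // element; kern_apply_opr_incr.cu above only provides trapping atomicAdd
    // overloads for dt_float16 / dt_int8 / dt_uint8 / dt_int16, so float16 is
    // rejected here on the host before any kernel is launched.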
auto info = check_exec(data.layout, value.layout, index, workspace.size); + info.error_tracker = m_error_tracker; + info.error_info = async_error_info(handle()); + ExecImpl{data, value, index, workspace, info, + cuda_stream(handle())}(); +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.h b/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.h new file mode 100644 index 00000000..386c4214 --- /dev/null +++ b/dnn/src/cuda/indexing_multi_axis_vec/opr_impl.h @@ -0,0 +1,73 @@ +/** + * \file dnn/src/cuda/indexing_multi_axis_vec/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + + class IndexingMultiAxisVecImpl final: public IndexingMultiAxisVec { + void* m_error_tracker = nullptr; + + public: + using IndexingMultiAxisVec::IndexingMultiAxisVec; + + size_t get_workspace_in_bytes(size_t dst_idx_size) override; + + void exec(_megdnn_tensor_in src, const IndexDesc &index, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } + }; + + class IndexingSetMultiAxisVecImpl final: public IndexingSetMultiAxisVec { + void* m_error_tracker = nullptr; + + public: + using IndexingSetMultiAxisVec::IndexingSetMultiAxisVec; + + size_t get_workspace_in_bytes(size_t dst_idx_size) override; + + void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc &index, + _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } + }; + + class IndexingIncrMultiAxisVecImpl final: public IndexingIncrMultiAxisVec { + void* m_error_tracker = nullptr; + + public: + using IndexingIncrMultiAxisVec::IndexingIncrMultiAxisVec; + + size_t get_workspace_in_bytes(size_t dst_idx_size) override; + + void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc &index, + _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } + }; +} +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/indexing_one_hot/kern.cu b/dnn/src/cuda/indexing_one_hot/kern.cu new file mode 100644 index 00000000..34c84045 --- /dev/null +++ b/dnn/src/cuda/indexing_one_hot/kern.cu @@ -0,0 +1,36 @@ +/** + * \file dnn/src/cuda/indexing_one_hot/kern.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./kern.cuh" +#include "src/cuda/utils.cuh" +#include "src/cuda/elemwise_helper.cuh" + +namespace megdnn { +namespace cuda { + +#define cb(_dt) \ + typedef indexing_one_hot::OpGet::ctype, dt_int32> \ + OpGet##_dt; \ + typedef indexing_one_hot::OpSet::ctype, dt_int32> \ + OpSet##_dt; \ + INST_RUN_ELEMWISE(OpGet##_dt, void, 0); \ + INST_RUN_ELEMWISE(OpSet##_dt, void, 0); + + MEGDNN_FOREACH_DTYPE_NAME(cb) + MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) + +#undef cb + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/cuda/indexing_one_hot/kern.cuh b/dnn/src/cuda/indexing_one_hot/kern.cuh new file mode 100644 index 00000000..c6d83a5b --- /dev/null +++ b/dnn/src/cuda/indexing_one_hot/kern.cuh @@ -0,0 +1,78 @@ +/** + * \file dnn/src/cuda/indexing_one_hot/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/cuda/error_info.cuh" +#include "src/cuda/int_fastdiv.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace indexing_one_hot { + +struct KernParam { + //! stride[axis], also prod(shape[axis+1:ndim]) + Uint32Fastdiv shape_lo; + //! stride[axis-1] + uint32_t stride_hi; + + //! max value that user provide index array can give + uint32_t max_mid_index; + void* error_tracker; + AsyncErrorInfo* error_info; + + template + __device__ uint32_t get_idx(uint32_t offset, const idx_type* idx) const { + uint32_t idx0, idx1, idx2; + idx0 = offset / shape_lo; + idx2 = offset - idx0 * shape_lo.divisor(); + idx1 = idx[offset]; + if (idx1 >= max_mid_index) { + set_async_error_info(error_info, error_tracker, + "invalid IndexingOneHot: " + "offset=%d idx0=%d indexer=%d idx2=%d", + offset, idx0, idx1, idx2); + idx1 = 0; + } + return idx0 * stride_hi + idx1 * shape_lo.divisor() + idx2; + } +}; + +template +struct OpGet { + const data_type* m_src; + const idx_type* m_idx; + data_type* m_dst; + KernParam m_param; + + __device__ void operator()(uint32_t offset) { + m_dst[offset] = m_src[m_param.get_idx(offset, m_idx)]; + } +}; + +template +struct OpSet { + data_type* m_data; + const idx_type* m_idx; + const data_type* m_sub; + KernParam m_param; + + __device__ void operator()(uint32_t offset) { + m_data[m_param.get_idx(offset, m_idx)] = m_sub[offset]; + } +}; + +} // namespace indexing_one_hot +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/indexing_one_hot/opr_impl.cpp b/dnn/src/cuda/indexing_one_hot/opr_impl.cpp new file mode 100644 index 00000000..758b2ddb --- /dev/null +++ b/dnn/src/cuda/indexing_one_hot/opr_impl.cpp @@ -0,0 +1,90 @@ +/** + * \file dnn/src/cuda/indexing_one_hot/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
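// Worked example for KernParam::get_idx above: take a contiguous src of shape
// (2, 5, 3) with axis = 1, so shape_lo = stride[1] = 3, stride_hi =
// stride[0] = 15 and max_mid_index = 5. For output offset 4 (dst shape
// (2, 3)): idx0 = 4 / 3 = 1, idx2 = 4 - 1 * 3 = 1, idx1 = idx[4], and the
// element accessed is src[1 * 15 + idx1 * 3 + 1]; any idx1 >= 5 is reported
// through set_async_error_info and clamped to 0.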
+ */ + +#include "./opr_impl.h" +#include "./kern.cuh" + +#include "src/cuda/utils.h" +#include "src/cuda/elemwise_helper.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace indexing_one_hot; + +namespace { + + KernParam make_kern_param(const TensorLayout &layout, size_t axis) { + KernParam ret; + memset(&ret, 0, sizeof(ret)); + ret.shape_lo = layout.stride[axis]; + ret.stride_hi = axis > 0 ? layout.stride[axis - 1] : 1; + ret.max_mid_index = layout[axis]; + return ret; + } + +} // anonymous namespace + +void IndexingOneHotForwardImpl::exec( + _megdnn_tensor_in src, _megdnn_tensor_in index, + _megdnn_tensor_out dst, _megdnn_workspace workspace) { + check_exec(src.layout, index.layout, dst.layout, workspace.size); + ElemwiseOpParamN<0> ele_param{dst.layout.total_nr_elems()}; + auto kern_param = make_kern_param(src.layout, m_param.axis); + auto stream = cuda_stream(handle()); + kern_param.error_tracker = m_error_tracker; + kern_param.error_info = async_error_info(handle()); + +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + using Op = OpGet::ctype, dt_int32>; \ + Op op{src.ptr(), index.ptr(), dst.ptr(), \ + kern_param}; \ + return run_elemwise(ele_param, stream, op); \ + } + switch (src.layout.dtype.enumv()) { + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + default: + megdnn_throw(megdnn_mangle("bad dtype")); + } +#undef cb +} + +void IndexingSetOneHotForwardImpl::exec( + _megdnn_tensor_inout data, _megdnn_tensor_in index, + _megdnn_tensor_in sub, _megdnn_workspace workspace) { + check_exec(data.layout, index.layout, sub.layout, workspace.size); + + ElemwiseOpParamN<0> ele_param{sub.layout.total_nr_elems()}; + auto kern_param = make_kern_param(data.layout, m_param.axis); + auto stream = cuda_stream(handle()); + kern_param.error_tracker = m_error_tracker; + kern_param.error_info = async_error_info(handle()); + +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + using ctype = DTypeTrait<_dt>::ctype; \ + using Op = OpSet::ctype, dt_int32>; \ + Op op{data.ptr(), index.ptr(), sub.ptr(), \ + kern_param}; \ + return run_elemwise(ele_param, stream, op); \ + } + switch (data.layout.dtype.enumv()) { + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + default: + megdnn_throw(megdnn_mangle("bad dtype")); + } +#undef cb +} + +// vim: syntax=cpp.doxygen + + diff --git a/dnn/src/cuda/indexing_one_hot/opr_impl.h b/dnn/src/cuda/indexing_one_hot/opr_impl.h new file mode 100644 index 00000000..302a3247 --- /dev/null +++ b/dnn/src/cuda/indexing_one_hot/opr_impl.h @@ -0,0 +1,57 @@ +/** + * \file dnn/src/cuda/indexing_one_hot/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once + +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class IndexingOneHotForwardImpl final: public IndexingOneHotForward { + void* m_error_tracker = nullptr; + public: + using IndexingOneHotForward::IndexingOneHotForward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in index, + _megdnn_tensor_out dst, _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &, + const TensorLayout &) override { + return 0; + } + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } +}; + +class IndexingSetOneHotForwardImpl final: public IndexingSetOneHotForward { + void* m_error_tracker = nullptr; + public: + using IndexingSetOneHotForward::IndexingSetOneHotForward; + void exec(_megdnn_tensor_inout data, _megdnn_tensor_in index, + _megdnn_tensor_in sub, _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &, + const TensorLayout &) override { + return 0; + } + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } +}; + +} +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/int_fastdiv.cpp b/dnn/src/cuda/int_fastdiv.cpp new file mode 100644 index 00000000..055622d0 --- /dev/null +++ b/dnn/src/cuda/int_fastdiv.cpp @@ -0,0 +1,59 @@ +/** + * \file dnn/src/cuda/int_fastdiv.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + + +#include "src/cuda/int_fastdiv.cuh" +#include + +using namespace megdnn; +using namespace cuda; + +Uint32Fastdiv::Uint32Fastdiv() { + memset(this, 0, sizeof(Uint32Fastdiv)); +} + +Uint32Fastdiv& Uint32Fastdiv::operator = (uint32_t d) { + megdnn_assert(d); + m_divisor = d; + MEGDNN_CONSTEXPR uint32_t MAX_U32 = ~0u; + m_inc_dividend = 0; + m_divisor_is_not_1 = ~0u; + if (!(d & (d - 1))) { + // power of 2 + m_mul = 1u << 31; + int p = 0; + while ((1u << p) < d) + ++ p; + megdnn_assert((1u << p) == d); + m_shift = p ? p - 1 : 0; + if (d == 1) + m_divisor_is_not_1 = 0; + return *this; + } + auto n_bound = uint64_t(d / 2 + 1) * MAX_U32; + uint32_t shift = 32; + while ((1ull << shift) < n_bound) + ++ shift; + uint64_t mdst = 1ull << shift; + int64_t delta = d - mdst % d; + m_mul = mdst / d + 1; + if ((uint64_t)delta > d / 2) { + delta -= d; + -- m_mul; + m_inc_dividend = 1; + } + megdnn_assert((uint64_t)m_mul * d == mdst + delta); + megdnn_assert((uint64_t)std::abs(delta) * MAX_U32 < mdst); + m_shift = shift - 32; + return *this; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/int_fastdiv.cuh b/dnn/src/cuda/int_fastdiv.cuh new file mode 100644 index 00000000..ac12df09 --- /dev/null +++ b/dnn/src/cuda/int_fastdiv.cuh @@ -0,0 +1,204 @@ +/** + * \file dnn/src/cuda/int_fastdiv.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once + +#include "src/cuda/utils.cuh" + +#include +#include + +namespace megdnn { +namespace cuda { + +/*! + * \brief fast division for uint32 + */ +class Uint32Fastdiv { + uint32_t m_mul, m_divisor, m_divisor_is_not_1, m_inc_dividend, m_shift; + + public: + Uint32Fastdiv(); + + Uint32Fastdiv(uint32_t d) { + operator=(d); + } + + //! set the divisor to be d + Uint32Fastdiv& operator = (uint32_t d); + + //! caller must ensure that dividend would not exceed this number + static MEGDNN_CONSTEXPR uint32_t MAX_DIVIDEND = ~0u - 1; + + __device__ __forceinline__ uint32_t divisor() const { + return m_divisor; + } + + __device__ __forceinline__ uint32_t divide(uint32_t dividend) const { + uint32_t + ans_for_one = dividend & ~m_divisor_is_not_1, + dfix = dividend + m_inc_dividend, +#if MEGDNN_CC_CUDA + hi32 = __umulhi(dfix, m_mul), +#else + hi32 = ((uint64_t)dfix * m_mul) >> 32, +#endif + ans = hi32 >> m_shift; + + return (ans & m_divisor_is_not_1) | ans_for_one; + } +}; + +static __forceinline__ __device__ uint32_t +operator / (uint32_t a, const Uint32Fastdiv &d) { + return d.divide(a); +} + +static __forceinline__ __device__ uint32_t +operator % (uint32_t a, const Uint32Fastdiv &d) { + return a - d.divisor() * d.divide(a); +} + +/*! + * \brief maintain (a + k * x) / b and (a + k * x) % b for x >= 0 + * \tparam need_quotient whether quotient need to be maintained + */ +template +class StridedDivSeq; + +template<> +class StridedDivSeq { + Uint32Fastdiv m_b; + + //! k % b + uint32_t m_kr; + + //! current (a + k * x) % b + uint32_t m_r; + + public: + void host_init(uint32_t k, uint32_t b) { + m_b = b; + m_kr = k % b; + } + + //! init to k == 0 + __device__ __forceinline__ void device_init(uint32_t a) { + m_r = a % m_b; + } + + //! perform x += 1 + __device__ __forceinline__ void next() { + uint32_t b = m_b.divisor(), + r1 = m_r + m_kr, + carry_mask = (r1 < b) - 1; + m_r = r1 - (b & carry_mask); + } + + //! current remainder + __device__ __forceinline__ uint32_t r() const { + return m_r; + } +}; + +template<> +class StridedDivSeq { + Uint32Fastdiv m_b; + + //! k / b, k % b + uint32_t m_kq, m_kr; + + //! current (a + k * x) / b and (a + k * x) % b + uint32_t m_q, m_r; + + public: + void host_init(uint32_t k, uint32_t b) { + m_b = b; + m_kq = k / b; + m_kr = k % b; + } + + //! init to k == 0 + __device__ __forceinline__ void device_init(uint32_t a) { + m_q = m_b.divide(a); + m_r = a - m_b.divisor() * m_q; + } + + //! perform x += 1 + __device__ __forceinline__ void next() { + uint32_t b = m_b.divisor(), + r1 = m_r + m_kr, + carry_mask = (r1 < b) - 1; + m_q += m_kq + (r1 >= b); + m_r = r1 - (b & carry_mask); + } + + //! current quotient + __device__ __forceinline__ uint32_t q() const { + return m_q; + } + + //! current remainder + __device__ __forceinline__ uint32_t r() const { + return m_r; + } +}; + +/*! + * \brief maintain (a + k * x) / b % c for x >= 0 + */ +class StridedDivSeq2 { + Uint32Fastdiv m_b, m_c; + + //! k / b, k % b, k / b % c + uint32_t m_qkb, m_rkb, m_rkbc; + + //! current (a + k * x) % b and (a + k * x) / b % c + uint32_t m_cur_rkb, m_cur_ans; + + public: + + void host_init(uint32_t k, uint32_t b, uint32_t c) { + m_b = b; + m_c = c; + m_qkb = k / b; + m_rkb = k % b; + m_rkbc = m_qkb % c; + } + + //! init to k == 0 + __device__ __forceinline__ void device_init(uint32_t a) { + uint32_t q = m_b.divide(a); + m_cur_rkb = a - m_b.divisor() * q; + m_cur_ans = q % m_c; + } + + //! 
perform x += 1 + __device__ __forceinline__ void next() { + uint32_t b = m_b.divisor(), + c = m_c.divisor(), + rkb = m_cur_rkb + m_rkb, + carry0 = (rkb < b) - 1, + next_ans = m_cur_ans + m_rkbc + (rkb >= b), + carry1 = (next_ans < c) - 1; + m_cur_rkb = rkb - (b & carry0); + m_cur_ans = next_ans - (c & carry1); + } + + __device__ __forceinline__ uint32_t get() const { + return m_cur_ans; + } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/linspace/linspace.cu b/dnn/src/cuda/linspace/linspace.cu new file mode 100644 index 00000000..dba450f6 --- /dev/null +++ b/dnn/src/cuda/linspace/linspace.cu @@ -0,0 +1,50 @@ +/** + * \file dnn/src/cuda/linspace/linspace.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/linspace/linspace.cuh" +#include "megdnn/dtype.h" +#include "src/cuda/utils.cuh" + +namespace { + +template +__global__ void kernel(T *dst, double start, double step, uint32_t n) +{ + uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < n) { + dst[i] = T(start + step*i); + } +} + +} // anonymous namespace + +namespace megdnn { +namespace cuda { +namespace linspace { + +template +void exec_internal(T *dst, double start, double step, size_t n, + cudaStream_t stream) +{ + uint32_t threads = NR_THREADS; + uint32_t blocks = DIVUP(n, threads); + kernel<<>>(dst, start, step, n); + after_kernel_launch(); +} + +#define INST(T) template void exec_internal(T *dst, \ + double start, double step, size_t n, cudaStream_t stream); +#define cb(DType) INST(typename DTypeTrait::ctype) +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +} // namespace linspace +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/linspace/linspace.cuh b/dnn/src/cuda/linspace/linspace.cuh new file mode 100644 index 00000000..9398f986 --- /dev/null +++ b/dnn/src/cuda/linspace/linspace.cuh @@ -0,0 +1,25 @@ +/** + * \file dnn/src/cuda/linspace/linspace.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include + +namespace megdnn { +namespace cuda { +namespace linspace { + +template +void exec_internal(T *dst, double start, double step, size_t n, + cudaStream_t stream); + +} // namespace linspace +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/linspace/opr_impl.cpp b/dnn/src/cuda/linspace/opr_impl.cpp new file mode 100644 index 00000000..af796fa7 --- /dev/null +++ b/dnn/src/cuda/linspace/opr_impl.cpp @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/linspace/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/linspace/opr_impl.h" + +#include "src/cuda/linspace/linspace.cuh" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +void LinspaceImpl::exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) +{ + check_exec(dst.layout, workspace.size); + auto stream = cuda_stream(handle()); + auto n = dst.layout.total_nr_elems(); + auto step = (param().stop - param().start) / + std::max(static_cast(param().endpoint ? n-1 : n), 1.0); +#define cb(DType) \ + if (dst.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + linspace::exec_internal(dst.ptr(), \ + param().start, step, n, \ + stream); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb +} + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/linspace/opr_impl.h b/dnn/src/cuda/linspace/opr_impl.h new file mode 100644 index 00000000..17ff525a --- /dev/null +++ b/dnn/src/cuda/linspace/opr_impl.h @@ -0,0 +1,28 @@ +/** + * \file dnn/src/cuda/linspace/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class LinspaceImpl final: public Linspace { + public: + using Linspace::Linspace; + void exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &) override { + return 0; + } +}; + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/local/backward_data.cpp b/dnn/src/cuda/local/backward_data.cpp new file mode 100644 index 00000000..15a724ac --- /dev/null +++ b/dnn/src/cuda/local/backward_data.cpp @@ -0,0 +1,120 @@ +/** + * \file dnn/src/cuda/local/backward_data.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/local/opr_impl.h" + +#include "src/cuda/local/local.cuh" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { +namespace local { + +void boom_backward_data() +{ + megdnn_throw("Local bad param: cannot do backward_data by cuda_convnet"); +} + +} // namespace local +} // namespace cuda +} // namespace megdnn + +namespace megdnn { +namespace cuda { + +void LocalBackwardDataImpl::exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) +{ + check_exec(filter.layout, diff.layout, grad.layout, workspace.size); + megdnn_assert(param().mode == Mode::CROSS_CORRELATION); + auto N = grad.layout.shape[0], + IC = grad.layout.shape[1], + IH = grad.layout.shape[2], + IW = grad.layout.shape[3]; + auto OC = diff.layout.shape[1], + OH = diff.layout.shape[2], + OW = diff.layout.shape[3]; + auto FH = filter.layout.shape[3], + FW = filter.layout.shape[4]; + auto handle = concrete_handle(this->handle()); + auto stream = cuda_stream(this->handle()); + auto cublas = cublas_handle(this->handle()); + auto one = handle->one_device(); + auto zero = handle->zero_device(); + if (use_cuda_convnet(filter.layout, diff.layout, grad.layout)) { + local::backward_data_proxy_convnet(filter.ptr(), + diff.ptr(), + grad.ptr(), + reinterpret_cast(workspace.raw_ptr), + N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + param().pad_h, param().pad_w, + param().stride_h, param().stride_w, + cublas, stream, + one, zero); + } else { + local::boom_backward_data(); + } +} + +size_t LocalBackwardDataImpl::get_workspace_in_bytes(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad) +{ + auto N = grad.shape[0], + IC = grad.shape[1], IH = grad.shape[2], IW = grad.shape[3], + OC = diff.shape[1], OH = diff.shape[2], OW = diff.shape[3], + FH = filter.shape[3], FW = filter.shape[4]; + auto PH = param().pad_h, PW = param().pad_w, + SH = param().stride_h, SW = param().stride_w; + size_t res = 0u; + if (use_cuda_convnet(filter, diff, grad)) { + res = local::get_workspace_in_floats_backward_data_proxy_convnet(N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + PH, PW, + SH, SW) * sizeof(dt_float32); + } else { + local::boom_backward_data(); + } + return res; +} + +bool LocalBackwardDataImpl::use_cuda_convnet(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad) +{ + auto N = grad.shape[0], + IC = grad.shape[1], IH = grad.shape[2], IW = grad.shape[3], + OC = diff.shape[1], OH = diff.shape[2], OW = diff.shape[3], + FH = filter.shape[3], FW = filter.shape[4]; + auto PH = param().pad_h, PW = param().pad_w, + SH = param().stride_h, SW = param().stride_w; + return local::can_backward_data_proxy_convnet(N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + PH, PW, + SH, SW); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local/backward_data.cu b/dnn/src/cuda/local/backward_data.cu new file mode 100644 index 00000000..47ad86c2 --- /dev/null +++ b/dnn/src/cuda/local/backward_data.cu @@ -0,0 +1,94 @@ +/** + * \file dnn/src/cuda/local/backward_data.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/local/local.cuh" + +#include "src/cuda/utils.cuh" +#include "src/cuda/local/cuda-convnet2/nvmatrix.cuh" +#include "src/cuda/local/cuda-convnet2/cudaconv2.cuh" + +namespace megdnn { +namespace cuda { +namespace local { + +bool can_backward_data_proxy_convnet(size_t N, + size_t IC, size_t /* IH */, size_t /* IW */, + size_t /*OC*/, size_t /* OH */, size_t /* OW */, + size_t FH, size_t FW, + size_t /* INs */, size_t /* ONs */, + size_t PH, size_t PW, + size_t SH, size_t SW) +{ + bool flag = true; + // check pad + flag &= (PH == PW); + // check stride + flag &= (SH == SW); + // megdnn_assert(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0))); + flag &= (IC <= 3 || IC % 8 == 0); + // megdnn_assert(numFilters % (16 * numGroups) == 0); + //flag &= (OC % 16 == 0); + // megdnn_assert(filterSize * filterSize == filterPixels); + flag &= (FH == FW); + flag &= (SH <= FH); + flag &= (N % 32 == 0); + return flag; +} + +size_t get_workspace_in_floats_backward_data_proxy_convnet(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t /* FH */, size_t /* FW */, + size_t /* INs */, size_t /* ONs */, + size_t /* PH */, size_t /* PW */, + size_t /* SH */, size_t /* SW */) +{ + return N*IC*IH*IW + N*OC*OH*OW; +} + +void backward_data_proxy_convnet(const float *filter, + const float *diff, + float *grad, + float *workspace, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t /* PW */, + size_t SH, size_t /* SW */, + cublasHandle_t cublas_handle, + cudaStream_t stream, + float *one, float *zero) +{ + MemorySegment mhid_n(const_cast(diff)), + mfilter(const_cast(filter)), + mtarget_n(grad), + mtarget_t(workspace), + mhid_t(workspace+N*IC*IH*IW); + NVMatrix nvhid_n(&mhid_n, N, OC*OH*OW, ONs), + nvfilter(&mfilter, OH*OW*IC*FH*FW, OC), + nvtarget_n(&mtarget_n, N, IC*IH*IW, INs), + nvhid_t(&mhid_t, OC*OH*OW, N), + nvtarget_t(&mtarget_t, IC*IH*IW, N); + nvhid_n.transpose(nvhid_t, cublas_handle, one, zero); + + localImgActs(stream, nvhid_t, nvfilter, nvtarget_t, + IH, IW, OH, -static_cast(PH), SH, IC, 1); + after_kernel_launch(); + + nvtarget_t.transpose(nvtarget_n, cublas_handle, one, zero); +} + +} // namespace local +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local/backward_filter.cpp b/dnn/src/cuda/local/backward_filter.cpp new file mode 100644 index 00000000..03a1d7d8 --- /dev/null +++ b/dnn/src/cuda/local/backward_filter.cpp @@ -0,0 +1,119 @@ +/** + * \file dnn/src/cuda/local/backward_filter.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/local/opr_impl.h" + +#include "src/cuda/local/local.cuh" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { +namespace local { + +void boom_backward_filter() +{ + megdnn_throw("Local bad param: cannot do backward_filter by cuda_convnet"); +} + +} // namespace local +} // namespace cuda +} // namespace megdnn + +namespace megdnn { +namespace cuda { + +void LocalBackwardFilterImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) +{ + check_exec(src.layout, diff.layout, grad.layout, workspace.size); + megdnn_assert(param().mode == Mode::CROSS_CORRELATION); + auto N = src.layout.shape[0], + IC = src.layout.shape[1], + IH = src.layout.shape[2], + IW = src.layout.shape[3]; + auto OC = diff.layout.shape[1], + OH = diff.layout.shape[2], + OW = diff.layout.shape[3]; + auto FH = grad.layout.shape[3], + FW = grad.layout.shape[4]; + auto handle = concrete_handle(this->handle()); + auto stream = cuda_stream(this->handle()); + auto cublas = cublas_handle(this->handle()); + auto one = handle->one_device(); + auto zero = handle->zero_device(); + if (use_cuda_convnet(src.layout, diff.layout, grad.layout)) { + local::backward_filter_proxy_convnet(src.ptr(), + diff.ptr(), + grad.ptr(), + reinterpret_cast(workspace.raw_ptr), + N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + param().pad_h, param().pad_w, + param().stride_h, param().stride_w, + cublas, stream, + one, zero); + } else { + local::boom_backward_filter(); + } +} + +size_t LocalBackwardFilterImpl::get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad) +{ + auto N = src.shape[0], + IC = src.shape[1], IH = src.shape[2], IW = src.shape[3], + OC = diff.shape[1], OH = diff.shape[2], OW = diff.shape[3], + FH = grad.shape[3], FW = grad.shape[4]; + auto SH = param().stride_h, SW = param().stride_w, + PH = param().pad_h, PW = param().pad_w; + size_t res = 0u; + if (use_cuda_convnet(src, diff, grad)) { + res = local::get_workspace_in_floats_backward_filter_proxy_convnet(N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + SH, SW, + PH, PW) * sizeof(dt_float32); + } else { + local::boom_backward_filter(); + } + return res; +} + +bool LocalBackwardFilterImpl::use_cuda_convnet(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad) +{ + auto N = src.shape[0], + IC = src.shape[1], IH = src.shape[2], IW = src.shape[3], + OC = diff.shape[1], OH = diff.shape[2], OW = diff.shape[3], + FH = grad.shape[3], FW = grad.shape[4]; + auto SH = param().stride_h, SW = param().stride_w, + PH = param().pad_h, PW = param().pad_w; + return local::can_backward_filter_proxy_convnet(N, IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + PH, PW, + SH, SW); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local/backward_filter.cu b/dnn/src/cuda/local/backward_filter.cu new file mode 100644 index 00000000..8902b392 --- /dev/null +++ b/dnn/src/cuda/local/backward_filter.cu @@ -0,0 +1,94 @@ +/** + * \file dnn/src/cuda/local/backward_filter.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/local/local.cuh" + +#include "src/cuda/utils.cuh" +#include "src/cuda/local/cuda-convnet2/nvmatrix.cuh" +#include "src/cuda/local/cuda-convnet2/cudaconv2.cuh" + +namespace megdnn { +namespace cuda { +namespace local { + +bool can_backward_filter_proxy_convnet(size_t N, + size_t IC, size_t /* IH */, size_t /* IW */, + size_t /*OC*/, size_t /* OH */, size_t /* OW */, + size_t FH, size_t FW, + size_t /* INs */, size_t /* ONs */, + size_t PH, size_t PW, + size_t SH, size_t SW) +{ + bool flag = true; + // check pad + flag &= (PH == PW); + // check stride + flag &= (SH == SW); + // megdnn_assert(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 16 == 0))); + flag &= (IC <= 3 || IC % 8 == 0); + // megdnn_assert(numFilters % (16 * numGroups) == 0); + //flag &= (OC % 16 == 0); + // megdnn_assert(filterSize * filterSize == filterPixels); + flag &= (FH == FW); + flag &= (SH <= FH); + flag &= (N % 32 == 0); + return flag; +} + +size_t get_workspace_in_floats_backward_filter_proxy_convnet(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t /* FH */, size_t /* FW */, + size_t /* INs */, size_t /* ONs */, + size_t /* PH */, size_t /* PW */, + size_t /* SH */, size_t /* SW */) +{ + return N*IC*IH*IW + N*OC*OH*OW; +} + +void backward_filter_proxy_convnet(const float *src, + const float *diff, + float *grad, + float *workspace, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t /* PW */, + size_t SH, size_t /* SW */, + cublasHandle_t cublas_handle, + cudaStream_t stream, + float *one, float *zero) +{ + MemorySegment mimage_n(const_cast(src)), + mhid_n(const_cast(diff)), + mimage_t(workspace), + mhid_t(workspace+N*IC*IH*IW), + mtarget(grad); + NVMatrix nvimage_n(&mimage_n, N, IC*IH*IW, INs), + nvhid_n(&mhid_n, N, OC*OH*OW, ONs), + nvimage_t(&mimage_t, IC*IH*IW, N), + nvhid_t(&mhid_t, OC*OH*OW, N), + nvtarget(&mtarget, OH*OW*IC*FH*FW, OC); + + nvhid_n.transpose(nvhid_t, cublas_handle, one, zero); + nvimage_n.transpose(nvimage_t, cublas_handle, one, zero); + + localWeightActs(stream, nvimage_t, nvhid_t, nvtarget, + IH, OH, OW, FH, -static_cast(PH), SH, IC, 1); + after_kernel_launch(); +} + +} // namespace local +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local/cuda-convnet2/LICENSE b/dnn/src/cuda/local/cuda-convnet2/LICENSE new file mode 100644 index 00000000..fad671d6 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/LICENSE @@ -0,0 +1,217 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/LICENSE + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +The following are distributed along with the Software under the licenses +indicated below: + +cuda-convnet2 - Apache License, Version 2.0. 
You may obtain a copy of the + license at: http://www.apache.org/licenses/LICENSE-2.0. + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dnn/src/cuda/local/cuda-convnet2/cudaconv2.cuh b/dnn/src/cuda/local/cuda-convnet2/cudaconv2.cuh new file mode 100644 index 00000000..e846cff9 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/cudaconv2.cuh @@ -0,0 +1,93 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/cudaconv2.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ + + +#ifndef COMMON_CUH +#define COMMON_CUH + +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define MAX(x, y) ((x) > (y) ? 
(x) : (y)) +#include "helper_cuda.h" // helper functions CUDA error checking and initialization +#include "nvmatrix.cuh" + +namespace megdnn { +namespace cuda { + +enum FILTER_OUTPUT_ORDER {MODULE_FILTER_IMAGE, FILTER_MODULE_IMAGE}; + +void convFilterActs(cudaStream_t stream, NVMatrix& images, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, + int numImgColors, int numGroups); +void convFilterActs(cudaStream_t stream, NVMatrix& images, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, + int numImgColors, int numGroups, + float scaleTargets, float scaleOutput); + +void localFilterActs(cudaStream_t stream, NVMatrix& images, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, + int numImgColors, int numGroups); +void localFilterActs(cudaStream_t stream, NVMatrix& images, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, + int numImgColors, int numGroups, + float scaleTargets, float scaleOutput); + +void convImgActs(cudaStream_t stream, NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups); +void convImgActs(cudaStream_t stream, NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups, + float scaleTargets, float scaleOutput); + +void localImgActs(cudaStream_t stream, NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups); +void localImgActs(cudaStream_t stream, NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups, + float scaleTargets, float scaleOutput); + +void convWeightActs(cudaStream_t stream, NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, + int moduleStride, int numImgColors, int numGroups, int sumWidth); +void convWeightActs(cudaStream_t stream, NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, + int numImgColors, int numGroups, int sumWidth, + float scaleTargets, float scaleOutput); + +void localWeightActs(cudaStream_t stream, NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, + int moduleStride, int numImgColors, int numGroups); + +void localWeightActs(cudaStream_t stream, NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, + int numImgColors, int numGroups, float scaleTargets, float scaleOutput); +} +} + +#endif /* COMMON_CUH */ + diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts.cu new file mode 100644 index 00000000..ef3d716d --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts.cu @@ -0,0 +1,1572 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts.cu + * 
MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ + +#include "nvmatrix.cuh" +#include "cudaconv2.cuh" +#include "src/cuda/utils.cuh" +#include "filter_acts/filter_act_templates.cuh" + +namespace megdnn { +namespace cuda { + +__device__ __forceinline__ void filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(int fPidx, int imgLoadModPosY, int imgLoadModPosX, + int imgSizeX, int filterSize, int& iPidx) { + int x = imgLoadModPosX + (fPidx) % filterSize; + int y = imgLoadModPosY + (fPidx) / filterSize; + iPidx = y >= 0 && y < imgSizeX && x >= 0 && x < imgSizeX ? y * imgSizeX + x : -1; +} + +#define FA_COLOR3_IMPRELOAD(c,i) imPreload[c][i] = iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) ? 0 : mm[c * imgPixels * imgStride + i * B_X]; +#define FA_COLOR3_IMPRELOAD_TX(c,i) imPreload[c][i] = iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) ? 
0 : tex1Dfetch(images, imagesOffset2 + c * imgPixels * imgStride + i * B_X); + + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + */ +template +//__launch_bounds__(128,3) +__global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex(cudaTextureObject_t images, cudaTextureObject_t filters, float* targets, + const int numImages, const int numFilters, + const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, + const int moduleStride, + const int numModulesY, const int numModulesX, const int imgStride, + const float scaleTargets, const float scaleOutputs, + const bool conv/*, const bool noloads*/) { + __shared__ float shFilters[numColors][pixelCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters + __shared__ float shImages[numColors][pixelCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + __syncthreads(); + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int blocksPerModule = numFilters / (B_Y*filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + + const int numModules = numModulesX * numModulesY; + // Another fun insanity: the % B_X makes things faster, even thought threadIdx.x is + // in the range 0..31. It appears that this allows the compiler to optimize? + const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + + const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + +// images += myImgIdx; +// filters += blockFilterIdx +// + shFilterLoadY * numFilters + shFilterLoadX; +// if (!conv) { // NOTE: UNTESTED! +// filters += moduleIdx * numColors * filterPixels * numFilters; +// } + + const int imagesOffset = myImgIdx; + const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters + shFilterLoadX + + (conv ? 
0 : moduleIdx * numColors * filterPixels * numFilters); + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + + int iPidxNext; + float imPreload[numColors][imgsPerThread]; + float fPreload[numColors][pixelCache*filtersPerThread/B_X]; + + #pragma unroll + for (int c = 0; c < numColors; ++c) { + #pragma unroll + for (int p = 0; p < pixelCache; p += B_X/filtersPerThread) { + if (p + shFilterLoadY < filterPixels) { + fPreload[c][p*filtersPerThread/B_X] = tex1Dfetch(filters, filtersOffset + p * numFilters + c * numFilters * filterPixels); + } else{ + fPreload[c][p*filtersPerThread/B_X] = 0; + } + } + } + + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); + + #pragma unroll + for (int c = 0; c < numColors; ++c) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (iPidxNext >= 0 && (!checkImgBounds || myImgIdx + i * B_X < numImages)) { + imPreload[c][i] = tex1Dfetch(images, imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X); + } else { + imPreload[c][i] = 0; + } + } + } + + for (int p = 0; p < filterPixels; p += pixelCache) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int c = 0; c < numColors; ++c) { + // NOTE: bank conflicts here! + shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i]; + } + } + + const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache; + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(fPidxNext + ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); + +// const float* ff = &filters[numFilters * fPidxNext]; +// const float* mm = &images[imgStride * iPidxNext]; + const int filtersOffset2 = filtersOffset + numFilters * fPidxNext; + const int imagesOffset2 = imagesOffset + imgStride * iPidxNext; + + FA_COLOR3_IMPRELOAD_TX(0,0); + FA_COLOR3_IMPRELOAD_TX(0,1); + FA_COLOR3_IMPRELOAD_TX(0,2); + FA_COLOR3_IMPRELOAD_TX(0,3); + + #pragma unroll + for (int c = 0; c < numColors; ++c) { + #pragma unroll + for (int pp = 0; pp < pixelCache; pp += B_X/filtersPerThread) { + shFilters[c][pp + shFilterLoadY][shFilterLoadX] = fPreload[c][pp*filtersPerThread/B_X]; + } + } + + __syncthreads(); + FA_COLOR3_IMPRELOAD_TX(1,0); + FA_COLOR3_IMPRELOAD_TX(1,1); + FA_COLOR3_IMPRELOAD_TX(1,2); + FA_COLOR3_IMPRELOAD_TX(1,3); + FA_COLOR3_IMPRELOAD_TX(2,0); + FA_COLOR3_IMPRELOAD_TX(2,1); + FA_COLOR3_IMPRELOAD_TX(2,2); + FA_COLOR3_IMPRELOAD_TX(2,3); + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int pp = 0; pp < pixelCache*filtersPerThread/B_X; pp++) { + fPreload[c][pp] = fPidxNext + pp*(B_X/filtersPerThread) + shFilterLoadY >= filterPixels ? 
0 : tex1Dfetch(filters, filtersOffset2 + c * numFilters* filterPixels + pp*(B_X/filtersPerThread) * numFilters); + } + } + #pragma unroll + for (int pp = 0; pp < pixelCache; pp++) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] * shFilters[c][pp][ty * filtersPerThread + f]; + } + } + } + } + + __syncthreads(); + } + + if (scale) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = scaleTargets * targets[i * B_X + f * numImages * numModules] + scaleOutputs * prod[i][f]; + } + } + } + } else { + // Note: reversing order of these loops saves 2 registers, but costs time + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = scaleOutputs * prod[i][f]; + } + } + } + } +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * This won't be pretty. + */ +template +__global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex(cudaTextureObject_t images, cudaTextureObject_t filters, float* targets, + const int numImages, const int numFilters, + const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, + const int moduleStride, + const int numModulesY, const int numModulesX, const int imgStride, + const float scaleTargets, const float scaleOutputs, + const bool conv/*, const bool noloads*/) { + __shared__ float shFilters[numColors][pixelCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters + __shared__ float shImages[numColors][pixelCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + __syncthreads(); + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int blocksPerModule = numFilters / (B_Y*filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + + const int numModules = numModulesX * numModulesY; + // Another fun insanity: the % B_X makes things faster, even though threadIdx.x is + // in the range 0..31. It appears that this allows the compiler to optimize? 
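+    // Illustrative decomposition of the y-block index (hypothetical sizes, not
+    // taken from any caller): with numFilters = 48, B_Y = 4, filtersPerThread = 12,
+    //   blocksPerModule = 48 / (4 * 12) = 1
+    //   moduleIdx       = blockIdx.y / blocksPerModule
+    //   blockFilterIdx  = 12 * 4 * (blockIdx.y % blocksPerModule) = 0
+    // i.e. each y-block is pinned to one output module and one slab of
+    // B_Y * filtersPerThread consecutive filters, while the x-block selects
+    // which slab of imgsPerThread * B_X images it covers.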
+ const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + const int warp = tidx / 32; + + const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + +// images += myImgIdx; +// filters += blockFilterIdx +// + shFilterLoadY * numFilters + shFilterLoadX; +// if (!conv) { // NOTE: UNTESTED! +// filters += moduleIdx * numColors * filterPixels * numFilters; +// } + + const int imagesOffset = myImgIdx; + const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters + shFilterLoadX + + (conv ? 0 : moduleIdx * numColors * filterPixels * numFilters); + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + + int iPidxNext; + float imPreload[numColors][imgsPerThread]; + float fPreload[numColors][DIVUP(pixelCache*filtersPerThread,B_X)]; + + if (warp < 3) { + #pragma unroll + for (int c = 0; c < numColors; ++c) { + #pragma unroll + for (int p = 0; p < pixelCache; p += 2) { + if (p + shFilterLoadY < filterPixels) { + fPreload[c][p/2] = tex1Dfetch(filters, filtersOffset + p * numFilters + c * numFilters * filterPixels); + } else { + fPreload[c][p/2] = 0; + } + } + } + } + + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); + + #pragma unroll + for (int c = 0; c < numColors; ++c) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (iPidxNext >= 0 && (!checkImgBounds || myImgIdx + i * B_X < numImages)) { + imPreload[c][i] = tex1Dfetch(images, imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X); + } else { + imPreload[c][i] = 0; + } + } + } + + for (int p = 0; p < filterPixels; p += pixelCache) { + const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache; + filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(fPidxNext + ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext); + + #pragma unroll + for (int c = 0; c < numColors; ++c) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! + shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i]; + } + } + + if (warp < 3) { + #pragma unroll + for (int c = 0; c < numColors; ++c) { + #pragma unroll + for (int pp = 0; pp < pixelCache; pp += 2) { + shFilters[c][pp + shFilterLoadY][shFilterLoadX] = fPreload[c][pp/2]; + } + } + } + + __syncthreads(); +// const float* ff = &filters[numFilters * fPidxNext]; +// const float* mm = &images[imgStride * iPidxNext]; + const int filtersOffset2 = filtersOffset + numFilters * fPidxNext; + const int imagesOffset2 = imagesOffset + imgStride * iPidxNext; + + #pragma unroll + for (int i = 0; i < imgsPerThread; ++i) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + FA_COLOR3_IMPRELOAD_TX(c,i); + } + } + + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int pp = 0; pp < 2; pp++) { + fPreload[c][pp] = warp >= 3 || fPidxNext + pp*2 + shFilterLoadY >= filterPixels ? 
0 : tex1Dfetch(filters, filtersOffset2 + c * numFilters * filterPixels + pp*2 * numFilters); + } + #pragma unroll + for (int pp = 0; pp < pixelCache; pp++) { + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] * shFilters[c][pp][ty * filtersPerThread + f]; + } + } + } + + } + __syncthreads(); + } + + if (scale) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = scaleTargets * targets[i * B_X + f * numImages * numModules] + scaleOutputs * prod[i][f]; + } + } + } + } else { + // Note: reversing order of these loops costs 2 registers, but saves time + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = scaleOutputs * prod[i][f]; + } + } + } + } +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * Note: in git there's a 1.5% faster version of this which uses 167 registers instead of 154... + * it's basically the same thing, but it doesn't do the next-pixel computation. It just avoids + * pre-loading when it rolls over to the next pixel. + */ +template <int B_Y, int B_X, int imgsPerThread, int filtersPerThread, int colorCache, bool scale, bool checkImgBounds> +__global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4(float* images, float* filters, float* targets, + const int numImages, const int numFilters, + const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, + const int moduleStride, + const int numModulesY, const int numModulesX, const int imgStride, const int numImgColors, + const int numGroups, + const float scaleTargets, const float scaleOutputs, + const bool conv/*, const bool noloads*/) { + __shared__ float shFilters[colorCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters + __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + __syncthreads(); + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y*filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + // Another fun insanity: the % B_X makes things faster, even though threadIdx.x is + // in the range 0..31. It appears that this allows the compiler to optimize?
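+ // Work decomposition, summarizing the code below: each block runs B_X x B_Y threads, and each thread accumulates imgsPerThread images x filtersPerThread filters in prod[][]. + // blockIdx.x tiles the minibatch dimension; blockIdx.y encodes the (module, filter-block) pair. + // imPreload/fPreload double-buffer the next pixel's image and filter values in registers while the current shared-memory tile (shImages/shFilters) is consumed.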
+ const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + + const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + filters +=blockFilterIdx + + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; +// float fCache[filtersPerThread]; + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + // NOTE: these max/min functions increase register usage as compared to my macros + const int imgStartX = max(0, imgLoadModPosX); + const int imgStartY = max(0, imgLoadModPosY); + const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY); +// __shared__ int imgPos[] + + int fPidx, iPidx; + float imPreload[imgsPerThread]; + float fPreload[colorCache*filtersPerThread/B_X]; +// float fCache[filtersPerThread]; + + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgStartY, imgStartX, fPidx, iPidx); + + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + imPreload[i] = images[imgStride * iPidx + i * B_X]; + } else { + imPreload[i] = 0; + } + } + if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < B_X/filtersPerThread) { // This if statement reduces reg usage.. + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c*filtersPerThread/B_X] = filters[(c * filterPixels + fPidx) * numFilters]; + } + } + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { +// const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { +// const int filterPxX = imgX - imgLoadModPosX; +// const int p = filterPxY * filterSize + filterPxX; +// const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in img +// setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgY, imgX, &p, &pixIdx); +// float* m = &images[imgStride * pixIdx]; + const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1; + int imgYNext = imgY; + int imgXNext = imgX; + int fPidxNext, iPidxNext; + if (!lastPixel) { + imgYNext = imgY + (imgX + 1 == imgEndX); + imgXNext = imgX + 1 == imgEndX ? 
imgStartX : imgX + 1; + } + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgYNext, imgXNext, fPidxNext, iPidxNext); + for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) + const float* ff = &filters[numFilters * ((oc + colorCache) * filterPixels + fPidx)]; + const float* mm = &images[imgStride * ((oc + colorCache) * imgPixels + iPidx)]; + if (oc == numFilterColors - colorCache) { + ff = &filters[fPidxNext * numFilters]; + mm = &images[iPidxNext * imgStride]; + fPidx = fPidxNext; + iPidx = iPidxNext; + } + + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + shFilters[c + shFilterLoadY][shFilterLoadX] = fPreload[c*filtersPerThread/B_X]; + } + + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! + shImages[ty][tx * imgsPerThread + i] = imPreload[i]; + } + imPreload[0] = (checkImgBounds && myImgIdx + 0 * B_X >= numImages) ? 0 : mm[0 * B_X]; + imPreload[1] = (checkImgBounds && myImgIdx + 1 * B_X >= numImages) ? 0 : mm[1 * B_X]; + imPreload[2] = (checkImgBounds && myImgIdx + 2 * B_X >= numImages) ? 0 : mm[2 * B_X]; + + __syncthreads(); + + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[0][threadIdx.x * imgsPerThread + i] * shFilters[0][threadIdx.y * filtersPerThread + f]; + } + } + + fPreload[0] = ff[0]; + + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[1][threadIdx.x * imgsPerThread + i] * shFilters[1][threadIdx.y * filtersPerThread + f]; + } + } + + fPreload[1] = ff[(B_X/filtersPerThread * filterPixels) * numFilters]; + + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[2][threadIdx.x * imgsPerThread + i] * shFilters[2][threadIdx.y * filtersPerThread + f]; + } + } + + imPreload[3] = (checkImgBounds && myImgIdx + 3 * B_X >= numImages) ? 0 : mm[3 * B_X]; + + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[3][threadIdx.x * imgsPerThread + i] * shFilters[3][threadIdx.y * filtersPerThread + f]; + } + } + __syncthreads(); + } + } + } + + if (scale) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = scaleTargets * targets[i * B_X + f * numImages * numModules] + scaleOutputs * prod[i][f]; + } + } + } + } else { + // Note: reversing order of these loops saves 2 registers, but costs time + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = scaleOutputs * prod[i][f]; + } + } + } + } +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModules, numImages) + * + * Note: all of these convolution routines are optimized for the case when + * the number of images (i.e. 
the minibatch size) is a multiple of 128. + * Other batch sizes will work, but I made no attempt whatsoever + * to make them work fast. + */ + void _filterActs(cudaStream_t stream, NVMatrix& images, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, + int numImgColors, int numGroups, + float scaleTargets, float scaleOutput, bool conv) { + int numFilterColors = numImgColors / numGroups; + int numFilters = filters.getNumCols(); + int numModules = numModulesY * numModulesX; + int numImages = images.getNumCols(); + int imgPixels = images.getNumRows()/numImgColors; + int imgSizeX = imgPixels / imgSizeY; + int filterModuleMult = conv ? 1 : numModules; + + megdnn_assert_internal(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 4 == 0))); + megdnn_assert_internal(numGroups == 1 || numFilterColors % 4 == 0); + //megdnn_assert_internal(numFilters % (16 * numGroups) == 0); + megdnn_assert_internal(numImgColors % numGroups == 0); + bool previous_limit = (numFilters % (16 * numGroups)) == 0; + + //images.printShape("images"); + //printf("rows: %d, pixels: %d, colors: %d\n", images.getNumRows(), imgPixels, numImgColors); + //images.printShape("images"); + megdnn_assert_internal(images.getNumRows() == imgPixels * numImgColors); + megdnn_assert_internal(imgSizeY * imgSizeX == imgPixels); + int numFiltersPerGroup = numFilters / numGroups; + + int imgStride = images.getStride(); // images does not need to be a contiguous matrix + + int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors); + int filterSize = int(sqrt(filterPixels)); + megdnn_assert_internal(filterSize * filterSize == filterPixels); + megdnn_assert_internal(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels); + + // These routines don't handle the case when only part of the image is visited in the convolution + megdnn_assert_internal(paddingStart <= 0); + megdnn_assert_internal(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX); + megdnn_assert_internal(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY); + megdnn_assert_internal(moduleStride <= filterSize); + + megdnn_assert_internal(!images.isTrans()); + megdnn_assert_internal(!filters.isTrans()); + megdnn_assert_internal(!targets.isTrans()); + + megdnn_assert_internal(filters.isContiguous()); + megdnn_assert_internal(targets.isContiguous()); + int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + int filtersPerThread, threadsY = 4; + if (numImgColors <= 3) { + // Special kernels written for colors = 3, filters = 64 and colors = 3, filters = 48 cases. + // The remaining cases use the old routines. + // TODO: Modernize the remaining cases if you care about them. + filtersPerThread = numFiltersPerGroup % 64 == 0 ? 16 : numFiltersPerGroup % 48 == 0 ? 12 : numFiltersPerGroup % 32 == 0 ? 8 : 4; + } else { + filtersPerThread = numFiltersPerGroup % 64 == 0 ? 16 : numFiltersPerGroup % 32 == 0 ? 8 : 4; + threadsY = numFiltersPerGroup % 128 == 0 && numFilterColors % 8 == 0 && imgsPerThread != 4 ?
8 : 4; + } + int threadsX = 32; + dim3 threads(threadsX, threadsY); + dim3 blocks = dim3(DIVUP(numImages, threads.x * imgsPerThread), numModules * DIVUP(numFilters, (threads.y * filtersPerThread))); + + bool checkImgBounds = numImages % (threads.x*imgsPerThread) != 0; + bool scale = scaleTargets != 0; + if (scaleTargets == 0) { + targets.resize(numFilters * numModules, numImages); + } else { + megdnn_assert_internal(targets.getNumRows() == numFilters * numModules); + megdnn_assert_internal(targets.getNumCols() == numImages); + } + + // Auto-generated calling code... + // NOTE: The calling code is set up such that if checkImgBounds is true, then imgsPerThread = 1. + // In principle it doesn't have to be this way, and you may want to optimize for that case. + + if (scale == false) { + if (checkImgBounds == false) { + if (numFilterColors % 8 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + if (previous_limit) { + if (images.getNumDataBytes() < TEXTURE_SIZE_MAX) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, false, false > <<>>(images.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } else { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } else { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numFiltersPerGroup % 64 == 0) { + if (previous_limit) { + if (images.getNumDataBytes() < TEXTURE_SIZE_MAX) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, false, false > <<>>(images.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } else { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } 
else { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 4, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 4, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 4, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 4, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), 
targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors % 4 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 8, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 8, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 4, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 4, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numImages 
% 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 8, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 8, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 4, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 4, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 8, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 4, 4, false, false > <<>>(images.getDevData(), 
filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 3) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex < 4, 32, 4, 16, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex < 4, 32, 4, 16, 3, 4, false, false > <<>>(images.getTextureObject(), filters.getTextureObject(), targets.getDevData(),numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } else { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 16, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 16, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numFiltersPerGroup % 48 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex < 4, 32, 4, 12, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex < 4, 32, 4, 12, 3, 4, false, false > <<>>(images.getTextureObject(), filters.getTextureObject(), targets.getDevData(),numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } else { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 12, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 12, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 8, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 8, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 4, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 4, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 16, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 16, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, 
imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 12, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 12, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 8, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 8, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 4, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 4, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 3, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 3, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 2) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 16, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 16, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, 
imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 12, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 12, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 8, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 8, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 4, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 4, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 16, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 16, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 12, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 12, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 8, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 8, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 4, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 4, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), 
targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 2, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 2, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 1) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 16, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 16, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 12, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 12, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 8, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 8, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 4, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 4, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 16, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 
32, 2, 16, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 12, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 12, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 8, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 8, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 4, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 4, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 1, 4, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 1, 4, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + } + else if (checkImgBounds == true) { + if (numFilterColors % 8 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + 
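+ // Worked example: with scaleTargets == 0, numImages == 100, numFilterColors == 8 and numFiltersPerGroup == 128, the dispatch above picks imgsPerThread = 1 (100 is not a multiple of 64 or 128) and checkImgBounds = true (100 % 32 != 0), + // so this branch launches the < 8, 32, 1, 16, 8, false, true > instantiation and every image access is bounds-checked against numImages.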
cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors % 4 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 8, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 4, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), 
numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 3) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 3, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 3, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 3, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 3, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 3, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 3, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 3, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 3, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 2) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 2, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 2, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 2, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 2, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 2, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 2, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 2, 4, false, true >, cudaFuncCachePreferShared); + 
filterActs_YxX_color < 4, 32, 1, 4, 2, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 1) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 1, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 1, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 1, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 1, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 1, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 1, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 1, 4, false, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 1, 4, false, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + } + } + else if (scale == true) { + if (checkImgBounds == false) { + if (numFilterColors % 8 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + if (previous_limit) { + if (images.getNumDataBytes() < TEXTURE_SIZE_MAX) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, true, false >, cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, true, false > <<>>(images.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } else { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, true, false >, cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } else { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, true, false > 
<<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numFiltersPerGroup % 64 == 0) { + if (previous_limit) { + if (images.getNumDataBytes() < TEXTURE_SIZE_MAX) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, true, false >, cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex < 4, 32, 4, 16, 4, true, false > <<>>(images.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } else { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, true, false >, cudaFuncCachePreferL1); + filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4 < 4, 32, 4, 16, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } else { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 4, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 4, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, 
filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 4, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 4, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors % 4 == 0) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + 
cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 16, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 8, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 8, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 4, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 4, 4, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 16, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 16, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 8, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 8, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 4, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 2, 4, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, 
numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 8, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 4, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 3) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex < 4, 32, 4, 16, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex < 4, 32, 4, 16, 3, 4, true, false > <<>>(images.getTextureObject(), filters.getTextureObject(), targets.getDevData(),numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } else { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 16, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 16, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numFiltersPerGroup % 48 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex < 4, 32, 4, 12, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex < 4, 32, 4, 12, 3, 4, true, false > <<>>(images.getTextureObject(), filters.getTextureObject(), targets.getDevData(),numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } else { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 12, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 12, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, 
conv); + } + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 8, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 8, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 4, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 4, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 16, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 16, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 12, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 12, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 8, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 8, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 4, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 4, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, 
imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 3, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 3, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 2) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 16, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 16, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 12, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 12, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 8, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 8, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 4, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 4, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 16, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 16, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 12, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 12, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, 
filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 8, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 8, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 4, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 4, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 2, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 2, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 1) { + if (numImages % 128 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 16, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 16, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 12, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 12, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), 
targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 8, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 8, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 4, 4, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 4, 4, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 64 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 16, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 16, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 12, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 12, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 8, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 8, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 2, 4, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 2, 4, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + else if (numImages % 32 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 1, 4, true, false > 
<<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 1, 4, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 1, 4, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + } + else if (checkImgBounds == true) { + if (numFilterColors % 8 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors % 4 == 0) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, 
scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 16, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 8, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2 < 4, 32, 1, 4, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 3) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 3, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 3, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 3, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 3, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 3, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 3, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 3, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 3, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 2) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 2, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 2, 4, true, true > <<>>(images.getDevData(), 
filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 2, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 2, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 2, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 2, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 2, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 2, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + else if (numFilterColors == 1) { + if (numImages % 1 == 0) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 16, 1, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 16, 1, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 12, 1, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 12, 1, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 8, 1, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 8, 1, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(filterActs_YxX_color < 4, 32, 1, 4, 1, 4, true, true >, cudaFuncCachePreferShared); + filterActs_YxX_color < 4, 32, 1, 4, 1, 4, true, true > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv); + } + } + } + } + } + + getLastCudaError("filterActs: kernel execution failed"); +} + +void convFilterActs(cudaStream_t stream, NVMatrix& images, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int numModulesY, int 
numModulesX, int paddingStart, int moduleStride, + int numImgColors, int numGroups) { + convFilterActs(stream, images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, 0, 1); +} + +void convFilterActs(cudaStream_t stream, NVMatrix& images, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, + int numImgColors, int numGroups, + float scaleTargets, float scaleOutput) { + _filterActs(stream, images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, true); +} + +void localFilterActs(cudaStream_t stream, NVMatrix& images, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, + int numImgColors, int numGroups) { + localFilterActs(stream, images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, 0, 1); +} + +void localFilterActs(cudaStream_t stream, NVMatrix& images, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride, + int numImgColors, int numGroups, + float scaleTargets, float scaleOutput) { + _filterActs(stream, images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, false); +} + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color.cuh b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color.cuh new file mode 100644 index 00000000..d15040bc --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color.cuh @@ -0,0 +1,270 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "filter_act_templates.cuh" + +namespace megdnn { +namespace cuda { + +/* + * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images. 
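+ *
+ * The grid and block dimensions are chosen by the _filterActs dispatcher above; judging
+ * from the indexing below they should be equivalent to the following sketch (illustrative
+ * only, not copied from the dispatcher):
+ *
+ *     dim3 threads(B_X, B_Y);
+ *     dim3 blocks(DIVUP(numImages, B_X * imgsPerThread),
+ *                 numModulesY * numModulesX * DIVUP(numFilters, B_Y * filtersPerThread));
+ *
+ * so blockIdx.y jointly encodes the module index and the filter batch (see moduleIdx and
+ * blockFilterIdx below).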
+ * threadIdx.x determines image
+ * threadIdx.y determines filter
+ *
+ * blockIdx.x determines image batch of B_X * imgsPerThread
+ * blockIdx.y determines the module and the filter batch of B_Y * filtersPerThread
+ *
+ * images:  (numColors, imgSizeY, imgSizeX, numImages) with stride given
+ * filters: (numColors, filterPixels, numFilters) if conv
+ *          (numModules, numColors, filterPixels, numFilters) otherwise
+ *
+ * targets: (numFilters, numModulesY, numModulesX, numImages)
+ *
+ *
+ * Number of filters per module should be divisible by B_Y * filtersPerThread
+ * checkImgBounds indicates whether number of images is divisible by B_X * imgsPerThread
+ *
+ * The imgSize here is the size of the actual image without the padding.
+ *
+ */
+ template <int B_Y, int B_X, int imgsPerThread, int filtersPerThread, int numColors, int pixelCache,
+           bool scale, bool checkImgBounds>
+__global__ void filterActs_YxX_color(FILTER_COLOR_PARAMS) {
+    __shared__ float shFilters[pixelCache*numColors][B_Y * filtersPerThread]; // pre-load pixelCache pixels from B_Y*filtersPerThread filters
+    __shared__ float shImages[pixelCache*numColors][B_X * imgsPerThread]; // pre-load pixelCache pixels from B_X*imgsPerThread images
+    fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0);
+    fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0);
+    __syncthreads();
+    const int imgPixels = imgSizeY * imgSizeX;
+    const int filterPixels = filterSize * filterSize;
+
+    const int blocksPerModule = DIVUP(numFilters, (B_Y*filtersPerThread));
+    const int moduleIdx = blockIdx.y / blocksPerModule;
+    const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
+
+    const int tidx = threadIdx.y * B_X + threadIdx.x;
+
+    const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride;
+    const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride;
+    const int numModules = numModulesY * numModulesX;
+    const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
+    const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
+    const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
+    images += myImgIdx;
+    filters += blockFilterIdx
+            + shFilterLoadY * numFilters + shFilterLoadX;
+    if (!conv) {
+        filters += moduleIdx * numColors * filterPixels * numFilters;
+    }
+    bool active_thread_y = (blockFilterIdx + shFilterLoadX) < numFilters;
+
+    targets += moduleIdx * numImages
+            + myImgIdx
+            + (blockFilterIdx + threadIdx.y*filtersPerThread) * numImages * numModulesY * numModulesX;
+
+
+    float prod[filtersPerThread][imgsPerThread];
+    #pragma unroll
+    for(int f = 0; f < filtersPerThread; f++) {
+        #pragma unroll
+        for(int g = 0; g < imgsPerThread; g++) {
+            prod[f][g] = 0;
+        }
+    }
+    //float* shImgLoad = &shImages[0][threadIdx.x];
+    for (int p = 0; p < filterPixels; p += pixelCache) {
+        /*
+         * Load pixelCache pixels from B_Y*filtersPerThread filters
+         * This condition covers the case when B_X is not divisible by filtersPerThread.
+         * In this case, not all of the threads will participate in the loading operation.
+         * This ensures that in each loop iteration, an integer number of rows of shFilters
+         * are filled, which makes indexing simple.
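+         * For example, with the < 4, 32, 4, 16, 3, 4 > instantiation selected by the
+         * dispatcher above (B_Y = 4, B_X = 32, filtersPerThread = 16, pixelCache = 4,
+         * numColors = 3): tidx runs over 128 threads, shFilterLoadY = tidx / 64 is 0 or 1,
+         * and the p2 loop steps by B_X / filtersPerThread = 2, so the two thread rows fill
+         * shFilters rows {0,1} and {2,3} in two iterations, once per color plane: 2
+         * iterations x 2 rows x 3 colors x 64 columns covers all 12 x 64 entries.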
+ */ + if (B_X % filtersPerThread == 0 || shFilterLoadY < B_X/filtersPerThread) { + #pragma unroll + for (int p2 = 0; p2 < pixelCache; p2 += B_X/filtersPerThread) { + const bool omit = pixelCache % (B_X / filtersPerThread) == 0; + const int preloadPx = shFilterLoadY + p2; + if (omit || preloadPx < pixelCache) { + if (p + preloadPx < filterPixels && active_thread_y) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] = filters[(c * filterPixels + p + p2) * numFilters]; + } + } else { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] = 0; + } + } + } + } + } + + /* + * Load pixelCache pixels from B_X*imgsPerThread images. + */ + #pragma unroll + for (int ly = 0; ly < pixelCache; ly += B_Y) { + const int preloadPx = ly + threadIdx.y; + const int pixIdx = p + preloadPx; + const bool omit = pixelCache % B_Y == 0; // Compile-time condition + /* + * Don't load any image pixels corresponding to filter pixels that don't exist. + */ + if (pixIdx < filterPixels && (omit || preloadPx < pixelCache)) { + const int x = imgLoadModPosX + pixIdx % filterSize; + const int y = imgLoadModPosY + pixIdx / filterSize; + + if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) { + float* m = &images[imgStride * (y * imgSizeX + x)]; + + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + shImages[preloadPx + c * pixelCache][threadIdx.x * imgsPerThread + i] = m[c * imgStride * imgPixels + i * B_X]; + } else { + shImages[preloadPx + c * pixelCache][threadIdx.x * imgsPerThread + i] = 0; + } + } + } + } else { // Padding + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[preloadPx + c * pixelCache][threadIdx.x * imgsPerThread + i] = 0; + } + } + } + } + } + + __syncthreads(); + + #pragma unroll + for (int i = 0; i < pixelCache*numColors; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for(int g = 0; g < imgsPerThread; g++) { + prod[f][g] += shImages[i][g + threadIdx.x * imgsPerThread] + * shFilters[i][threadIdx.y * filtersPerThread + f]; + } + } + } + __syncthreads(); + } + + int filtersThisThread = numFilters - blockFilterIdx - threadIdx.y * filtersPerThread; + if (filtersThisThread > filtersPerThread) { + filtersThisThread = filtersPerThread; + } + + //active_thread_y = (blockFilterIdx + threadIdx.y * filtersPerThread) < numFilters; + if (scale) { + #pragma unroll + for (int f = 0; f < filtersThisThread; f++) { + #pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * numImages * numModules] = + scaleTargets * targets[g * B_X + f * numImages * numModules] + scaleOutputs * prod[f][g]; + } + } + } + } else { + #pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + #pragma unroll + for (int f = 0; f < filtersThisThread; f++) { + //if (active_thread_y) { + targets[g * B_X + f * numImages * numModules] = scaleOutputs * prod[f][g]; + //} + } + } + } + } +} + + +#define FILTER_COLOR_HEAD template __global__ void filterActs_YxX_color + +#define FILTER_COLOR(scale, ckImg) \ +FILTER_COLOR_HEAD < 4, 32, 4, 8, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 4, 4, 3, 4, scale, 
ckImg > (FILTER_COLOR_PARAMS); \ + \ +FILTER_COLOR_HEAD < 4, 32, 2, 16, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 2, 12, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 2, 8, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 2, 4, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ + \ +FILTER_COLOR_HEAD < 4, 32, 1, 16, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 1, 12, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 1, 8, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 1, 4, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ + \ +FILTER_COLOR_HEAD < 4, 32, 4, 16, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 4, 12, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 4, 8, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 4, 4, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ + \ +FILTER_COLOR_HEAD < 4, 32, 2, 16, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 2, 12, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 2, 8, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 2, 4, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ + \ +FILTER_COLOR_HEAD < 4, 32, 1, 16, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 1, 12, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 1, 8, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 1, 4, 2, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ + \ +FILTER_COLOR_HEAD < 4, 32, 4, 16, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 4, 12, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 4, 8, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 4, 4, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ + \ +FILTER_COLOR_HEAD < 4, 32, 2, 16, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 2, 12, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 2, 8, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 2, 4, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ + \ +FILTER_COLOR_HEAD < 4, 32, 1, 16, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 1, 12, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 1, 8, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 1, 4, 1, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +\ +FILTER_COLOR_HEAD < 4, 32, 4, 16, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ +FILTER_COLOR_HEAD < 4, 32, 4, 12, 3, 4, scale, ckImg > (FILTER_COLOR_PARAMS); \ + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale0_ckimg0.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale0_ckimg0.cu new file mode 100644 index 00000000..a8d65af6 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale0_ckimg0.cu @@ -0,0 +1,37 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale0_ckimg0.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "filter_act_color.cuh" +namespace megdnn { +namespace cuda { + +FILTER_COLOR(false, false) +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale0_ckimg1.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale0_ckimg1.cu new file mode 100644 index 00000000..1c0a782a --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale0_ckimg1.cu @@ -0,0 +1,37 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale0_ckimg1.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ +#include "filter_act_color.cuh" +namespace megdnn { +namespace cuda { + +FILTER_COLOR(false, true) +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale1_ckimg0.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale1_ckimg0.cu new file mode 100644 index 00000000..6f96b48d --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale1_ckimg0.cu @@ -0,0 +1,37 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale1_ckimg0.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "filter_act_color.cuh" +namespace megdnn { +namespace cuda { + +FILTER_COLOR(true, false) +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale1_ckimg1.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale1_ckimg1.cu new file mode 100644 index 00000000..969ce077 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale1_ckimg1.cu @@ -0,0 +1,37 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_color_scale1_ckimg1.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "filter_act_color.cuh" +namespace megdnn { +namespace cuda { + +FILTER_COLOR(true, true) +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2.cuh b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2.cuh new file mode 100644 index 00000000..6bdb9fdc --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2.cuh @@ -0,0 +1,261 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "filter_act_templates.cuh" + +namespace megdnn { +namespace cuda { + +/* + * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images. + * threadIdx.x determines image + * threadIdx.y determines filter + * + * blockIdx.x determines image batch of B_X * imgsPerThread + * blockIdx.y determines filter batch of B_Y * filtersPerThread + * + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * B_Y one of 4, 8, 16 + * B_X one of 16, 32 + * imgsPerThread one of 1, 2, 4 + * filtersPerThread one of 1, 2, 4, 8 + * colorCache: how many colors to put into shmem + * + * numFilters should be divisible by B_Y * filtersPerThread + * numImages be divisible by B_X * imgsPerThread + * numFilterColors should be divisible by colorCache. + * numImgColors must be even. + * numFilters must be divisible by numGroups. + * no restrictions on pixelCache + * The imgSize here is the size of the actual image without the padding. + * As always, try to make B_X * imgsPerThread == B_Y * filtersPerThread for maximum efficiency. 
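+ *
+ * As a concrete illustration (the numbers are chosen here for exposition and are not taken
+ * from the dispatcher): with numImgColors = 64 and numGroups = 1, numFilterColors is 64, and
+ * an instantiation with colorCache = 8 walks the channel dimension in 64 / 8 = 8 passes of
+ * the outer color loop below, staging an 8-channel slice of one input pixel in shImages on
+ * each pass.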
+ * + */ +template +__global__ void filterActs_YxX_sparse2(float* images, float* filters, float* targets, + const int numImages, const int numFilters, + const int imgSizeY, const int imgSizeX, + const int filterSize, const int paddingStart, + const int moduleStride, + const int numModulesY, const int numModulesX, + const int imgStride, const int numImgColors, + const int numGroups, + const float scaleTargets, const float scaleOutputs, + const bool conv) { + __shared__ float shFilters[colorCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters + __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + __syncthreads(); + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = DIVUP(numFilters, (B_Y*filtersPerThread)); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + + const int tidx = threadIdx.y * B_X + threadIdx.x; + + const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + + images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + filters +=blockFilterIdx + shFilterLoadX + + shFilterLoadY * numFilters * filterPixels; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + bool active_thread_y = (blockFilterIdx + shFilterLoadX) < numFilters; + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y) * numImages * numModules + + myImgIdx; + + float prod[filtersPerThread][imgsPerThread]; + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for(int g = 0; g < imgsPerThread; g++) { + prod[f][g] = 0; + } + } + const int imgStartX = MAX(0, imgLoadModPosX); + const int imgStartY = MAX(0, imgLoadModPosY); + const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY); +// __shared__ int imgPos[] + + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + const int filterPxX = imgX - imgLoadModPosX; + const int p = filterPxY * filterSize + filterPxX; + for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) + + /* + * Load a pixel from B_Y*filtersPerThread filters + * This condition covers the case when B_X is not divisible by filtersPerThread. + * In this case, not all of the threads will participate in the loading operation. + * This ensures that in each loop iteration, an integer number of rows of shFilters + * are filled, which makes indexing simple. 
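+                 * (Illustrative numbers, assumed for this note: with B_Y = 4, B_X = 32 and
+                 * filtersPerThread = 8, shFilterLoadY = tidx / 32 ranges over 0..3 while
+                 * B_X / filtersPerThread == 4, so every thread participates and each step of
+                 * the c-loop below fills exactly B_X / filtersPerThread rows of shFilters.)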
+ + * nvcc is behaving in a completely insane way: removing this condition under + * template parameters that guarantee it to be true actually slows down + * the computation. + * + */ + if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < B_X/filtersPerThread) { + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + if (colorCache % (B_X/filtersPerThread) == 0 || c + shFilterLoadY < colorCache) { + if (active_thread_y) { + shFilters[c + shFilterLoadY][shFilterLoadX] = filters[((oc+c) * filterPixels + p) * numFilters]; + } else { + shFilters[c + shFilterLoadY][shFilterLoadX] = 0; + } + } + } + } + + /* + * Load a pixel from B_X*imgsPerThread images. + */ + const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in img + + float* m = &images[imgStride * (oc * imgPixels + pixIdx)]; + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y) { + if (colorCache % B_Y == 0 || threadIdx.y + c < colorCache) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + shImages[c + threadIdx.y][threadIdx.x + i * B_X] = m[c * imgStride * imgPixels + i * B_X]; + } else { + shImages[c + threadIdx.y][threadIdx.x + i * B_X] = 0; + } + } + } + } + + __syncthreads(); + + for (int c = 0; c < colorCache; c++) { + #pragma unroll + for(int g = 0; g < imgsPerThread; g++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[f][g] += shImages[c][g * B_X + threadIdx.x] * shFilters[c][threadIdx.y + f * B_Y]; + } + } + } + __syncthreads(); + } + } + } + + int filtersThisThread = filtersPerThread; + //if(checkFilterBounds) { + int filtersThisBlock = numFilters - (blockIdx.y % blocksPerModule) + * (B_Y*filtersPerThread); + if (filtersThisBlock < (B_Y * filtersPerThread)) { + filtersThisThread = (filtersThisBlock - threadIdx.y + filtersPerThread - 1) / filtersPerThread; + } + //} + + if (scale) { + #pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + #pragma unroll + for (int f = 0; f < filtersThisThread; f++) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModules] + scaleOutputs * prod[f][g]; + } + } + } + } else { + // Note: reversing order of these loops saves 2 registers, but costs time + #pragma unroll + for (int f = 0; f < filtersThisThread; f++) { + #pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleOutputs * prod[f][g]; + } + } + } + } +} + +#define FILTER_SPARSE2_HEAD template __global__ void filterActs_YxX_sparse2 + +// +#define FILTER_SPARSE2(scale, ckImg) \ +FILTER_SPARSE2_HEAD < 4, 32, 4, 8, 8, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 4, 4, 8, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +\ +FILTER_SPARSE2_HEAD < 8, 32, 2, 16, 8, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 2, 16, 8, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 2, 8, 8, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 2, 4, 8, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +\ +FILTER_SPARSE2_HEAD < 8, 32, 1, 16, 8, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 1, 16, 8, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 1, 8, 8, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 1, 4, 8, scale, ckImg > 
(FILTER_SPARSE2_PARAMS); \ +\ +FILTER_SPARSE2_HEAD < 4, 32, 4, 16, 4, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 4, 8, 4, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 4, 4, 4, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +\ +FILTER_SPARSE2_HEAD < 4, 32, 2, 16, 4, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 2, 8, 4, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 2, 4, 4, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +\ +FILTER_SPARSE2_HEAD < 4, 32, 1, 16, 4, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 1, 8, 4, scale, ckImg > (FILTER_SPARSE2_PARAMS); \ +FILTER_SPARSE2_HEAD < 4, 32, 1, 4, 4, scale, ckImg > (FILTER_SPARSE2_PARAMS); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale0_ckimg0.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale0_ckimg0.cu new file mode 100644 index 00000000..5b56a76c --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale0_ckimg0.cu @@ -0,0 +1,39 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale0_ckimg0.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "filter_act_sparse2.cuh" +namespace megdnn { +namespace cuda { + +FILTER_SPARSE2(false, false) + + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale0_ckimg1.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale0_ckimg1.cu new file mode 100644 index 00000000..bca263a7 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale0_ckimg1.cu @@ -0,0 +1,39 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale0_ckimg1.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "filter_act_sparse2.cuh" +namespace megdnn { +namespace cuda { + +FILTER_SPARSE2(false, true) + + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale1_ckimg0.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale1_ckimg0.cu new file mode 100644 index 00000000..d643271c --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale1_ckimg0.cu @@ -0,0 +1,39 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale1_ckimg0.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ +#include "filter_act_sparse2.cuh" +namespace megdnn { +namespace cuda { + +FILTER_SPARSE2(true, false) + + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale1_ckimg1.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale1_ckimg1.cu new file mode 100644 index 00000000..764c01c1 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale1_ckimg1.cu @@ -0,0 +1,39 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_scale1_ckimg1.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "filter_act_sparse2.cuh" +namespace megdnn { +namespace cuda { + +FILTER_SPARSE2(true, true) + + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_y4x32i4f16c4_tex.cu b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_y4x32i4f16c4_tex.cu new file mode 100644 index 00000000..e924b72f --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_y4x32i4f16c4_tex.cu @@ -0,0 +1,239 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_sparse2_y4x32i4f16c4_tex.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "filter_act_templates.cuh" + +namespace megdnn { +namespace cuda { + +template +__global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex (FILTER_ACTS_PARAMS) { + __shared__ float shFilters[colorCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters + __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + __syncthreads(); + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y*filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + // Another fun insanity: the % B_X makes things faster, even thought threadIdx.x is + // in the range 0..31. It appears that this allows the compiler to optimize? + const int tx = threadIdx.x % B_X; + const int ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + threadIdx.x; + + const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; + + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; + const int imgOffset = (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + +// images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + const int filterOffset = blockFilterIdx + + shFilterLoadY * numFilters * filterPixels + shFilterLoadX + (conv ? 
0 : moduleIdx * numFilterColors * filterPixels * numFilters); +// filters +=blockFilterIdx +// + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; +// if (!conv) { +// filters += moduleIdx * numFilterColors * filterPixels * numFilters; +// } + + targets += moduleIdx * numImages + + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + + myImgIdx; + + float prod[imgsPerThread][filtersPerThread]; +// float fCache[filtersPerThread]; + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] = 0; + } + } + // NOTE: these max/min functions increase register usage as compared to my macros + const int imgStartX = max(0, imgLoadModPosX); + const int imgStartY = max(0, imgLoadModPosY); + const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY); +// __shared__ int imgPos[] + + int fPidx, iPidx; + float imPreload[imgsPerThread]; // [4] + float fPreload[colorCache*filtersPerThread/B_X]; // [2] +// float fCache[filtersPerThread]; + + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgStartY, imgStartX, fPidx, iPidx); + + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + imPreload[i] = tex1Dfetch(images, imgOffset + imgStride * iPidx + i * B_X); + } else { + imPreload[i] = 0; + } + } + if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < B_X/filtersPerThread) { // This if statement reduces reg usage.. + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c*filtersPerThread/B_X] = tex1Dfetch(filters, filterOffset + (c * filterPixels + fPidx) * numFilters); + } + } + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { +// const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { +// const int filterPxX = imgX - imgLoadModPosX; +// const int p = filterPxY * filterSize + filterPxX; +// const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in img +// setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgY, imgX, &p, &pixIdx); +// float* m = &images[imgStride * pixIdx]; + const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1; + int imgYNext = imgY; + int imgXNext = imgX; + int fPidxNext, iPidxNext; + if (!lastPixel) { + imgYNext = imgY + (imgX + 1 == imgEndX); + imgXNext = imgX + 1 == imgEndX ? 
imgStartX : imgX + 1; + } + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgYNext, imgXNext, fPidxNext, iPidxNext); + for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) +// const float* ff = &filters[numFilters * ((oc + colorCache) * filterPixels + fPidx)]; +// const float* mm = &images[imgStride * ((oc + colorCache) * imgPixels + iPidx)]; + int imgOffset2 = imgOffset + imgStride * ((oc + colorCache) * imgPixels + iPidx); + int filterOffset2 = filterOffset + numFilters * ((oc + colorCache) * filterPixels + fPidx); + if (oc == numFilterColors - colorCache) { + filterOffset2 = filterOffset + fPidxNext * numFilters; + imgOffset2 = imgOffset + iPidxNext * imgStride; + fPidx = fPidxNext; + iPidx = iPidxNext; + } + + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + shFilters[c + shFilterLoadY][shFilterLoadX] = fPreload[c*filtersPerThread/B_X]; + } + + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! + shImages[ty][tx * imgsPerThread + i] = imPreload[i]; + } + imPreload[0] = (checkImgBounds && myImgIdx + 0 * B_X >= numImages) ? 0 : tex1Dfetch(images, imgOffset2 + 0 * B_X); + imPreload[1] = (checkImgBounds && myImgIdx + 1 * B_X >= numImages) ? 0 : tex1Dfetch(images, imgOffset2 + 1 * B_X); + imPreload[2] = (checkImgBounds && myImgIdx + 2 * B_X >= numImages) ? 0 : tex1Dfetch(images, imgOffset2 + 2 * B_X); + + __syncthreads(); + + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[0][threadIdx.x * imgsPerThread + i] * shFilters[0][threadIdx.y * filtersPerThread + f]; + } + } + + fPreload[0] = tex1Dfetch(filters, filterOffset2 + 0); + + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[1][threadIdx.x * imgsPerThread + i] * shFilters[1][threadIdx.y * filtersPerThread + f]; + } + } + + fPreload[1] = tex1Dfetch(filters, filterOffset2 + (B_X/filtersPerThread * filterPixels) * numFilters); + + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[2][threadIdx.x * imgsPerThread + i] * shFilters[2][threadIdx.y * filtersPerThread + f]; + } + } + + imPreload[3] = (checkImgBounds && myImgIdx + 3 * B_X >= numImages) ? 
0 : tex1Dfetch(images, imgOffset2 + 3 * B_X); + + #pragma unroll + for(int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[i][f] += shImages[3][threadIdx.x * imgsPerThread + i] * shFilters[3][threadIdx.y * filtersPerThread + f]; + } + } + __syncthreads(); + } + } + } + + if (scale) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = scaleTargets * targets[i * B_X + f * numImages * numModules] + scaleOutputs * prod[i][f]; + } + } + } + } else { + // Note: reversing order of these loops saves 2 registers, but costs time + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (!checkImgBounds || myImgIdx + i * B_X < numImages) { + targets[i * B_X + f * numImages * numModules] = scaleOutputs * prod[i][f]; + } + } + } + } +} + +template __global__ void +filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex +< 4, 32, 4, 16, 4, false, false >(FILTER_ACTS_PARAMS); + +template __global__ void +filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex +< 4, 32, 4, 16, 4, true, false >(FILTER_ACTS_PARAMS); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_templates.cuh b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_templates.cuh new file mode 100644 index 00000000..46b204a8 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_templates.cuh @@ -0,0 +1,155 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/filter_acts/filter_act_templates.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ +#include "../nvmatrix.cuh" +#include "../cudaconv2.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +__device__ inline void + filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords + (int filterSize, int imgSizeX, + int imgLoadModPosY, int imgLoadModPosX, + int imgY, int imgX, int& fPidx, int& iPidx) { + int filterPxY = imgY - imgLoadModPosY; + int filterPxX = imgX - imgLoadModPosX; + fPidx = filterPxY * filterSize + filterPxX; + iPidx = imgY * imgSizeX + imgX; // Pixel index in img +} + +#define FILTER_ACTS_PARAMS cudaTextureObject_t images, \ + cudaTextureObject_t filters, float* targets, \ + const int numImages, const int numFilters, \ + const int imgSizeY, const int imgSizeX, \ + const int filterSize, const int paddingStart, \ + const int moduleStride, \ + const int numModulesY, const int numModulesX, \ + const int imgStride, const int numImgColors, \ + const int numGroups, \ + const float scaleTargets, const float scaleOutputs, \ + const bool conv/*, const bool noloads*/ +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + */ +template +__global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex (FILTER_ACTS_PARAMS); + + + +#define FILTER_COLOR_PARAMS float* images, float* filters, float* targets, \ + const int numImages, const int numFilters, \ + const int imgSizeY, const int imgSizeX, \ + const int filterSize, const int paddingStart, \ + const int moduleStride, \ + const int numModulesY, const int numModulesX, \ + const int imgStride, \ + const float scaleTargets, const float scaleOutputs, \ + const bool conv +/* + * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images. + * threadIdx.x determines image + * threadIdx.y determines filter + * + * blockIdx.x determines image batch of B_X * imgsPerThread + * blockIdx.y determines filter batch of module and B_Y * filtersPerThread + * + * images: (numColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numColors, filterPixels, numFilters) if conv + * (numModules, numColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * + * Number of filters per module should be divisible by B_Y * filtersPerThread + * checkImgBounds indicates whether number of images is divisible by B_X * imgsPerThread + * + * The imgSize here is the size of the actual image without the padding. + * + */ + template +__global__ void filterActs_YxX_color(FILTER_COLOR_PARAMS); + + + + +#define FILTER_SPARSE2_PARAMS float* images, float* filters, float* targets, \ + const int numImages, const int numFilters, \ + const int imgSizeY, const int imgSizeX, \ + const int filterSize, const int paddingStart, \ + const int moduleStride, \ + const int numModulesY, const int numModulesX, \ + const int imgStride, const int numImgColors, \ + const int numGroups, \ + const float scaleTargets, const float scaleOutputs, \ + const bool conv +/* + * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images. 
+ * threadIdx.x determines image + * threadIdx.y determines filter + * + * blockIdx.x determines image batch of B_X * imgsPerThread + * blockIdx.y determines filter batch of B_Y * filtersPerThread + * + * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) otherwise + * + * targets: (numFilters, numModulesY, numModulesX, numImages) + * + * B_Y one of 4, 8, 16 + * B_X one of 16, 32 + * imgsPerThread one of 1, 2, 4 + * filtersPerThread one of 1, 2, 4, 8 + * colorCache: how many colors to put into shmem + * + * numFilters should be divisible by B_Y * filtersPerThread + * numImages be divisible by B_X * imgsPerThread + * numFilterColors should be divisible by colorCache. + * numImgColors must be even. + * numFilters must be divisible by numGroups. + * no restrictions on pixelCache + * The imgSize here is the size of the actual image without the padding. + * As always, try to make B_X * imgsPerThread == B_Y * filtersPerThread for maximum efficiency. + * + */ +template +__global__ void filterActs_YxX_sparse2(FILTER_SPARSE2_PARAMS); + +} // namespace megdnn +} // namespace cuda diff --git a/dnn/src/cuda/local/cuda-convnet2/helper_cuda.h b/dnn/src/cuda/local/cuda-convnet2/helper_cuda.h new file mode 100644 index 00000000..73b3426c --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/helper_cuda.h @@ -0,0 +1,25 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/helper_cuda.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * \file src/cuda/local/cuda-convnet2/helper_cuda.h + * + * This file is part of MegDNN, a deep neural network run-time library * developed by Megvii. + * + * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + */ + +#pragma once +#include "src/cuda/utils.cuh" +#include +#define checkCudaErrors(x) cuda_check(x) +#define getLastCudaError(x) cuda_check(cudaGetLastError()) + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts.cu b/dnn/src/cuda/local/cuda-convnet2/img_acts.cu new file mode 100644 index 00000000..ea1e7ed1 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts.cu @@ -0,0 +1,1042 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/* + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ + +#include "cudaconv2.cuh" + +#include "nvmatrix.cuh" +#include "img_acts/img_act_templates.cuh" + +#ifdef _WIN32 +#define _Pragma(x) +#endif + +namespace megdnn { +namespace cuda { +/* + * New Titan-optimized stuff. + */ + +__device__ __forceinline__ void conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(const int my, const int mx, const int numModulesX, + const int paddingStart, const int moduleStride, const int blockPixelIdxY, const int blockPixelIdxX, const int filterSize, int &moduleIdx, int &pxIdxInFilter) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + moduleIdx = my * numModulesX + mx; // out + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; // out +} + +#define IA_PRELOAD_LOOP(w,offset) _Pragma("unroll") \ +for (int i = 0; i < imgsPerThread; i++) { \ + _Pragma("unroll") \ + for (int c = 0; c < colorsPerThread; c++) { \ + prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w)+(offset)] * shHidActs[w][threadIdx.x * imgsPerThread + i]; \ + } \ +} \ + +/* + * Same loop as above but inverted. + */ +#define IA_PRELOAD_LOOP2(w,offset) _Pragma("unroll") \ +for (int c = 0; c < colorsPerThread; c++) { \ + _Pragma("unroll") \ + for (int i = 0; i < imgsPerThread; i++) { \ + prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w)+(offset)] * shHidActs[w][threadIdx.x * imgsPerThread + i]; \ + } \ +} \ + +#define IA_PRELOAD_LOOP3(i,offset) _Pragma("unroll") \ +for (int w = 0; w < filterCacheH; w++) { \ + _Pragma("unroll") \ + for (int c = 0; c < colorsPerThread; c++) { \ + prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w)+(offset)] * shHidActs[w][threadIdx.x * imgsPerThread + i]; \ + } \ +} \ + +#define IA_PRELOAD_W(z) wPreload[z] = fLoad[(z) * B_X*B_Y/filterCacheF * filterPixels * numFilters]; +#define IA_PRELOAD_W_TX(z) wPreload[z] = tex1Dfetch(filters, filtersLoadOffset + (z) * B_X*B_Y/filterCacheF * filterPixels * numFilters); +#define IA_PRELOAD_H(y,x) if (!checkCaseBounds || myCaseIdx + (x) * B_X < numImages) { \ + hPreload[y][x] = hLoad[(y) * B_Y * numModules * numImages + (x) * B_X]; \ +} +#define IA_PRELOAD_H_TX(y,x) if (!checkCaseBounds || myCaseIdx + (x) * B_X < numImages) { \ + hPreload[y][x] = tex1Dfetch(hidActs, hidActsLoadOffset + (y) * B_Y * numModules * numImages + (x) * B_X); \ +} + +template +__global__ void +__launch_bounds__(256, 2) // 256 threads per block, 2 blocks per multiprocessor + // These launch bounds ensure 25% occupancy (128 registers used) + // as oppposed to 13% (130 registers) achieved by defaults. 
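+                          // (Added note: __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor)
+                          // asks the compiler to cap register usage so that at least 2 blocks of 256 threads
+                          // can be resident per SM; the occupancy/register figures above apply to the
+                          // Kepler-class GPUs this kernel was tuned for and will differ on other architectures.)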
+conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObject_t hidActs, cudaTextureObject_t filters, float* targets, + const int numModulesY, const int numModulesX, const int numImages, const int numFilters, + const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride, + const int numImgColors, const int numGroups, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread*B_Y][filterCacheF]; + __shared__ float shHidActs[filterCacheH][B_X*imgsPerThread]; + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; + const int myCaseIdx = blockCaseIdx + threadIdx.x; + + const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; +// const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x % B_X; + //const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % (B_X*imgsPerThread); + const int filtersLoadY = tidx / filterCacheF, filtersLoadX = tidx % filterCacheF; + // nvcc is behaving idiotically again, these useless declarations save registers + //const int outputY = threadIdx.y, outputX = threadIdx.x; + //const int ty = threadIdx.y, tx = threadIdx.x; + const int numModules = numModulesY * numModulesX; + const int hidActsOffset = (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; + const int filtersOffset = blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; +// hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; +// filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + myCaseIdx; + + float prod[colorsPerThread][imgsPerThread]; + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize ? 
0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; + //const bool noFLoop = filterCacheF == filterCacheH; + + /* + * Initial preload + */ + float hPreload[filterCacheH/B_Y][imgsPerThread]; // [2][4] + float wPreload[filterCacheF*colorsPerThread/B_X]; // [8] + + int moduleIdx, pxIdxInFilter; + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(startY, startX, numModulesX, paddingStart, moduleStride, blockPixelIdxY, + blockPixelIdxX, filterSize, moduleIdx, pxIdxInFilter); +// const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0] +// : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + 0]; + int filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilter * numFilters + 0 + : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters); + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { + if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { + wPreload[i * filterCacheF/(B_X*B_Y)] = tex1Dfetch(filters, filtersLoadOffset + i * filterPixels * numFilters); + } + } + +// const float* hLoad = &hidActs[(moduleIdx + 0 * numModules) * numImages]; + int hidActsLoadOffset = hidActsOffset + (moduleIdx + 0 * numModules) * numImages; + #pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + hPreload[j/B_Y][i] = tex1Dfetch(hidActs, hidActsLoadOffset + j * numModules * numImages + i * B_X); + } + } + } + } + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext; + const bool lastModule = my == endY - 1 && mx == endX - 1; + if (!lastModule) { + mxNext = mx + 1 == endX ? startX : mx + 1; + myNext = my + (mx + 1 == endX); + } + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(myNext, mxNext, numModulesX, paddingStart, moduleStride, blockPixelIdxY, + blockPixelIdxX, filterSize, moduleIdxNext, pxIdxInFilterNext); + for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { + if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { + shFilterLoad[i * filterCacheF] = wPreload[i * filterCacheF/(B_X*B_Y)]; + } + } + + filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilter * numFilters + f + filterCacheF + : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f + filterCacheF); + if (f == numFiltersPerGroup - filterCacheF) { + filtersLoadOffset = filtersOffset + (conv ? 
pxIdxInFilterNext * numFilters + : moduleIdxNext * numFilterColors * filterPixels * numFilters + pxIdxInFilterNext * numFilters); + } + + #pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i]; + } + } + } + } + + __syncthreads(); + + hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheH) * numModules) * numImages; + + #pragma unroll + for (int z = 0; z < 4; ++z) { + IA_PRELOAD_LOOP(z,0); + IA_PRELOAD_W_TX(z); + } + + #pragma unroll + for (int z = 4; z < 12; ++z) { + IA_PRELOAD_LOOP(z,0); + IA_PRELOAD_H_TX((z-4)/4,z%4); + } + + #pragma unroll + for (int z = 12; z < 16; ++z) { + IA_PRELOAD_LOOP(z,0); + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i]; + } + } + } + } + + __syncthreads(); + + hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheF) * numModules) * numImages; + if (f == numFiltersPerGroup - filterCacheF) { + hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages; + } + + #pragma unroll + for (int z = 0; z < 4; ++z) { + IA_PRELOAD_LOOP(z,filterCacheH); + IA_PRELOAD_W_TX(z+4); + } + + #pragma unroll + for (int z = 4; z < 12; ++z) { + IA_PRELOAD_LOOP(z,filterCacheH); + IA_PRELOAD_H_TX((z-4)/4, z%4); + } + + #pragma unroll + for (int z = 12; z < 16; ++z) { + IA_PRELOAD_LOOP(z,filterCacheH); + } + + __syncthreads(); + } + } + } + if (scale) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; + } + } + } + } else { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i]; + } + } + } + } +} + + +template +__global__ void +//__launch_bounds__(128, 3) // 128 threads per block, 3 blocks per multiprocessor +conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16(cudaTextureObject_t hidActs, cudaTextureObject_t filters, float* targets, + const int numModulesY, const int numModulesX, const int numImages, const int numFilters, + const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride, + const int numImgColors, const int numGroups, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread*B_Y][filterCacheF]; + __shared__ float shHidActs[filterCacheH][B_X*imgsPerThread]; + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; + const int myCaseIdx = 
blockCaseIdx + threadIdx.x; + + const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; +// const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x % B_X; + //const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % (B_X*imgsPerThread); + const int filtersLoadY = tidx / filterCacheF, filtersLoadX = tidx % filterCacheF; + // nvcc is behaving idiotically again, these useless declarations save registers + //const int outputY = threadIdx.y, outputX = threadIdx.x; + //const int ty = threadIdx.y, tx = threadIdx.x; + const int numModules = numModulesY * numModulesX; + + const int hidActsOffset = (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; + const int filtersOffset = blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; + +// hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; +// filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + myCaseIdx; + + float prod[colorsPerThread][imgsPerThread]; + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; + //const bool noFLoop = filterCacheF == filterCacheH; + + /* + * Initial preload + */ + float hPreload[filterCacheH/B_Y][imgsPerThread]; // [4][4] + float wPreload[filterCacheF*colorsPerThread/B_X]; // [6] + + int moduleIdx, pxIdxInFilter; + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(startY, startX, numModulesX, paddingStart, moduleStride, blockPixelIdxY, + blockPixelIdxX, filterSize, moduleIdx, pxIdxInFilter); +// const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0] +// : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + 0]; + int filtersLoadOffset = filtersOffset + (conv ? 
pxIdxInFilter * numFilters + : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters); + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { + if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { + wPreload[i * filterCacheF/(B_X*B_Y)] = tex1Dfetch(filters, filtersLoadOffset + i * filterPixels * numFilters); + } + } + +// const float* hLoad = &hidActs[moduleIdx * numImages]; + int hidActsLoadOffset = hidActsOffset + moduleIdx * numImages; + #pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + hPreload[j/B_Y][i] = tex1Dfetch(hidActs, hidActsLoadOffset + j * numModules * numImages + i * B_X); + } + } + } + } + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext; + const bool lastModule = my == endY - 1 && mx == endX - 1; + if (!lastModule) { + mxNext = mx + 1 == endX ? startX : mx + 1; + myNext = my + (mx + 1 == endX); + } + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(myNext, mxNext, numModulesX, paddingStart, moduleStride, blockPixelIdxY, + blockPixelIdxX, filterSize, moduleIdxNext, pxIdxInFilterNext); + for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { + if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { + shFilterLoad[i * filterCacheF] = wPreload[i * filterCacheF/(B_X*B_Y)]; + } + } + + filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilter * numFilters + f + filterCacheF + : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f + filterCacheF); + if (f == numFiltersPerGroup - filterCacheF) { + filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilterNext * numFilters + : moduleIdxNext * numFilterColors * filterPixels * numFilters + pxIdxInFilterNext * numFilters); + } + + #pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // NOTE: bank conflicts here! + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i]; + } + } + } + } + hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheF) * numModules) * numImages; + if (f == numFiltersPerGroup - filterCacheF) { + hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages; + } + + __syncthreads(); + + // It seems that there is no point explicitly interleaving loads + // and computations because the scheduler does that anyway. 
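+                    // (Added note: the fully unrolled sequence below walks all filterCacheH == 16 rows of
+                    // shared memory, accumulating into prod[][] via IA_PRELOAD_LOOP2, and the IA_PRELOAD_W_TX /
+                    // IA_PRELOAD_H_TX calls that follow prefetch the next tile's filter weights and hidden
+                    // activations into wPreload[] / hPreload[][] through the texture objects.)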
+ + IA_PRELOAD_LOOP2(0,0); + IA_PRELOAD_LOOP2(1,0); + IA_PRELOAD_LOOP2(2,0); + IA_PRELOAD_LOOP2(3,0); + IA_PRELOAD_LOOP2(4,0); + IA_PRELOAD_LOOP2(5,0); + IA_PRELOAD_LOOP2(6,0); + IA_PRELOAD_LOOP2(7,0); + IA_PRELOAD_LOOP2(8,0); + IA_PRELOAD_LOOP2(9,0); + IA_PRELOAD_LOOP2(10,0); + IA_PRELOAD_LOOP2(11,0); + IA_PRELOAD_LOOP2(12,0); + IA_PRELOAD_LOOP2(13,0); + IA_PRELOAD_LOOP2(14,0); + IA_PRELOAD_LOOP2(15,0); + + IA_PRELOAD_W_TX(0); + IA_PRELOAD_W_TX(1); + IA_PRELOAD_W_TX(2); + IA_PRELOAD_W_TX(3); + IA_PRELOAD_W_TX(4); + IA_PRELOAD_W_TX(5); + + IA_PRELOAD_H_TX(0,0); + IA_PRELOAD_H_TX(0,1); + IA_PRELOAD_H_TX(0,2); + IA_PRELOAD_H_TX(0,3); + IA_PRELOAD_H_TX(1,0); + IA_PRELOAD_H_TX(1,1); + IA_PRELOAD_H_TX(1,2); + IA_PRELOAD_H_TX(1,3); + IA_PRELOAD_H_TX(2,0); + IA_PRELOAD_H_TX(2,1); + IA_PRELOAD_H_TX(2,2); + IA_PRELOAD_H_TX(2,3); + IA_PRELOAD_H_TX(3,0); + IA_PRELOAD_H_TX(3,1); + IA_PRELOAD_H_TX(3,2); + IA_PRELOAD_H_TX(3,3); + + __syncthreads(); + } + } + } + if (scale) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; + } + } + } + } else { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i]; + } + } + } + } +} + +/* + * hidActs: (numFilters, numModules, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModules, numFilterColors, filterPixels, numFilters) otherwise + * targets: (overSample, numImgColors, imgPixels, numImages) + * + * Note: all of these convolution routines are optimized for the case when + * the number of images (i.e. the minibatch size) is a multiple of 128. + * Other batch sizes will work, but but I made no attempt whatsoever + * to make them work fast. + */ +void _imgActs(cudaStream_t stream, NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups, + float scaleTargets, float scaleOutput, bool conv) { + int numFilterColors = numImgColors / numGroups; + int numImages = hidActs.getNumCols(); + int numFilters = filters.getNumCols(); + int numModules = hidActs.getNumRows() / numFilters; + int filterModuleMult = conv ? 1 : numModules; + int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors); + int filterSize = sqrt(filterPixels); + int imgPixels = imgSizeY * imgSizeX; + int numModulesX = numModules / numModulesY; + + megdnn_assert_internal(numImgColors % numGroups == 0); + //megdnn_assert_internal(numFilters % (16*numGroups) == 0); // TODO: insisting on 32 filters due to bug in calling code below. fix that. 
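+    // (Added note: previous_limit records whether numFilters still satisfies the old
+    // 16 * numGroups divisibility requirement; the dispatch below selects the
+    // texture-preload kernels only when it holds and otherwise falls back to the
+    // *_kepler variants, which do not require it.)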
+ bool previous_limit = (numFilters % (16 * numGroups)) == 0; + + megdnn_assert_internal(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0))); + megdnn_assert_internal(numGroups == 1 || numFilterColors % 4 == 0); + + megdnn_assert_internal(filterPixels == filterSize * filterSize); + megdnn_assert_internal(hidActs.getNumRows() == numModules * numFilters); + megdnn_assert_internal(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels); + megdnn_assert_internal(numModules == numModulesY * numModulesX); + + megdnn_assert_internal(hidActs.isContiguous()); + megdnn_assert_internal(filters.isContiguous()); + + megdnn_assert_internal(!hidActs.isTrans()); + megdnn_assert_internal(!filters.isTrans()); + megdnn_assert_internal(!targets.isTrans()); + // These routines don't handle the case when only part of the image is visited in the convolution + megdnn_assert_internal(paddingStart <= 0); + megdnn_assert_internal(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX); + megdnn_assert_internal(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY); + megdnn_assert_internal(moduleStride <= filterSize); + + megdnn_assert_internal(targets.isContiguous()); // no stride support here! + + dim3 blocks; + dim3 threads; + int colorsPerThread = 0, imgsPerThread = 0; + if (numFilterColors % 8 == 0) { + threads = dim3(32, numFilterColors % 64 == 0 ? 8 : 4); + colorsPerThread = numFilterColors % 64 == 0 ? 8 + : numFilterColors % 48 == 0 ? 12 + : numFilterColors % 32 == 0 ? 8 + : numFilterColors % 16 == 0 ? 4 + : 2; + imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1; + megdnn_assert_internal(numFilterColors % (threads.y * colorsPerThread) == 0); + //previous_limit = numFilterColors % (threads.y * colorsPerThread) == 0; + + blocks = dim3(DIVUP(numImages, threads.x*imgsPerThread) * (numImgColors/(threads.y*colorsPerThread)), imgPixels); + // NOTE: the case when channels % 32 == 0 but channels % 48 != 0 and channels % 64 != 0 has not been optimized!! + } else if (numFilterColors > 3) { + // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!! + imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2; + threads = dim3(16, 16); + colorsPerThread = numFilterColors % 4 == 0 ? 4 : 2; + blocks = dim3(DIVUP(numImages,threads.x*imgsPerThread) * (numImgColors / colorsPerThread), DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4)); + } else { + // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!! + imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 
4 : 2; + threads = dim3(16, 16); + blocks = dim3(DIVUP(numImages,threads.x*imgsPerThread), DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4)); + } + bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0; + + if (scaleTargets == 0) { // do not scale or use targets matrix + targets.resize(numImgColors*imgPixels, numImages); + } else { + megdnn_assert_internal(targets.getNumRows() == numImgColors * imgPixels); + megdnn_assert_internal(targets.getNumCols() == numImages); + } + const bool scale = scaleTargets != 0; +// cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, true >, cudaFuncCachePreferShared); +// conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, true ><<>>( +// hidActs.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, +// imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + + //return; +// printf("conv: %d\n", conv); +// printf("scale: %d\n", scale); +// printf("checkCaseBounds: %d\n", checkCaseBounds); +// printf("numFilterColors: %d\n", numFilterColors); +// printf("numImages: %d\n", numImages); +// cudaStream_t stream = NVMatrix::getDefaultStream(); + + if (conv == false) { + if (scale == false) { + if (checkCaseBounds == false) { + if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, false, false, false ><<>>(hidActs.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } else { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + 
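+                            // A batch that is only a multiple of 16 reuses the same
+                            // imgsPerThread == 1 instantiation as the % 32 case above;
+                            // there is no narrower variant.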
cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + else if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 4, 8, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors % 48 == 0) { + if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4, 12, 16, 16, false, false, false ><<>>(hidActs.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } else { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 12, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 4, 12, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + else if (numImages % 64 == 0) { + 
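+                        // Only full 128-image batches are eligible for the texture-preload
+                        // kernel above; 64-image batches fall through to the plain Kepler
+                        // kernel with two images per thread.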
cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 2, 12, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 32, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 32, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + else if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, false, false, 
false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 4, 8, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 2, 8, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors % 16 == 0) { + if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 4, 4, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 2, 4, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, false, false 
><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors % 8 == 0) { + if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 4, 2, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 2, 2, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + } + else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(img_acts_mediumcolor < 8, 4, false, false, false >, cudaFuncCachePreferShared); + img_acts_mediumcolor < 8, 4, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(img_acts_mediumcolor < 4, 4, false, false, false >, cudaFuncCachePreferShared); + img_acts_mediumcolor < 4, 4, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, false, false, false >, cudaFuncCachePreferShared); + img_acts_mediumcolor < 2, 4, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, 
numImgColors, numGroups, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, false, false, false >, cudaFuncCachePreferShared); + img_acts_mediumcolor < 2, 4, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors == 2) { + if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 8, 2, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 8, 2, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 4, 2, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 4, 2, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 2, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 2, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + } + } + } + else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 8, 3, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 8, 3, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 4, 3, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 4, 3, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 3, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 3, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 3, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 3, 
false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors == 2) { + if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 8, 2, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 8, 2, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 4, 2, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 4, 2, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 2, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 2, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors == 1) { + if ((numFilters % 1 == 0)) { + if (numImages % 128 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 8, 1, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 8, 1, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 64 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 4, 1, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 4, 1, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 32 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 1, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 1, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + else if (numImages % 16 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 1, false, false, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 1, false, false, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + } + } + } + } + else if (checkCaseBounds == true) { 
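+            // numImages is not a multiple of threads.x * imgsPerThread here, so every
+            // kernel below is instantiated with its smallest imgsPerThread and with the
+            // checkCaseBounds template flag set, adding a per-image bounds test on each
+            // global load and store.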
+ if (numFilterColors % 8 == 0) { + if (numFilterColors % 64 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, true, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + else if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, true, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 16, 16, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors % 48 == 0) { + if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, true, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 12, 16, 16, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors % 32 == 0) { + if (numFilters % 32 == 0) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, true, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 32, 16, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + else if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, true, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 8, 16, 16, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors % 16 == 0) { + if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, true, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 4, 16, 16, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors % 8 == 0) { + if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, true, false >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler < 4, 32, 1, 2, 16, 16, false, true, false 
><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + } + else if (numFilterColors > 3) { + if (numFilterColors == 4) { + if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(img_acts_mediumcolor < 2, 4, false, true, false >, cudaFuncCachePreferShared); + img_acts_mediumcolor < 2, 4, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + } + } + } + /* + else if (numFilterColors == 2) { + if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, true, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 2, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + } + } + */ + } + else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 3, false, true, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 3, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors == 2) { + if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 2, false, true, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 2, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors == 1) { + if ((numFilters % 1 == 0)) { + if (numImages % 1 == 0) { + cudaFuncSetCacheConfig(img_acts_color < 2, 1, false, true, false >, cudaFuncCachePreferShared); + img_acts_color < 2, 1, false, true, false ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput); + } + } + } + } + } + } + } + + getLastCudaError("imgActs: kernel execution failed"); +} + + +void convImgActs(cudaStream_t stream, NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups) { + _imgActs(stream, hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, 0, 1, true); +} + +void convImgActs(cudaStream_t stream, NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups, + float scaleTargets, float scaleOutput) { + _imgActs(stream, hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, true); +} + +void localImgActs(cudaStream_t 
stream, NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups) { + _imgActs(stream, hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, 0, 1, false); +} + +void localImgActs(cudaStream_t stream, NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, + int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups, + float scaleTargets, float scaleOutput) { + _imgActs(stream, hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, false); +} + +} // namespace cuda +} // namespace megdnn + diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color.cuh b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color.cuh new file mode 100644 index 00000000..672bc1af --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color.cuh @@ -0,0 +1,221 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "img_act_templates.cuh" + +namespace megdnn { +namespace cuda { + +/* + * Block size: 16x16. + * blockIdx.x determines case in batches of 16*imgsPerThread. + * blockIdx.y determines 4x4 image region in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines pixel. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numColors, filterPixels, numFilters) otherwise + * targets: (numColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases. + * + * Number of filters must be divisible by 16. + * Number of images must be divisible by 16*imgsPerThread if checkCaseBounds is false. + * 16 * imgsPerThread must be divisible by 32. + * + * This version loads 32 cases at a time, so it gets full coalescing on that load. + * It only loads 16 weights at a time, so those aren't fully coalesced. 
+ * This version conserves shared memory by loading 16 filters at a time rather than 32. + */ +template +__global__ void img_acts_color(const float* hidActs, const float* filters, float* targets, + const int numModulesY, const int numModulesX, const int numImages, const int numFilters, + const int filterSize, const int imgSizeY, const int imgSizeX, + const int paddingStart, const int moduleStride, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shFilters[numColors*16][16 + 1]; + __shared__ float shHidActs[16][16*imgsPerThread]; + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int blockCaseIdx = blockIdx.x * 16*imgsPerThread; + const int numRegionsX = DIVUP(imgSizeX, 4); + const int blockRegionIdx = blockIdx.y; + const int blockRegionIdxX = blockRegionIdx % numRegionsX; + const int blockRegionIdxY = blockRegionIdx / numRegionsX; + const int blockRegionLeft = blockRegionIdxX * 4; + const int blockRegionTop = blockRegionIdxY * 4; + const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4; + const int pxY = blockRegionTop + pxYInRegion; + const int pxX = blockRegionLeft + pxXInRegion; + const int pxIdx = pxY * imgSizeX + pxX; + const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX; + const int numModules = numModulesY * numModulesX; + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeX * imgSizeY; + const int tidx = threadIdx.y * 16 + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + hidActs += blockCaseIdx + loadY * numImages * numModules + loadX; + filters += threadIdx.x; + targets += pxIdx * numImages + blockCaseIdx + threadIdx.x; + + + float prod[numColors][imgsPerThread]; + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + const int startY = blockRegionTop - paddingStart < filterSize ? 0 + : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride; + const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride); + const int startX = blockRegionLeft - paddingStart < filterSize ? 0 + : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride; + const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride); + + float* shilterLoad = &shFilters[threadIdx.y][threadIdx.x]; + float* shHidActLoad = &shHidActs[loadY][loadX]; + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInModuleY = pxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInModuleX = pxX - moduleLeft; + + const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize; + const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX; + + for (int f = 0; f < numFilters; f += 16) { // multiply with 16 filters at a time + // Now the threads split up into half-warps, and each half-warp decides if it's interested. 
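+                // Each f-iteration stages 16 filters' worth of hidden activations (and, for
+                // pixels that fall inside both the image and this filter, the matching 16
+                // weight columns) into shared memory, then accumulates the 16 resulting
+                // partial products per (color, image) pair in the compute block below.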
+ const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; + #pragma unroll + for (int i = 0; i < imgsPerThread * 16; i += 32) { + if (!checkCaseBounds || blockCaseIdx + i + loadX < numImages) { + #pragma unroll + for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time. + if (f + loadY + j < numFilters) { + shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i]; + } else { + shHidActLoad[j * 16 * imgsPerThread + i] = 0; + } + } + } else { + #pragma unroll + for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = 0; + } + } + } + + if (isPxInImg && isPxInModule) { + // This half-warp is interested, so it's going to load the weights from this module to its pixel. + // Not fully coalesced read :( + // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much. + const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f] + : &filters[(moduleIdx * numColors * filterPixels + pxIdxInModule) * numFilters + f]; + #pragma unroll + for (int c = 0; c < numColors; c++) { + if (f + threadIdx.x < numFilters) { + shilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters]; + } else { + shilterLoad[c * 16 * (16 + 1)] = 0; + } + } + + + } + + __syncthreads(); + // Do some actual computation + if (isPxInImg && isPxInModule) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int w = 0; w < 16; w++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16]; + } + } + } + } + __syncthreads(); + } + } + } + // Not fully coalesced write :(... 
shmem (and fully coalesced) version is actually slightly slower, though + if (isPxInImg) { + if (scale) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + targets[c * imgPixels * numImages + i * 16] = scaleTargets * targets[c * imgPixels * numImages + i * 16] + scaleOutputs * prod[c][i]; + } + } + } + } else { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + targets[c * imgPixels * numImages + i * 16] = scaleOutputs * prod[c][i]; + } + } + } + } + } +} + +#define IMG_COLOR_K_HEAD template __global__ void img_acts_color +#define IMG_COLOR_K(scale, ckCase, conv) \ + IMG_COLOR_K_HEAD < 8, 2, scale, ckCase, conv >(COLOR_KEP_PARAM); \ + IMG_COLOR_K_HEAD < 4, 2, scale, ckCase, conv >(COLOR_KEP_PARAM); \ + IMG_COLOR_K_HEAD < 2, 2, scale, ckCase, conv >(COLOR_KEP_PARAM); \ + IMG_COLOR_K_HEAD < 8, 3, scale, ckCase, conv >(COLOR_KEP_PARAM); \ + IMG_COLOR_K_HEAD < 4, 3, scale, ckCase, conv >(COLOR_KEP_PARAM); \ + IMG_COLOR_K_HEAD < 2, 3, scale, ckCase, conv >(COLOR_KEP_PARAM); \ + IMG_COLOR_K_HEAD < 8, 1, scale, ckCase, conv >(COLOR_KEP_PARAM); \ + IMG_COLOR_K_HEAD < 4, 1, scale, ckCase, conv >(COLOR_KEP_PARAM); \ + IMG_COLOR_K_HEAD < 2, 1, scale, ckCase, conv >(COLOR_KEP_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color_ff.cu b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color_ff.cu new file mode 100644 index 00000000..fd4f7a9e --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color_ff.cu @@ -0,0 +1,40 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color_ff.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ +#include "img_act_color.cuh" + +namespace megdnn { +namespace cuda { + +IMG_COLOR_K(false, false, false) +//IMG_COLOR_K(false, false, true) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color_ft.cu b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color_ft.cu new file mode 100644 index 00000000..d412a0cf --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color_ft.cu @@ -0,0 +1,40 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_color_ft.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "img_act_color.cuh" + +namespace megdnn { +namespace cuda { + +IMG_COLOR_K(false, true, false) +//IMG_COLOR_K(false, true, true) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor.cuh b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor.cuh new file mode 100644 index 00000000..eaf13217 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor.cuh @@ -0,0 +1,192 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ + +/* + * Block size: B_YxB_X. + * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in batches of B_Y*colorsPerThread. + * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread) + * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread) + * blockIdx.y determines image pixel in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines color. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise + * targets: (numImageColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from B_X*imgsPerThread cases. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false. + * numFiltersPerGroup must be divisible by filterCache. + * + * B_X * imgsPerThread must be divisible by 32. + * numFilterColors must be divisible by B_Y*colorsPerThread. + * B_X*B_Y must be divisible by 32. + * filterCache must be divisible by B_X*B_Y/32 + * B_X*B_Y must be divisible by filterCache + + * This version loads 32 cases at a time, so it gets full coalescing on that load. + * It only loads filterCache weights at a time, so those aren't fully coalesced (depending on size of filterCache). + * + * To be used when there are >= 16 color channels. + */ +template +__global__ void conv_img_acts_manycolor(const float* hidActs, const float* filters, float* targets, + const int numModulesY, const int numModulesX, const int numImages, const int numFilters, + const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride, + const int numImgColors, const int numGroups, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread*B_Y][filterCache + 1]; + __shared__ float shHidActs[filterCache][B_X*imgsPerThread]; + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; + + const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int hidActLoadY = tidx / 32, hidActLoadX = tidx % 32; + const int filtersLoadY = tidx / filterCache, filtersLoadX = tidx % filterCache; + const int numModules = numModulesY * numModulesX; + + hidActs += 
blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX; + filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x; + + float prod[colorsPerThread][imgsPerThread]; + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = MIN(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = MIN(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX]; + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + + for (int f = 0; f < numFiltersPerGroup; f += filterCache) { // multiply with filterCache filters at a time + const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; + #pragma unroll + for (int i = 0; i < imgsPerThread * B_X; i += 32) { + if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) { + #pragma unroll + for (int j = 0; j < filterCache; j += B_X*B_Y/32) { // load filterCache rows of imgsPerThread*B_X cols, 8 * 32 elements at a time. + shHidActLoad[j * B_X * imgsPerThread + i] = hLoad[j * numModules * numImages + i]; + } + } else { + #pragma unroll + for (int j = 0; j < filterCache; j += B_X*B_Y/32) { // load filterCache rows of imgsPerThread*B_X cols, 8 * 32 elements at a time. + shHidActLoad[j * B_X * imgsPerThread + i] = 0; + } + } + } + const float* fLoad = conv ? 
&filters[pxIdxInFilter * numFilters + f] + : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f]; + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCache) { + if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCache) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { + shFilterLoad[i * (filterCache + 1)] = fLoad[i * filterPixels * numFilters]; + } + } + + __syncthreads(); + // Do some actual computation + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + #pragma unroll + for (int w = 0; w < filterCache; w++) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][i] += shFilters[c * B_Y + threadIdx.y][w] * shHidActs[w][threadIdx.x + i * B_X]; + } + } + } + __syncthreads(); + } + } + } + if (scale) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; + } + } + } + } else { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i]; + } + } + } + } +} diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler.cuh b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler.cuh new file mode 100644 index 00000000..b893653a --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler.cuh @@ -0,0 +1,264 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "img_act_templates.cuh" + +namespace megdnn { +namespace cuda { + +/* + * Block size: B_YxB_X. + * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in batches of B_Y*colorsPerThread. 
+ * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread) + * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread) + * blockIdx.y determines image pixel in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines color. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise + * targets: (numImageColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from B_X*imgsPerThread cases. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false. + * numFiltersPerGroup must be divisible by filterCacheF. + * + * numFilterColors must be divisible by B_Y*colorsPerThread. + * B_X*B_Y must be divisible by filterCacheF + * filterCacheF must be divisible by filterCacheH + * + * This version loads 32 cases at a time, so it gets full coalescing on that load. + * It only loads filterCacheF weights at a time, so those aren't fully coalesced (depending on size of filterCacheF). + * + * To be used when there are >= 16 color channels. + */ +template +__global__ void conv_img_acts_manycolor_kepler(const float* hidActs, const float* filters, float* targets, + const int numModulesY, const int numModulesX, const int numImages, const int numFilters, + const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride, + const int numImgColors, const int numGroups, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread*B_Y][filterCacheF]; + __shared__ float shHidActs[filterCacheH][B_X*imgsPerThread]; + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; + + const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * B_X + threadIdx.x; + const int hidActLoadY = threadIdx.y, hidActLoadX = threadIdx.x; + //const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % (B_X*imgsPerThread); + const int filtersLoadY = tidx / filterCacheF, filtersLoadX = tidx % filterCacheF; + // nvcc is behaving idiotically again, these useless declarations save registers + //const int outputY = threadIdx.y, outputX = threadIdx.x; + //const int ty = threadIdx.y, tx = threadIdx.x; + const int numModules = numModulesY * numModulesX; + + hidActs += blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX; + filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + threadIdx.y) * imgPixels * 
numImages + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x; + //bool active_t = filtersLoadX < numFilters; + + float prod[colorsPerThread][imgsPerThread]; + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX]; + //const bool noFLoop = filterCacheF == filterCacheH; + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + + for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time + const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + f] + : &filters[(moduleIdx * numFilterColors * filterPixels + pxIdxInFilter) * numFilters + f]; + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { + if (((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || + i + filtersLoadY < colorsPerThread*B_Y) && + f + filtersLoadX < numFiltersPerGroup) { + shFilterLoad[i * filterCacheF] = fLoad[i * filterPixels * numFilters]; + } else { + shFilterLoad[i * filterCacheF] = 0; + + } + } + //#pragma unroll + for (int fh = f; fh < f + filterCacheF; fh += filterCacheH) { + //conv_img_acts_manycolor_dummy_fhLoop(hidActs, shHidActLoad, shHidActs, shFilters, moduleIdx, numImages, hidActLoadY, hidActLoadX, blockCaseIdx, numModules, f, fh, prod); + + const float* hLoad = &hidActs[(moduleIdx + fh * numModules) * numImages]; + int hload_offset = blockFilterIdx + hidActLoadY + fh; + #pragma unroll + for (int j = 0; j < filterCacheH; j += B_Y) { + if (filterCacheH % B_Y == 0 || hidActLoadY + j < filterCacheH) { + #pragma unroll + for (int i = 0; i < imgsPerThread*B_X; i += B_X) { + if ((!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) + && hload_offset + j < numFilters) { + shHidActLoad[j * B_X * imgsPerThread + i] = hLoad[j * numModules * numImages + i]; + } else { + shHidActLoad[j * B_X * imgsPerThread + i] = 0; + } + } + } + } + __syncthreads(); + + // Do some actual computation + // Using these variables causes register usage to go from 161 --> 123. + // But nonetheless, the high-register version is faster. 
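+                // [Editor's note, illustrative only] Concrete numbers for one of the
+                // instantiations emitted by IMG_MANY_COLOR_K below, reading its template
+                // arguments as B_Y=8, B_X=32, imgsPerThread=4, colorsPerThread=8,
+                // filterCacheF=32, filterCacheH=16 (an assumption about the parameter
+                // order, since the template parameter list is not shown here):
+                //   shFilters is [colorsPerThread*B_Y][filterCacheF] = [64][32] floats (8 KB),
+                //   shHidActs is [filterCacheH][B_X*imgsPerThread]   = [16][128] floats (8 KB),
+                // and in the loop below each of the 8*32 = 256 threads accumulates a
+                // colorsPerThread x imgsPerThread = 8x4 register tile prod[][], so per
+                // filterCacheH slice of cached filters the block produces a 64-color by
+                // 128-image tile of one output pixel.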
+ //const float* shF = &shFilters[threadIdx.y][fh-f]; + //const float* const shF2 = &shFilters[threadIdx.y][fh]; + //const float* shH = &shHidActs[0][threadIdx.x]; + #pragma unroll + for (int w = 0; w < filterCacheH; w++) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + // for test (checking result) + //float hid_val = shHidActs[w][threadIdx.x + i * B_X]; + //if (isnan(hid_val)) { + // hid_val = 0; + //} + prod[c][i] += shFilters[c * B_Y + threadIdx.y][fh-f + w] * shHidActs[w][threadIdx.x + i * B_X]; + + } + } + } + __syncthreads(); + + } + } + } + } + if (scale) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; + } + } + } + } else { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i]; + } + } + } + } +} + +#define IMG_MANY_COLOR_K_HEAD template __global__ void conv_img_acts_manycolor_kepler +#define IMG_MANY_COLOR_K(scale, ckCase, conv) \ + IMG_MANY_COLOR_K_HEAD< 8, 32, 4, 8, 32, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 8, 32, 2, 8, 32, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 8, 32, 1, 8, 32, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + \ + IMG_MANY_COLOR_K_HEAD< 8, 32, 4, 8, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 8, 32, 2, 8, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 8, 32, 1, 8, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 4, 12, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 2, 12, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 1, 12, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 4, 8, 32, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 2, 8, 32, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 1, 8, 32, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 4, 8, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 2, 8, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 1, 8, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 4, 4, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 2, 4, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 1, 4, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 4, 2, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 2, 2, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + IMG_MANY_COLOR_K_HEAD< 4, 32, 1, 2, 16, 16, scale, ckCase, conv > (MANYCOLOR_KEP_PARAM); \ + +// ftt +//< 8, 32, 1, 8, 32, 16, scale, conv, conv > +//< 8, 32, 1, 8, 16, 16, scale, conv, conv > +//< 
4, 32, 1, 12, 16, 16, scale, conv, conv > +//< 4, 32, 1, 8, 32, 16, scale, conv, conv > +//< 4, 32, 1, 8, 16, 16, scale, conv, conv > +//< 4, 32, 1, 4, 16, 16, scale, conv, conv > +//< 4, 32, 1, 2, 16, 16, scale, conv, conv > + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler_fff.cu b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler_fff.cu new file mode 100644 index 00000000..027efb9a --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler_fff.cu @@ -0,0 +1,39 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler_fff.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "img_act_manycolor_kepler.cuh" +namespace megdnn { +namespace cuda { + +IMG_MANY_COLOR_K(false, false, false) + + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler_ftf.cu b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler_ftf.cu new file mode 100644 index 00000000..c90d94b2 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler_ftf.cu @@ -0,0 +1,39 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_manycolor_kepler_ftf.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "img_act_manycolor_kepler.cuh" +namespace megdnn { +namespace cuda { + +IMG_MANY_COLOR_K(false, true, false) + + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_medium_color.cu b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_medium_color.cu new file mode 100644 index 00000000..5a010460 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_medium_color.cu @@ -0,0 +1,47 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_medium_color.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "img_act_medium_color.cuh" + +namespace megdnn { +namespace cuda { + +IMG_MED_COLOR_K(false, false, false) +//IMG_MED_COLOR_K(false, false, true) +IMG_MED_COLOR_K(false, true, false) +//IMG_MED_COLOR_K(false, true, true) + +//IMG_MED_COLOR_K(true, false, false) +//IMG_MED_COLOR_K(true, false, true) +//IMG_MED_COLOR_K(true, true, false) +//IMG_MED_COLOR_K(true, true, true) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_medium_color.cuh b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_medium_color.cuh new file mode 100644 index 00000000..e99b4e62 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_medium_color.cuh @@ -0,0 +1,227 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_medium_color.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "img_act_templates.cuh" + +namespace megdnn { +namespace cuda { +/* + * Block size: 16x16. + * blockIdx.x determines case in batches of 16*imgsPerThread, also color in batches of colorsPerThread. + * In essence, blockIdx.x.x = 1..numImages/(16*imgsPerThread) + * blockIdx.x.y = 1..numImgColors/colorsPerThread + * blockIdx.y determines 4x4 image region in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines pixel. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise + * targets: (numImageColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases. + * + * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false. + * 16 * imgsPerThread must be divisible by 32. + * numImageColors/numGroups must be divisible by colorsPerThread. + * + * This version loads 32 cases at a time, so it gets full coalescing on that load. + * It only loads 16 weights at a time, so those aren't fully coalesced. + * This version conserves shared memory by loading 16 filters at a time rather than 32. + * + * To be used when there are 4-16 color channels. 
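+ *
+ * [Editor's note] A minimal host-side sketch of the launch geometry this kernel
+ * expects (hypothetical names; the real dispatch code appears elsewhere in this
+ * commit and may differ). It simply mirrors the index decomposition performed at
+ * the top of the kernel body:
+ *
+ *     dim3 threads(16, 16);
+ *     int numImgBlocks   = DIVUP(numImages, 16 * imgsPerThread);
+ *     int numColorBlocks = numImgColors / colorsPerThread;
+ *     dim3 blocks(numImgBlocks * numColorBlocks,
+ *                 DIVUP(imgSizeY, 4) * DIVUP(imgSizeX, 4));
+ *
+ * so blockIdx.x % numImgBlocks picks the batch of 16*imgsPerThread images,
+ * blockIdx.x / numImgBlocks picks the batch of colorsPerThread colors, and
+ * blockIdx.y picks one 4x4 region of the target image.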
+ */ +template +__global__ void img_acts_mediumcolor(const float* hidActs, const float* filters, float* targets, + const int numModulesY, const int numModulesX, const int numImages, const int numFilters, + const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, + const int moduleStride, const int numImgColors, const int numGroups, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shFilters[colorsPerThread*16][16 + 1]; + __shared__ float shHidActs[16][16*imgsPerThread]; + fill_shared_mem((float *)shFilters, sizeof(shFilters)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int numImgBlocks = DIVUP(numImages,16*imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * 16*imgsPerThread; + + const int imgColorIdx = (blockIdx.x / numImgBlocks) * colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int numRegionsX = DIVUP(imgSizeX, 4); + const int blockRegionIdx = blockIdx.y; + const int blockRegionIdxX = blockRegionIdx % numRegionsX; + const int blockRegionIdxY = blockRegionIdx / numRegionsX; + const int blockRegionLeft = blockRegionIdxX * 4; + const int blockRegionTop = blockRegionIdxY * 4; + const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4; + const int pxY = blockRegionTop + pxYInRegion; + const int pxX = blockRegionLeft + pxXInRegion; + const int pxIdx = pxY * imgSizeX + pxX; + const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX; + const unsigned int numModules = numModulesY * numModulesX; + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + const int tidx = threadIdx.y * 16 + threadIdx.x; + const int loadY = tidx / 32, loadX = tidx % 32; + + hidActs += blockCaseIdx + (blockFilterIdx + loadY) * numImages * numModules + loadX; + filters += blockFilterIdx + filterColorIdx * filterPixels * numFilters + threadIdx.x; + targets += imgColorIdx * imgPixels * numImages + pxIdx * numImages + blockCaseIdx + threadIdx.x; + + float prod[colorsPerThread][imgsPerThread]; + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + const int startY = blockRegionTop - paddingStart < filterSize ? 0 + : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride; + const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride); + const int startX = blockRegionLeft - paddingStart < filterSize ? 
0 + : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride; + const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride); + + float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x]; + float* shHidActLoad = &shHidActs[loadY][loadX]; + + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInModuleY = pxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInModuleX = pxX - moduleLeft; + + const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize; + const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX; + + for (int f = 0; f < numFiltersPerGroup; f += 16) { // multipply with 16 filters at a time + // Now the threads split up into half-warps, and each half-warp decides if it's interested. + const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages]; + int hload_offset = blockFilterIdx + loadY + f; + #pragma unroll + for (int i = 0; i < imgsPerThread * 16; i += 32) { + if (!checkCaseBounds || blockCaseIdx + loadX + i < numImages) { + #pragma unroll + for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time. + if (hload_offset + j < numFilters) { + shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i]; + } else { + shHidActLoad[j * 16 * imgsPerThread + i] = 0; + } + } + } else { + #pragma unroll + for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time. + shHidActLoad[j * 16 * imgsPerThread + i] = 0; + } + } + } + + if (isPxInImg && isPxInModule) { + // This half-warp is interested, so it's going to load the weights from this module to its pixel. + + // Not fully coalesced read :( + // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much. + const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f] + : &filters[(moduleIdx * numFilterColors * filterPixels + pxIdxInModule) * numFilters + f]; + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + if (blockFilterIdx + threadIdx.x + f < numFilters) { + shFilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters]; + } else { + shFilterLoad[c * 16 * (16 + 1)] = 0; + } + } + } + + __syncthreads(); + // Do some actual computation + if (isPxInImg && isPxInModule) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int w = 0; w < 16; w++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16]; + } + } + } + } + __syncthreads(); + } + } + } + // Not fully coalesced write :(... 
shmem (and fully coalesced) version is actually slightly slower, though + if (isPxInImg) { + if (scale) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * imgPixels * numImages + i * 16] = scaleTargets * targets[c * imgPixels * numImages + i * 16] + scaleOutputs * prod[c][i]; + } + } + } + } else { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * imgPixels * numImages + i * 16] = scaleOutputs * prod[c][i]; + } + } + } + } + } +} + +#define IMG_MED_COLOR_K_HEAD template __global__ void img_acts_mediumcolor +#define IMG_MED_COLOR_K(scale, ckCase, conv) \ + IMG_MED_COLOR_K_HEAD< 8, 4, scale, ckCase, conv >(MED_COLOR_KEP_PARAM); \ + IMG_MED_COLOR_K_HEAD< 4, 4, scale, ckCase, conv >(MED_COLOR_KEP_PARAM); \ + IMG_MED_COLOR_K_HEAD< 2, 4, scale, ckCase, conv >(MED_COLOR_KEP_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_templates.cuh b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_templates.cuh new file mode 100644 index 00000000..11d0fdaf --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_templates.cuh @@ -0,0 +1,161 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/img_acts/img_act_templates.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "../nvmatrix.cuh" +#include "../cudaconv2.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +#define MANYCOLOR_KEP_PARAM const float* hidActs, \ + const float* filters, float* targets, \ + const int numModulesY, const int numModulesX, \ + const int numImages, const int numFilters, \ + const int filterSize, const int imgSizeY, \ + const int imgSizeX, const int paddingStart, \ + const int moduleStride, \ + const int numImgColors, const int numGroups, \ + const float scaleTargets, const float scaleOutputs + +/* + * Block size: B_YxB_X. + * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in batches of B_Y*colorsPerThread. 
+ * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread) + * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread) + * blockIdx.y determines image pixel in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines color. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise + * targets: (numImageColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from B_X*imgsPerThread cases. + * + * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false. + * numFiltersPerGroup must be divisible by filterCacheF. + * + * numFilterColors must be divisible by B_Y*colorsPerThread. + * B_X*B_Y must be divisible by filterCacheF + * filterCacheF must be divisible by filterCacheH + * + * This version loads 32 cases at a time, so it gets full coalescing on that load. + * It only loads filterCacheF weights at a time, so those aren't fully coalesced (depending on size of filterCacheF). + * + * To be used when there are >= 16 color channels. + */ +template +__global__ void conv_img_acts_manycolor_kepler(MANYCOLOR_KEP_PARAM); + + + +#define MED_COLOR_KEP_PARAM const float* hidActs, \ + const float* filters, float* targets, \ + const int numModulesY, const int numModulesX, \ + const int numImages, const int numFilters, \ + const int filterSize, \ + const int imgSizeY, const int imgSizeX, \ + const int paddingStart, const int moduleStride, \ + const int numImgColors, const int numGroups, \ + const float scaleTargets, const float scaleOutputs +/* + * Block size: 16x16. + * blockIdx.x determines case in batches of 16*imgsPerThread, also color in batches of colorsPerThread. + * In essence, blockIdx.x.x = 1..numImages/(16*imgsPerThread) + * blockIdx.x.y = 1..numImgColors/colorsPerThread + * blockIdx.y determines 4x4 image region in target image. + * + * threadIdx.x determines case. + * threadIdx.y determines pixel. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numFilterColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise + * targets: (numImageColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases. + * + * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false. + * 16 * imgsPerThread must be divisible by 32. + * numImageColors/numGroups must be divisible by colorsPerThread. + * + * This version loads 32 cases at a time, so it gets full coalescing on that load. + * It only loads 16 weights at a time, so those aren't fully coalesced. + * This version conserves shared memory by loading 16 filters at a time rather than 32. + * + * To be used when there are 4-16 color channels. + */ +template +__global__ void img_acts_mediumcolor(MED_COLOR_KEP_PARAM); + + +#define COLOR_KEP_PARAM const float* hidActs, \ + const float* filters, float* targets, \ + const int numModulesY, const int numModulesX, \ + const int numImages, const int numFilters, \ + const int filterSize, \ + const int imgSizeY, const int imgSizeX, \ + const int paddingStart, const int moduleStride, \ + const float scaleTargets, const float scaleOutputs + +/* + * Block size: 16x16. + * blockIdx.x determines case in batches of 16*imgsPerThread. + * blockIdx.y determines 4x4 image region in target image. 
+ * + * threadIdx.x determines case. + * threadIdx.y determines pixel. + * + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * filters: (numColors, filterPixels, numFilters) if conv + * (numModulesY, numModulesX, numColors, filterPixels, numFilters) otherwise + * targets: (numColors, imgSizeY, imgSizeX, numImages) + * + * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases. + * + * Number of filters must be divisible by 16. + * Number of images must be divisible by 16*imgsPerThread if checkCaseBounds is false. + * 16 * imgsPerThread must be divisible by 32. + * + * This version loads 32 cases at a time, so it gets full coalescing on that load. + * It only loads 16 weights at a time, so those aren't fully coalesced. + * This version conserves shared memory by loading 16 filters at a time rather than 32. + */ +template +__global__ void img_acts_color(COLOR_KEP_PARAM); + +} // namespace megdnn +} // namespace cuda diff --git a/dnn/src/cuda/local/cuda-convnet2/nvmatrix.cuh b/dnn/src/cuda/local/cuda-convnet2/nvmatrix.cuh new file mode 100644 index 00000000..0e8c66c7 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/nvmatrix.cuh @@ -0,0 +1,131 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/nvmatrix.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ + +#pragma once +#include "src/cuda/utils.cuh" +#include + +namespace megdnn { +namespace cuda { + +const int TEXTURE_SIZE_MAX = 1<<29; + +struct MemorySegment { + float *data; + MemorySegment(float *data): data(data) + {} +}; + +struct NVMatrix { + NVMatrix(MemorySegment *seg, int row, int col): + seg(seg), row(row), col(col), stride(col), _texObj(0) + { + } + NVMatrix(MemorySegment *seg, int row, int col, int stride): + seg(seg), row(row), col(col), stride(stride), _texObj(0) + { + } + float *getDevData() + { + return seg->data; + } + MemorySegment *seg; + int row, col, stride; + cudaTextureObject_t _texObj; + // target must be initialized before transpose. 
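+    // [Editor's note] A minimal usage sketch for the wrapper below (illustrative
+    // only; `handle`, `d_src` and `d_dst` are hypothetical, and it assumes the
+    // cuBLAS handle uses CUBLAS_POINTER_MODE_HOST so host scalars can be passed):
+    //
+    //     float one = 1.f, zero = 0.f;
+    //     MemorySegment segA(d_src), segAT(d_dst);     // device float buffers
+    //     NVMatrix a(&segA, rows, cols);
+    //     NVMatrix at(&segAT, cols, rows);             // pre-sized transposed target
+    //     a.transpose(at, handle, &one, &zero);        // cublasSgeam: C = 1*A^T + 0*B^T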
+ void transpose(const NVMatrix &target, cublasHandle_t handle, + float *one, float *zero) + { + cublas_check(cublasSgeam(handle, + CUBLAS_OP_T, CUBLAS_OP_T, + row, col, + one, + seg->data, this->stride, + zero, + seg->data, this->stride, + target.seg->data, target.stride)); + } + cudaTextureObject_t getTextureObject() { + if (_texObj == 0) { + struct cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = getDevData(); + resDesc.res.linear.sizeInBytes = getNumDataBytes(); + resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, + cudaChannelFormatKindFloat); + struct cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + cuda_check(cudaCreateTextureObject(&_texObj, &resDesc, &texDesc, NULL)); + } + megdnn_assert_internal(_texObj != 0); + return _texObj; + } + ~NVMatrix() + { + if (_texObj) { + cuda_check(cudaDestroyTextureObject(_texObj)); + } + } + int getNumDataBytes() + { + return row * col * sizeof(float); + } + int getNumRows() + { + return row; + } + int getNumCols() + { + return col; + } + int getStride() + { + return stride; + } + bool isTrans() + { + return false; + } + bool isContiguous() + { + return true; + } + void resize(int row, int col) + { + megdnn_assert_internal(row * col == this->row * this->col); + this->row = row; + this->col = col; + } +}; + +} +} diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts.cu new file mode 100644 index 00000000..99fe7465 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts.cu @@ -0,0 +1,1708 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ + +#include "cudaconv2.cuh" + +#include "nvmatrix.cuh" +#include "weight_acts/wet_act_templates.cuh" +#include + +#ifdef _WIN32 +#define _Pragma(x) +#endif + +namespace megdnn { +namespace cuda { + +__device__ __forceinline__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + const int my, const int mx, const int paddingStart, const int numModulesX, const int moduleStride, + const int blockPixelY, const int blockPixelX, const int imgSizeX, + const int imgStride, int& pixIdx, int& m) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image + const int pxX = imgLoadModPosX + blockPixelX; + pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image + m = my * numModulesX + mx; +} + + +#define WA_C3_LOOP(pp, c) _Pragma("unroll") \ +for (int i = 0; i < preloadCases; i++) { \ + _Pragma("unroll") \ + for (int p = 0; p < pixelCache; p++) { \ + _Pragma("unroll") \ + for (int f = 0; f < filtersPerThread; f++) { \ + prod[c][(pp) + p][f] += shImages[threadIdx.y + p * B_Y + (c) * pixelCache * B_Y][i] * shHidActs[threadIdx.x * filtersPerThread + f][i]; \ + } \ + } \ +} + +#define WA_C3_LOOP2(pp) _Pragma("unroll") \ +for (int p = 0; p < pixelCache; p++) { \ + _Pragma("unroll") \ + for (int i = 0; i < preloadCases; i++) { \ + _Pragma("unroll") \ + for (int f = 0; f < filtersPerThread; f++) { \ + _Pragma("unroll") \ + for (int c = 0; c < 3; ++c) { \ + prod[c][(pp) + p][f] += shImages[threadIdx.y + p * B_Y + (c) * pixelCache * B_Y][i] * shHidActs[threadIdx.x * filtersPerThread + f][i]; \ + } \ + } \ + } \ +} + +#define WA_3_FIDX(y) (((loadY + (y)*B_X*B_Y/preloadCases) % filtersPerThread) * B_X + (loadY + (y)*B_X*B_Y/preloadCases) / filtersPerThread) + + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters + * threadIdx.x determines filter + * threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... + * so the compiler is messing up here somehow. It's unable to optimize that case away. 
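+ *
+ * [Editor's note] A worked example of the blockIdx.x decomposition used below,
+ * with hypothetical sizes: numFilters = 256 and B_X * filtersPerThread = 128 give
+ * numFilterBlocks = 2; numModulesX = numModulesY = 12 with sumWidth = 4 gives
+ * DIVUP(12, 4) = 3 module chunks per row, 9 chunks in total, so gridDim.x = 9 * 2 = 18.
+ * For blockIdx.x = 7: blockModuleChunkIdx = 7 / 2 = 3, blockFilterIdx = 128 * (7 % 2) = 128,
+ * blockModuleChunkX = 3 % 3 = 0 and blockModuleChunkY = 3 / 3 = 1, so this block sums the
+ * weight gradient over modules my in [4, 8) and mx in [0, 4) for filters 128..255 and
+ * writes its partial sums into the targets slice selected by blockModuleChunkIdx = 3.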
+ */ +template +//__launch_bounds__(256,2) +__global__ void conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, const int imgStride, + const int sumWidth, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels + __shared__ float shHidActs[B_X * filtersPerThread][preloadCases + 1]; // preload preloadCases cases of B_X hidActs + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X*filtersPerThread); + + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); +// const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = B_X * filtersPerThread* (blockIdx.x % numFilterBlocks); + +// const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + const int imgOffset = loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + loadX; +// images += loadX; +// hidActs += blockFilterIdx * numImages * numModules +// + loadX; + + targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + //float* shImgLoad = &shImages[loadY][loadX]; + //float* shHidActLoad = &shHidActs[loadY][loadX]; + + float prod[numColors][pixelsPerThread][filtersPerThread]; + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + const int mStartX = blockModuleStartX; + const int mStartY = blockModuleStartY; + const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); + const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); + + const bool doWork = mStartY < mEndY && mStartX < mEndX; +// if (!doWork) { +// hidActs -= +// } +// if (mStartY == mEndY || mStartX == mEndX) { +// return; +// } + +// float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12] + float haPreload[filtersPerThread * preloadCases / B_Y]; // [8] +// if (blockIdx.x != 0 || blockIdx.y !=0) { +// return; +// } +// printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX, mStartY, mEndX, mEndY); + const int fYOff = (blockPixelOffset + tidx) / filterSize; + const int fXOff = (blockPixelOffset + tidx) % filterSize; + __shared__ int pxIdxes[B_Y*pixelsPerThread]; + 
fill_shared_mem((int *)pxIdxes, sizeof(pxIdxes)/sizeof(int), 0); + __syncthreads(); +// __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [8] + + int m = mStartY * numModulesX + mStartX; + + int fidx[filtersPerThread * preloadCases / B_Y]; + if (doWork) { + #pragma unroll + for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) { + const int fIdx = WA_3_FIDX(y); +// if (doWork) { + haPreload[y] = tex1Dfetch(hidActs, hidActsOffset + fIdx * numImages * numModules + m * numImages); +// } + fidx[y] = fIdx * numImages * numModules; + } + } + + for (int my = mStartY; my < mEndY; my++) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + for (int mx = mStartX; mx < mEndX; mx++) { + m = my * numModulesX + mx; + +// __syncthreads(); + const int imgLoadModPosX = paddingStart + mx * moduleStride; + if (tidx < B_Y * pixelsPerThread) { +// const int imgLoadModPosY = paddingStart + my * moduleStride; +// const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxY = (imgLoadModPosY + fYOff); + const int pxX = (imgLoadModPosX + fXOff); + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX ? pixIdx : -1; + } + __syncthreads(); + + int myNext = my, mxNext = mx, mNext = m; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + mNext = myNext * numModulesX + mxNext; + } + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + const bool lastBatch = caseIdx + preloadCases == numImages; +// const float* im = &images[caseIdx + preloadCases + pixIdx]; +// const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; + int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages; + + if (lastBatch) { +// ha = &hidActs[mNext * numImages]; + hidActsOffset2 = hidActsOffset + mNext * numImages; + } + + #pragma unroll + for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { + shHidActs[loadY+y][loadX] = haPreload[y*preloadCases/(B_X*B_Y)]; + } + + /* ================================================================================== + * Iteration 0 + * ================================================================================== + */ + #pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; + } + } + #pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter + if (pxIdx + blockPixelOffset < filterPixels) { + const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; + if (pixIdx >= 0) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX] = tex1Dfetch(images, imgOffset + caseIdx + c * imgPixels * imgStride + pixIdx); + } + } + } + } + + __syncthreads(); + + haPreload[0] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[0]); + haPreload[1] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[1]); + WA_C3_LOOP(0,0); + haPreload[2] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[2]); + haPreload[3] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[3]); + WA_C3_LOOP(0,1); + haPreload[4] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[4]); + haPreload[5] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[5]); + WA_C3_LOOP(0,2); + haPreload[6] = 
tex1Dfetch(hidActs, hidActsOffset2 + fidx[6]); + haPreload[7] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[7]); + + __syncthreads(); + } + } + } + + if (scale) { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { +// if (threadIdx.x == 3) + targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters + * threadIdx.x determines filter + * threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... + * so the compiler is messing up here somehow. It's unable to optimize that case away. 
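+ *
+ * [Editor's note] Like the kernel above, the variant below overlaps its global
+ * loads with the shared-memory math: each thread keeps the next batch of hidActs
+ * values in the haPreload[] registers, fetched through the hidActs texture object,
+ * while the batch already staged in shHidActs/shImages is accumulated into prod[][].
+ * Schematically, for one caseIdx step (illustrative only, not the exact code below):
+ *
+ *     shHidActs[loadY + y][loadX] = haPreload[...];          // stage prefetched tile
+ *     __syncthreads();
+ *     haPreload[k] = tex1Dfetch<float>(hidActs,              // prefetch next tile
+ *                                      hidActsOffset2 + fidx[k]);
+ *     WA_C3_LOOP2(...);                                      // consume the staged tile
+ *     __syncthreads();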
+ */ +template +__launch_bounds__(256,2) +__global__ void conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, const int imgStride, + const int sumWidth, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels + __shared__ float shHidActs[B_X * filtersPerThread][preloadCases + 1]; // preload preloadCases cases of B_X hidActs + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X*filtersPerThread); + + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); +// const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = B_X * filtersPerThread* (blockIdx.x % numFilterBlocks); + +// const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + const int imgOffset = loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + loadX; +// images += loadX; +// hidActs += blockFilterIdx * numImages * numModules +// + loadX; + + targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + //float* shImgLoad = &shImages[loadY][loadX]; + //float* shHidActLoad = &shHidActs[loadY][loadX]; + + float prod[numColors][pixelsPerThread][filtersPerThread]; + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + const int mStartX = blockModuleStartX; + const int mStartY = blockModuleStartY; + const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); + const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); + + const bool doWork = mStartY < mEndY && mStartX < mEndX; +// if (mStartY == mEndY || mStartX == mEndX) { +// return; +// } + +// float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12] + float haPreload[filtersPerThread * preloadCases / B_Y]; // [6] +// if (blockIdx.x != 0 || blockIdx.y !=0) { +// return; +// } +// printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX, mStartY, mEndX, mEndY); + const int fYOff = (blockPixelOffset + tidx) / filterSize; + const int fXOff = (blockPixelOffset + tidx) % filterSize; + __shared__ int pxIdxes[B_Y*pixelsPerThread]; + fill_shared_mem((int *)pxIdxes, 
sizeof(pxIdxes)/sizeof(int), 0); + __syncthreads(); +// __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [6] + + int m = mStartY * numModulesX + mStartX; + int fidx[filtersPerThread * preloadCases / B_Y]; +// if (doWork) { + #pragma unroll + for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) { + fidx[y] = WA_3_FIDX(y) * numImages * numModules; + if (doWork) { // Not actually necessary, I think + haPreload[y] = tex1Dfetch(hidActs, hidActsOffset + fidx[y] + m * numImages); + } + } +// } + int mNext = mStartY * numModulesX + mStartX; + for (int my = mStartY; my < mEndY; my++) { +// const int imgLoadModPosY = paddingStart + my * moduleStride; + for (int mx = mStartX; mx < mEndX; mx++) { + m = mNext;//my * numModulesX + mx; + +// __syncthreads(); +// const int imgLoadModPosX = paddingStart + mx * moduleStride; + if (tidx < B_Y * pixelsPerThread) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxY = (imgLoadModPosY + fYOff); + const int pxX = (imgLoadModPosX + fXOff); + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX ? pixIdx : -1; + } + __syncthreads(); + + + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + mNext = lastModule * m + !lastModule * ((my + (mx + 1 == mEndX)) * numModulesX + (mx + 1 == mEndX ? mStartX : mx + 1)); +// if (!lastModule) { +// const int mxNext = mx + 1 == mEndX ? mStartX : mx + 1; +// const int myNext = my + (mx + 1 == mEndX); +// mNext = myNext * numModulesX + mxNext; +// } + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + const bool lastBatch = caseIdx + preloadCases == numImages; +// const float* im = &images[caseIdx + preloadCases + pixIdx]; +// const float* ha = hidActs + !lastBatch * (caseIdx + preloadCases + m * numImages) + lastBatch * mNext * numImages; + const int hidActsOffset2 = hidActsOffset + !lastBatch * (caseIdx + preloadCases + m * numImages) + lastBatch * mNext * numImages; +// if (lastBatch) { +// ha = &hidActs[mNext * numImages]; +// } + + #pragma unroll + for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { + shHidActs[loadY+y][loadX] = haPreload[y*preloadCases/(B_X*B_Y)]; + } + + /* ================================================================================== + * Iteration 0 + * ================================================================================== + */ + #pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; + } + } + } + #pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { + const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter + const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; + if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * 
B_Y][loadX] = tex1Dfetch(images, imgOffset + caseIdx + c * imgPixels * imgStride + pixIdx); + } + } + } + } + + __syncthreads(); + + haPreload[0] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[0]); + haPreload[1] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[1]); + haPreload[2] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[2]); + haPreload[3] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[3]); + haPreload[4] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[4]); + haPreload[5] = tex1Dfetch(hidActs, hidActsOffset2 + fidx[5]); + + WA_C3_LOOP2(0); + + __syncthreads(); + + /* ================================================================================== + * Iteration 1 + * ================================================================================== + */ + #pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { +// const int pxIdx = 2 * B_Y + loadY + y; // pixel idx in filter + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; + } + } + } + + #pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { + const int pxIdx = 2 * B_Y + loadY + y; // pixel idx in filter + const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; + if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX] = tex1Dfetch(images, imgOffset + caseIdx + c * imgPixels * imgStride + pixIdx); + } + } + } + } + + __syncthreads(); + + WA_C3_LOOP2(2); + + __syncthreads(); + + } + } + } + + if (scale) { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) + */ +template +__launch_bounds__(128, 4) +__global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, 
const int imgStride, + const int numImgColors, const int numGroups, const int sumWidth, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases + __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); +// const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + +// const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; +// images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + + loadX; +// +// hidActs += +// blockFilterIdx * numImages * numModules +// + loadY * numImages * numModules +// + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + + blockFilterIdx + + threadIdx.x; +// if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + +// if (mStartY == mEndY || mStartX == mEndX) { +// return; +// } +// const bool doWork = mStartY < mEndY && mStartX < mEndX; + + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = &shImages[loadY][loadX]; + + float imPreload[preloadCases*colorsPerThread/B_X]; // [8] + float haPreload[preloadCases*filtersPerThread/B_Y]; // [8] + + float prod[filtersPerThread][colorsPerThread]; + + #pragma unroll + for (int f = 0; f < 
filtersPerThread; f++) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[f][c] = 0; + } + } + int pixIdx, pixIdxNext, m, mNext; + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + mStartY, mStartX, paddingStart, numModulesX, moduleStride, + blockPixelY, blockPixelX, imgSizeX, imgStride, + pixIdx, m); + + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + // It's bizarre, but this is the fastest way I've found to get it not to load nonexistent pixels. + // All other ways cause crazy excessive register usage. + const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * imgPixels * imgStride + pixIdx); + imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch(images, imgOffset + idx); + } + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + // Almost certainly not necessary here. + const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * numImages * numModules + m * numImages); + haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch(hidActs, hidActsOffset + idx); + } + + + for (int my = mStartY; my < mEndY; my++) { + for (int mx = mStartX; mx < mEndX; mx++) { + int myNext = my, mxNext = mx; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + } + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + myNext, mxNext, paddingStart, numModulesX, moduleStride, + blockPixelY, blockPixelX, imgSizeX, imgStride, + pixIdxNext, mNext); + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + shImgLoad[(y) * preloadCases] = imPreload[y * preloadCases / (B_X * B_Y)]; + } +// const float* im = &images[caseIdx + preloadCases + pixIdx]; +// const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; + int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; + int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages; + if (caseIdx + preloadCases == numImages) { + pixIdx = pixIdxNext; + m = mNext; + imgOffset2 = imgOffset + pixIdxNext; + hidActsOffset2 = hidActsOffset + mNext * numImages; + } + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + shHidActLoad[y * (preloadCases + 1)] = haPreload[y * preloadCases / (B_X * B_Y)]; + } + + __syncthreads(); + + #pragma unroll + for (int z = 0; z < 8; ++z) { + WA_IMLOAD_TX(z); + WA_LOOP2(z); + } + + #pragma unroll + for (int z = 0; z < 8; ++z) { + WA_HALOAD_TX(z); + WA_LOOP2(z+8); + } + __syncthreads(); + } + } + } + + if (scale) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[f][c]; + } + } + } else { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[f][c]; + } + } + } +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, 
numFilters) + */ +template +__launch_bounds__(256, 2) +__global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, const int imgStride, + const int numImgColors, const int numGroups, const int sumWidth, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases + __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); +// const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + +// const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + + const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + + loadX; +// images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; +// +// hidActs += +// blockFilterIdx * numImages * numModules +// + loadY * numImages * numModules +// + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + + blockFilterIdx + + threadIdx.x; +// if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + +// if (mStartY == mEndY || mStartX 
== mEndX) { +// return; +// } + const bool doWork = mStartY < mEndY && mStartX < mEndX; + + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = &shImages[loadY][loadX]; + + float imPreload[preloadCases*colorsPerThread/B_X]; // [6] + float haPreload[preloadCases*filtersPerThread/B_Y]; // [16] + + float prod[filtersPerThread][colorsPerThread]; + + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[f][c] = 0; + } + } + int pixIdx, pixIdxNext, m, mNext; + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + mStartY, mStartX, paddingStart, numModulesX, moduleStride, + blockPixelY, blockPixelX, imgSizeX, imgStride, + pixIdx, m); + + if (doWork) { + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch(images, imgOffset + y * imgPixels * imgStride + pixIdx); + } + + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch(hidActs, hidActsOffset + y * numImages * numModules + m * numImages); + } + } +// if (mStartY > mEndY || mStartX > mEndX) { +// printf("crzy!!\n"); +// } + + for (int my = mStartY; my < mEndY; my++) { + for (int mx = mStartX; mx < mEndX; mx++) { + int myNext = my, mxNext = mx; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + } + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + myNext, mxNext, paddingStart, numModulesX, moduleStride, + blockPixelY, blockPixelX, imgSizeX, imgStride, + pixIdxNext, mNext); + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + shImgLoad[(y) * preloadCases] = imPreload[y * preloadCases / (B_X * B_Y)]; + } + + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + shHidActLoad[y * (preloadCases + 1)] = haPreload[y * preloadCases / (B_X * B_Y)]; + } + + __syncthreads(); + +// const float* im = &images[caseIdx + preloadCases + pixIdx]; +// const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; + int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; + int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages; + if (caseIdx + preloadCases == numImages) { + pixIdx = pixIdxNext; + m = mNext; + imgOffset2 = imgOffset + pixIdxNext; + hidActsOffset2 = hidActsOffset + mNext * numImages; + } + + WA_LOOP(0); + WA_LOOP(1); + WA_LOOP(2); + WA_LOOP(3); + WA_LOOP(4); + + WA_LOOP(5); + WA_IMLOAD_TX(0); + WA_LOOP(6); + WA_IMLOAD_TX(1); + WA_LOOP(7); + WA_IMLOAD_TX(2); + WA_LOOP(8); + WA_IMLOAD_TX(3); + WA_LOOP(9); + WA_IMLOAD_TX(4); + WA_LOOP(10); + WA_IMLOAD_TX(5); + + WA_LOOP(11); + WA_HALOAD_TX(0); + WA_LOOP(12); + WA_HALOAD_TX(1); + WA_LOOP(13); + WA_HALOAD_TX(2); + WA_LOOP(14); + WA_HALOAD_TX(3); + WA_LOOP(15); + WA_HALOAD_TX(4); + WA_LOOP(16); + WA_HALOAD_TX(5); + WA_LOOP(17); + WA_HALOAD_TX(6); + WA_LOOP(18); + WA_HALOAD_TX(7); + WA_LOOP(19); + WA_HALOAD_TX(8); + WA_LOOP(20); + WA_HALOAD_TX(9); + WA_LOOP(21); + WA_HALOAD_TX(10); + WA_LOOP(22); + WA_HALOAD_TX(11); + WA_LOOP(23); + WA_HALOAD_TX(12); + WA_LOOP(24); + WA_HALOAD_TX(13); + WA_LOOP(25); + WA_HALOAD_TX(14); + WA_LOOP(26); + WA_HALOAD_TX(15); + + 
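+                // Note on the interleaving above (added description; WA_LOOP / WA_IMLOAD_TX /
+                // WA_HALOAD_TX are the macros defined earlier in this file): this is software
+                // pipelining. Each WA_LOOP(i) consumes case column i of the shared tiles
+                // (shImages / shHidActs) that were filled at the top of this caseIdx iteration,
+                // while WA_IMLOAD_TX(0..5) and WA_HALOAD_TX(0..15) issue the texture fetches for
+                // the next preloadCases-wide tile into the imPreload / haPreload registers.
+                // The trailing WA_LOOP(27)..WA_LOOP(31) below have no loads left to overlap, so
+                // they simply finish the current tile before the __syncthreads() that guards the
+                // shared-memory refill.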
WA_LOOP(27); + WA_LOOP(28); + WA_LOOP(29); + WA_LOOP(30); + WA_LOOP(31); + + __syncthreads(); + } + } + } + + if (scale) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[f][c]; + } + } + } else { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[f][c]; + } + } + } +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) + */ +template +__launch_bounds__(256, 2) +__global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cudaTextureObject_t images, cudaTextureObject_t hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, const int imgStride, + const int numImgColors, const int numGroups, const int sumWidth, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases + __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); +// const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + +// const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; +// images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int hidActsOffset = blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + + loadX; +// +// hidActs 
+= +// blockFilterIdx * numImages * numModules +// + loadY * numImages * numModules +// + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + + blockFilterIdx + + threadIdx.x; +// if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + + const bool doWork = mStartY < mEndY && mStartX < mEndX; + + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = &shImages[loadY][loadX]; + + float imPreload[preloadCases*colorsPerThread/B_X]; // [4] + float haPreload[preloadCases*filtersPerThread/B_Y]; // [8] + + float prod[filtersPerThread][colorsPerThread]; + + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[f][c] = 0; + } + } + int pixIdx, pixIdxNext, m, mNext; + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + mStartY, mStartX, paddingStart, numModulesX, moduleStride, + blockPixelY, blockPixelX, imgSizeX, imgStride, + pixIdx, m); + + if (doWork && loadY < B_Y * colorsPerThread) { + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch(images, imgOffset + y * imgPixels * imgStride + pixIdx); + } + } + + if (doWork && loadY < B_X * filtersPerThread) { + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch(hidActs, hidActsOffset + y * numImages * numModules + m * numImages); + } + } + + for (int my = mStartY; my < mEndY; my++) { + for (int mx = mStartX; mx < mEndX; mx++) { + int myNext = my, mxNext = mx; + const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; + + if (!lastModule) { + mxNext = mx + 1 == mEndX ? 
mStartX : mx + 1; + myNext = my + (mx + 1 == mEndX); + } + + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + myNext, mxNext, paddingStart, numModulesX, moduleStride, + blockPixelY, blockPixelX, imgSizeX, imgStride, + pixIdxNext, mNext); + + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + +// const float* im = &images[caseIdx + preloadCases + pixIdx]; + int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; + int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages; +// const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; + + if (caseIdx + preloadCases == numImages) { + pixIdx = pixIdxNext; + m = mNext; +// im = &images[pixIdxNext]; + imgOffset2 = imgOffset + pixIdxNext; + hidActsOffset2 = hidActsOffset + mNext * numImages; + +// ha = &hidActs[mNext * numImages]; + } + + if (loadY < B_Y * colorsPerThread) { + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + shImgLoad[(y) * preloadCases] = imPreload[y * preloadCases / (B_X * B_Y)]; + } + } + + if (loadY < B_X * filtersPerThread) { + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + shHidActLoad[y * (preloadCases + 1)] = haPreload[y * preloadCases / (B_X * B_Y)]; + } + } + + __syncthreads(); + + WA_LOOP(0); + WA_IMLOAD_TX(0); + WA_LOOP(1); + WA_IMLOAD_TX(1); + WA_LOOP(2); + WA_IMLOAD_TX(2); + WA_LOOP(3); + WA_IMLOAD_TX(3); + WA_LOOP(4); + WA_HALOAD_TX(0); + WA_LOOP(5); + WA_HALOAD_TX(1); + WA_LOOP(6); + WA_HALOAD_TX(2); + WA_LOOP(7); + WA_HALOAD_TX(3); + WA_LOOP(8); + WA_HALOAD_TX(4); + WA_LOOP(9); + WA_HALOAD_TX(5); + WA_LOOP(10); + WA_HALOAD_TX(6); + WA_LOOP(11); + WA_HALOAD_TX(7); + WA_LOOP(12); + WA_LOOP(13); + WA_LOOP(14); + WA_LOOP(15); + + __syncthreads(); + } + } + } + + if (scale) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[f][c]; + } + } + } else { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[f][c]; + } + } + } +} + +std::pair getWeightActsOutputSize(int numModulesY, int numModulesX, int numFilterColors, + int filterSize, int numFilters, int sumWidth) { + const int outputModuleChunksX = DIVUP(numModulesX, sumWidth); + const int outputModuleChunksY = DIVUP(numModulesY, sumWidth); + const int outputModuleChunks = outputModuleChunksX * outputModuleChunksY; + return std::pair(outputModuleChunks * numFilterColors * filterSize * filterSize, numFilters); +} + +/* + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModules, numImages) + * + * targets: (numModuleY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) + * + * TODO: you can get a slight speed boost for local non-convolutional units by writing special + * routines for partialSum = 1. But I dunno if the code duplication is worth it... + * + * Note: all of these convolution routines are optimized for the case when + * the number of images (i.e. the minibatch size) is a multiple of 128. + * Other batch sizes will work, but but I made no attempt whatsoever + * to make them work fast. 
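+ *
+ * Worked example of the output shape (illustrative sizes): with numModulesY = numModulesX = 8,
+ * sumWidth = 4, numFilterColors = 3, filterSize = 5 and numFilters = 64, getWeightActsOutputSize
+ * above gives DIVUP(8,4) * DIVUP(8,4) = 4 module chunks, so targets is resized to
+ * (4 * 3 * 5 * 5, 64) = (300, 64). Within each chunk the layout is
+ * (numFilterColors, filterPixels, numFilters), i.e. each chunk holds its own partial weight
+ * gradient.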
+ */ +void _weightActs(cudaStream_t stream, NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, + int numGroups, int sumWidth, float scaleTargets, float scaleOutput) { + int numFilterColors = numImgColors / numGroups; + int imgStride = images.getStride(); + int numImages = images.getNumCols(); + int imgPixels = images.getNumRows() / numImgColors; + int imgSizeX = imgPixels / imgSizeY; + int numModules = numModulesY * numModulesX; + int numFilters = hidActs.getNumRows() / numModules; + int numFiltersPerGroup = numFilters / numGroups; + + megdnn_assert_internal(numImgColors % numGroups == 0); + //megdnn_assert_internal(numFilters % (16*numGroups) == 0); + bool previous_limit = numFilters % (16*numGroups) == 0; + + megdnn_assert_internal(numGroups > 1 || (numImgColors > 0 /*&& (numImgColors <= 3 || numImgColors % 16 == 0)*/)); + previous_limit &= numImgColors % 16 == 0; + megdnn_assert_internal(numGroups == 1 || numFilterColors % 16 == 0); + + megdnn_assert_internal(imgSizeY * imgSizeX == imgPixels); + megdnn_assert_internal(images.getNumRows() == imgPixels * numImgColors); + + int filterPixels = filterSize * filterSize; + int outputModuleChunksX = DIVUP(numModulesX, sumWidth); + int outputModuleChunksY = DIVUP(numModulesY, sumWidth); + int outputModuleChunks = outputModuleChunksX * outputModuleChunksY; +// partialSum = partialSum == 0 ? numModules : partialSum; + +// megdnn_assert_internal(numModules % partialSum == 0); + megdnn_assert_internal(hidActs.getNumCols() == numImages); + + // These routines don't handle the case when only part of the image is visited in the convolution + megdnn_assert_internal(paddingStart <= 0); + megdnn_assert_internal(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX); + megdnn_assert_internal(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY); + megdnn_assert_internal(moduleStride <= filterSize); + + megdnn_assert_internal(numModules * numFilters == hidActs.getNumRows()); + + megdnn_assert_internal(!images.isTrans()); + megdnn_assert_internal(!hidActs.isTrans()); + megdnn_assert_internal(hidActs.isContiguous()); + + megdnn_assert_internal(!targets.isTrans()); + megdnn_assert_internal(targets.isContiguous()); + + int preloadCases = 32; + + dim3 blocks, threads; + int bx, by; + int pixelsPerThread = 0, filtersPerThread = 0, colorsPerThread = 0; + // Worth playing with these parameters to find best values for your problem. + // These values work relatively well, but not optimal for all problems. + if (numFilterColors > 3) { + filtersPerThread = numFiltersPerGroup % 64 == 0 ? 4 + : numFiltersPerGroup % 32 == 0 ? 2 + : 1; + colorsPerThread = numFilterColors % 64 == 0 ? 8 + : numFilterColors % 48 == 0 ? 6 + : numFilterColors % 32 == 0 ? 8 + : 4; + by = (numFilterColors / colorsPerThread) % 8 == 0 ? 8 : 4; + bx = numFiltersPerGroup % 128 == 0 ? 32 : 16; + preloadCases = filtersPerThread * colorsPerThread < 32 ? 32 : 16; + blocks = dim3(outputModuleChunks * DIVUP(numFilters,bx*filtersPerThread), DIVUP(numFilterColors, (by*colorsPerThread)), filterPixels); + + //megdnn_assert_internal(numFilterColors % (by*colorsPerThread) == 0); + previous_limit &= numFilterColors % (by*colorsPerThread) == 0; + + } else { // This is ugly but it's nice to spell it out clearly + megdnn_assert_internal(numGroups == 1); // Just for sanity + // NOTE: these things are only optimized for colors = 3. I didn't really test other cases. 
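+        // Illustrative example for this numFilterColors <= 3 path: with numFilters = 64 and
+        // filterSize = 3, the first branch below selects filtersPerThread = 4, pixelsPerThread = 2,
+        // a 16x16 thread block and preloadCases = 32, so blocks =
+        // dim3(outputModuleChunks * DIVUP(64, 16*4), DIVUP(9, 16*2)) = dim3(outputModuleChunks, 1).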
+ if (numFilters % 64 == 0) { // TODO: having a separate case for 128 would make things faster, but I probably don't care about 128 + filtersPerThread = 4; + pixelsPerThread = 2; + by = 16; + bx = 16; + preloadCases = 32; + } else if (numFilters % 48 == 0) { + filtersPerThread = 3; + pixelsPerThread = 4; + by = 16; + bx = 16; + preloadCases = 32; + } else if (numFilters % 32 == 0) { + filtersPerThread = 2; + pixelsPerThread = 2; + by = 8; + bx = 16; + preloadCases = 16; + } else { // This case is completely untested. It might be really slow. But no time now. + filtersPerThread = 1; + pixelsPerThread = 16; + by = 16; + bx = 16; + preloadCases = 32; + } + blocks = dim3(outputModuleChunks * DIVUP(numFilters,bx*filtersPerThread), DIVUP(filterPixels, by*pixelsPerThread)); + } + megdnn_assert_internal((by * bx) % preloadCases == 0); + //megdnn_assert_internal(numFilters % (bx * filtersPerThread) == 0); + previous_limit &= numFilters % (bx * filtersPerThread) == 0; + + threads = dim3(bx, by); + bool checkCaseBounds = numImages % preloadCases != 0; + bool scale = scaleTargets != 0; + std::pair targetSize = getWeightActsOutputSize(numModulesY, numModulesX, numFilterColors, filterSize, numFilters, sumWidth); + if (!scale) { + targets.resize(targetSize.first, targetSize.second); + } else { + megdnn_assert_internal(targets.getNumRows() == targetSize.first); + megdnn_assert_internal(targets.getNumCols() == targetSize.second); + } + + if (scale == false) { + if (checkCaseBounds == false) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< 8, 32, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16< 8, 32, 4, 8, 16, false ><<>>(images.getTextureObject(), hidActs.getTextureObject(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } else { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< 8, 32, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFiltersPerGroup % 64 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< 8, 16, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16< 8, 16, 4, 8, 16, false ><<>>(images.getTextureObject(), hidActs.getTextureObject(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } else { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 8, 16, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, 
numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< 8, 32, 4, 6, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32< 8, 32, 4, 6, 32, false ><<>>(images.getTextureObject(), hidActs.getTextureObject(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } else { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 6, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 6, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 
== 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, false 
><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3 < 16, 16, 2, 2, 4, 32, 3, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3 < 16, 16, 2, 2, 4, 32, 3, false, false ><<>>(images.getTextureObject(), hidActs.getTextureObject(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } else { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 3, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 3, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFiltersPerGroup % 48 == 0) { + if (previous_limit) { + cudaFuncSetCacheConfig(conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3 < 16, 16, 2, 4, 3, 32, 3, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3 < 16, 16, 2, 4, 3, 32, 3, false, false ><<>>(images.getTextureObject(), hidActs.getTextureObject(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } else { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 3, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw< 16, 16, 2, 4, 3, 32, 3, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, false, false ><<>>(images.getDevData(), 
hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, false, false >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, false, false >,cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, false, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + } + } + } + else if 
(checkCaseBounds == true) { + if (numFilterColors > 3) { + if (numFilterColors % 64 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 8, 16, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 8, 16, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 8, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 8, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors % 48 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 6, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 32, 4, 6, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 4, 6, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 2, 6, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 
8, 16, 1, 6, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 8, 16, 1, 6, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors % 32 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 8, 16, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 8, 16, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 8, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 8, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors % 1 == 0) { + if (numFiltersPerGroup % 128 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 32, 4, 4, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 4, 4, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 2, 4, 32, false ><<>>(images.getDevData(), 
hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, false >, cudaFuncCachePreferShared); + conv_weight_acts_mc_mf_kepler_sw < 4, 16, 1, 4, 32, false ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, numImgColors, numGroups, sumWidth, scaleTargets, scaleOutput); + } + } + } + else if (numFilterColors <= 3) { + if (numFilterColors == 3) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 3, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 3, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 3, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 3, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 3, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 3, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors == 2) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 2, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 2, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, 
moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 2, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 2, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + } + else if (numFilterColors == 1) { + if (numFiltersPerGroup % 64 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 2, 4, 32, 1, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 48 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 4, 3, 32, 1, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 32 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 8, 16, 2, 2, 2, 16, 1, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + else if (numFiltersPerGroup % 1 == 0) { + cudaFuncSetCacheConfig(conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, false, true >, cudaFuncCachePreferShared); + conv_weight_acts_c_kepler_sw < 16, 16, 2, 16, 1, 32, 1, false, true ><<>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, sumWidth, scaleTargets, scaleOutput); + } + } + } + } + } + + getLastCudaError("weightActs: kernel execution failed"); +} + +void convWeightActs(cudaStream_t stream, NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, int numGroups, int partialSum) { + _weightActs(stream, images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, partialSum, 0, 1); +} + +void convWeightActs(cudaStream_t stream, NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, + int imgSizeY, int numModulesY, int 
numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, int numGroups, int partialSum, + float scaleTargets, float scaleOutput) { + _weightActs(stream, images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput); +} + +void localWeightActs(cudaStream_t stream, NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, int numGroups) { + _weightActs(stream, images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, 1, 0, 1); +} + +void localWeightActs(cudaStream_t stream, NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, + int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, + int numImgColors, int numGroups, float scaleTargets, float scaleOutput) { + _weightActs(stream, images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, 1, + scaleTargets, scaleOutput); +} + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_1_ff.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_1_ff.cu new file mode 100644 index 00000000..89ed3cee --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_1_ff.cu @@ -0,0 +1,42 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_1_ff.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
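Taken together with the _weightActs dispatcher that ends above, the small weight_acts_c_kepler_sw_by_* translation units introduced from here on follow one pattern: the dispatcher picks a conv_weight_acts_* specialization from the divisibility of numFilterColors and numFiltersPerGroup, sets its cache preference, and launches it, while each of these .cu files forces exactly one such specialization to be compiled, so the heavy template bodies in the .cuh headers build as independent objects that the launches simply link against. A minimal sketch of the same explicit-instantiation pattern, with hypothetical names (my_kernel and kernel.cuh are illustrative, not files in this tree):

    // kernel.cuh : template definition only, included by many .cu files
    template <int BLOCK, bool SCALE>
    __global__ void my_kernel(const float* in, float* out, int n, float alpha) {
        int i = blockIdx.x * BLOCK + threadIdx.x;
        if (i < n) out[i] = SCALE ? alpha * out[i] + in[i] : in[i];
    }

    // my_kernel_128_noscale.cu : one explicit instantiation per translation unit
    #include "kernel.cuh"
    template __global__ void my_kernel<128, false>(const float*, float*, int, float);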
+ * -------------------------------------------------------------------------- + */ +#include "wet_act_c_kepler_sw.cuh" + +namespace megdnn { +namespace cuda { + + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 1, false, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 1, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 1, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 1, true, true > (C_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_1_ft.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_1_ft.cu new file mode 100644 index 00000000..be22458b --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_1_ft.cu @@ -0,0 +1,42 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_1_ft.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_c_kepler_sw.cuh" + +namespace megdnn { +namespace cuda { + + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 1, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 1, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 1, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 1, true, true > (C_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_2_ff.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_2_ff.cu new file mode 100644 index 00000000..ff4a5eb2 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_2_ff.cu @@ -0,0 +1,47 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_2_ff.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_c_kepler_sw.cuh" + +namespace megdnn { +namespace cuda { + + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 2, false, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 2, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 2, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 2, true, true > (C_KEP_SW_PARAM); + + + // instead of preload + WET_ACT_C_KEPLER_SW_HEAD<16, 16, 2, 2, 4, 32, 3, false, false> (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD<16, 16, 2, 4, 3, 32, 3, false, false> (C_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_2_ft.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_2_ft.cu new file mode 100644 index 00000000..339bec62 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_2_ft.cu @@ -0,0 +1,42 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_2_ft.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
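The _ff / _ft suffixes on these files encode the last two boolean template arguments: each _ff file keeps the < ..., false, false > instantiation active, each _ft file keeps < ..., false, true >, and the scale == true variants stay commented out in both (presumably because this tree never launches them). Expanding the WET_ACT_C_KEPLER_SW_HEAD macro defined at the bottom of wet_act_c_kepler_sw.cuh, the active line of the c_1_ft file is just an explicit instantiation; C_KEP_SW_PARAM is assumed here to be the shared formal-parameter-list macro from wet_act_templates.cuh:

    // What WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 1, false, true > (C_KEP_SW_PARAM)
    // expands to:
    template __global__ void conv_weight_acts_c_kepler_sw
        < 16, 16, 2, 16, 1, 32, 1, false, true > (C_KEP_SW_PARAM);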
+ * -------------------------------------------------------------------------- + */ +#include "wet_act_c_kepler_sw.cuh" + +namespace megdnn { +namespace cuda { + + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 2, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 2, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 2, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 2, true, true > (C_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_3_ff.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_3_ff.cu new file mode 100644 index 00000000..5af43a1e --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_3_ff.cu @@ -0,0 +1,42 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_3_ff.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_c_kepler_sw.cuh" + +namespace megdnn { +namespace cuda { + + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 3, false, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 3, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 3, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 3, true, true > (C_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_3_ft.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_3_ft.cu new file mode 100644 index 00000000..fd6cd284 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_3_ft.cu @@ -0,0 +1,42 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_c_3_ft.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * -------------------------------------------------------------------------- + * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * * Refactor kernels for seperate compilation + * -------------------------------------------------------------------------- + */ +#include "wet_act_c_kepler_sw.cuh" + +namespace megdnn { +namespace cuda { + + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 3, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 3, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 3, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 16, 1, 32, 3, true, true > (C_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_f4.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_f4.cu new file mode 100644 index 00000000..c3ae3d26 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_f4.cu @@ -0,0 +1,47 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_f4.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ +#include "wet_act_c_kepler_sw.cuh" + +namespace megdnn { +namespace cuda { + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 1, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 1, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 1, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 1, true, true > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 2, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 2, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 2, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 2, true, true > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 3, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 2, 4, 32, 3, true, true > (C_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_pt_4.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_pt_4.cu new file mode 100644 index 00000000..a5de5b13 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_pt_4.cu @@ -0,0 +1,47 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_16_pt_4.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
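Reading the angle-bracket argument lists in these instantiations is easier with the parameter order of the upstream cuda-convnet2 kernels in mind; assuming that order carries over unchanged, it is < B_Y, B_X, pixelCache, pixelsPerThread, filtersPerThread, preloadCases, numColors, scale, checkCaseBounds >. The first active line of the ..._by_16_f4.cu file above then reads as follows (the annotations restate the constraints in the kernel's own doc comment):

    WET_ACT_C_KEPLER_SW_HEAD< 16,    // B_Y  : threads per block in y
                              16,    // B_X  : threads per block in x
                               2,    // pixelCache
                               2,    // pixelsPerThread
                               4,    // filtersPerThread (B_X * 4 = 64 filters per block)
                              32,    // preloadCases (cases staged per shared-memory load)
                               1,    // numColors (input channels)
                           false,    // scale: overwrite targets instead of blending with scaleTargets
                           false     // checkCaseBounds: numImages assumed divisible by preloadCases
                            > (C_KEP_SW_PARAM);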
+ * -------------------------------------------------------------------------- + */ +#include "wet_act_c_kepler_sw.cuh" +namespace megdnn { +namespace cuda { + + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 1, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 1, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 1, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 1, true, true > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 2, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 2, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 2, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 2, true, true > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 3, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 16, 16, 2, 4, 3, 32, 3, true, true > (C_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_8.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_8.cu new file mode 100644 index 00000000..d39fe509 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_8.cu @@ -0,0 +1,50 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/weight_acts_c_kepler_sw_by_8.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ +#include "wet_act_c_kepler_sw.cuh" + +namespace megdnn { +namespace cuda { + + WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 1, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 1, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 1, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 1, true, true > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 2, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 2, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 2, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 2, true, true > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 3, false, false > (C_KEP_SW_PARAM); + WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 3, false, true > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 3, true, false > (C_KEP_SW_PARAM); + //WET_ACT_C_KEPLER_SW_HEAD< 8, 16, 2, 2, 2, 16, 3, true, true > (C_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_c_kepler.cuh b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_c_kepler.cuh new file mode 100644 index 00000000..4ef3ef6a --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_c_kepler.cuh @@ -0,0 +1,233 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_c_kepler.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_templates.cuh" + +namespace megdnn { +namespace cuda { + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters + * threadIdx.x determines filter + * threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. 
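These constraints pin down the launch geometry the host side has to use; the actual grid/block setup lives in the dispatcher in weight_acts.cu, but the index math below implies something along these lines (a hedged reconstruction, not code from this tree):

    // dim3 threads(B_X, B_Y);
    // dim3 blocks((numModulesY * numModulesX / partialSum) * (numFilters / (B_X * filtersPerThread)),
    //             DIVUP(filterSize * filterSize, B_Y * pixelsPerThread));
    // blockIdx.x enumerates (module chunk of partialSum, filter batch of B_X * filtersPerThread) pairs;
    // blockIdx.y enumerates batches of B_Y * pixelsPerThread filter pixels.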
+ * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... + * so the compiler is messing up here somehow. It's unable to optimize that case away. + */ +template +__global__ void conv_weight_acts_c_kepler(float* images, float* hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, const int imgStride, + const int partialSum, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels + __shared__ float shHidActs[B_X * filtersPerThread][preloadCases + 1]; // preload preloadCases cases of B_X hidActs + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int filterBlocksPerModule = numFilters / (B_X*filtersPerThread); + const int outputModuleIdx = blockIdx.x / filterBlocksPerModule; + const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = B_X * filtersPerThread* (blockIdx.x % filterBlocksPerModule); + +// const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + + images += loadX; + hidActs += blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + + loadX; + + targets += (outputModuleIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + float prod[numColors][pixelsPerThread][filtersPerThread]; + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + + __shared__ int pxIdxes[B_Y*pixelsPerThread]; + fill_shared_mem((int *)pxIdxes, sizeof(pxIdxes)/sizeof(int), 0); + __syncthreads(); + //__shared__ bool isPxInImage[B_Y*pixelsPerThread]; + for (int m = moduleIdx; m < moduleIdx + partialSum; m++) { + + __syncthreads(); + if (tidx < B_Y * pixelsPerThread) { + const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride; + int pxY = (imgLoadModPosY + (blockPixelOffset + tidx) / filterSize); + int pxX = (imgLoadModPosX + (blockPixelOffset + tidx) % filterSize); + int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX ? 
pixIdx : -1; + //isPxInImage[tidx] = ; + } + __syncthreads(); + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + if (/*loadY < B_X*filtersPerThread &&*/ (!checkCaseBounds || caseIdx + loadX < numImages)) { + #pragma unroll + for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_X*filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X*filtersPerThread) { + shHidActs[loadY+y][loadX]= hidActs[caseIdx + y * numImages * numModules + m * numImages]; + } + } + } + #pragma unroll + for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) { + //if (loadY < B_Y * pixelCache) { // This condition is not necessary for correctness, but it speeds things a bit + /* + * As long as B_Y * B_X is divisible by preloadCases this will loop the right + * number of times. + * + * This will load some imgGrads from filter pixels that don't exit (it'll set those to 0), + * but the code does not produce any output for those pixels (see last lines). + */ + #pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { + const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter + + if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) { + const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; + + if (pixIdx >= 0) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX] = images[caseIdx + c * imgPixels * imgStride + pixIdx]; + } + } else { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; + } + } + } else { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX]= 0; + } + } + } + } + //} + + + __syncthreads(); + + #pragma unroll + for (int i = 0; i < preloadCases; i++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int p = 0; p < pixelCache; p++) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + prod[c][pp + p][f] += shImages[threadIdx.y + p * B_Y + c * pixelCache * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i]; + } + } + } + } + + __syncthreads(); + } + } + } + + if (scale) { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[p * B_Y * numFilters + c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } +} + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_c_kepler_sw.cuh 
b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_c_kepler_sw.cuh new file mode 100644 index 00000000..baba1458 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_c_kepler_sw.cuh @@ -0,0 +1,279 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_c_kepler_sw.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_templates.cuh" + +namespace megdnn { +namespace cuda { + +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters + * threadIdx.x determines filter + * threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... + * so the compiler is messing up here somehow. It's unable to optimize that case away. 
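Unlike the partialSum variant above, this kernel tiles the module grid into sumWidth x sumWidth chunks and lets one block accumulate the weight gradient over every module in its chunk. Read back from the index math below, the implied launch geometry is roughly as follows (a hedged reconstruction; the real setup is in the dispatcher):

    // int numFilterBlocks  = DIVUP(numFilters,  B_X * filtersPerThread);
    // int numModuleChunksX = DIVUP(numModulesX, sumWidth);
    // int numModuleChunksY = DIVUP(numModulesY, sumWidth);
    // dim3 threads(B_X, B_Y);
    // dim3 blocks(numModuleChunksY * numModuleChunksX * numFilterBlocks,
    //             DIVUP(filterSize * filterSize, B_Y * pixelsPerThread));
    // e.g. 12 x 12 modules with sumWidth = 4 give 3 * 3 = 9 chunks; with numFilters = 64,
    // B_X = 16, filtersPerThread = 4 there is a single filter block, so gridDim.x = 9,
    // and each chunk writes its own slice of targets.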
+ * To be used when numFilterColors <= 3 + */ +template +__global__ void conv_weight_acts_c_kepler_sw(float* images, float* hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, const int imgStride, + const int sumWidth, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shImages[pixelCache * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels + __shared__ float shHidActs[B_X * filtersPerThread][preloadCases + 1]; // preload preloadCases cases of B_X hidActs + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = DIVUP(numFilters, B_X*filtersPerThread); + + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); +// const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = B_X * filtersPerThread* (blockIdx.x % numFilterBlocks); + +// const int moduleStride = (imgSize - filterSize + 1) / numModulesX; + const int numModules = numModulesY * numModulesX; + + const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread; + + images += loadX; + hidActs += blockFilterIdx * numImages * numModules +// + loadY * numImages * numModules + + loadX; + + targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors + + blockPixelOffset * numFilters + + blockFilterIdx + + threadIdx.y * numFilters + threadIdx.x; + + //float* shImgLoad = &shImages[loadY][loadX]; + //float* shHidActLoad = &shHidActs[loadY][loadX]; + + float prod[numColors][pixelsPerThread][filtersPerThread]; + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][p][f] = 0; + } + } + } + const int mStartX = blockModuleStartX; + const int mStartY = blockModuleStartY; + const int mEndX = min(numModulesX, blockModuleStartX + sumWidth); + const int mEndY = min(numModulesY, blockModuleStartY + sumWidth); + +// if (mStartY == mEndY || mStartX == mEndX) { +// return; +// } + + const int fYOff = (blockPixelOffset + tidx) / filterSize; + const int fXOff = (blockPixelOffset + tidx) % filterSize; + __shared__ int pxIdxes[B_Y*pixelsPerThread]; + fill_shared_mem((int *)pxIdxes, sizeof(pxIdxes)/sizeof(int), 0); + __syncthreads(); + for (int my = mStartY; my < mEndY; my++) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + for (int mx = mStartX; mx < mEndX; mx++) { + const int m = my * numModulesX + mx; + + __syncthreads(); + const int imgLoadModPosX = paddingStart + mx * moduleStride; + if (tidx < B_Y * pixelsPerThread) { +// const int imgLoadModPosY = paddingStart + my * moduleStride; +// const int imgLoadModPosX = paddingStart 
+ mx * moduleStride; + int pxY = (imgLoadModPosY + fYOff); + int pxX = (imgLoadModPosX + fXOff); + int pixIdx = (pxY * imgSizeX + pxX) * imgStride; + pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX ? pixIdx : -1; + } + __syncthreads(); + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + if (//loadY < B_X*filtersPerThread && + (!checkCaseBounds || caseIdx + loadX < numImages)) { + #pragma unroll + for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { + const int fIdx = ((loadY + y) % filtersPerThread) * B_X + (loadY + y) / filtersPerThread; + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_X*filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || loadY+y < B_X*filtersPerThread) { + if (blockFilterIdx + fIdx < numFilters) { + shHidActs[loadY+y][loadX]= hidActs[caseIdx + (fIdx * numModules + m) * numImages]; + } else { + shHidActs[loadY+y][loadX] = 0; + } + } + } + } else { + #pragma unroll + for (int y = 0; y < B_X*filtersPerThread; y += (B_X * B_Y) / preloadCases) { + // const int fIdx = ((loadY + y) % filtersPerThread) * B_X + (loadY + y) / filtersPerThread; + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_X*filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || loadY+y < B_X*filtersPerThread) { + shHidActs[loadY+y][loadX] = 0; + } + } + } + #pragma unroll + for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) { + //if (loadY < B_Y * pixelCache) { // This condition is not necessary for correctness, but it speeds things a bit + // + //As long as B_Y * B_X is divisible by preloadCases this will loop the right + //number of times. + // + //This will load some imgGrads from filter pixels that don't exit (it'll set those to 0), + //but the code does not produce any output for those pixels (see last lines). 
+ // + #pragma unroll + for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelCache) { + const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter + + if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) { + const int pixIdx = pxIdxes[pxIdx];//(pxY * imgSizeX + pxX) * imgStride; + + if (pixIdx >= 0) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX] = images[caseIdx + c * imgPixels * imgStride + pixIdx]; + } + } else { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX] = 0; + } + } + } else { + #pragma unroll + for (int c = 0; c < numColors; c++) { + shImages[loadY+y + c * pixelCache * B_Y][loadX]= 0; + } + } + } + } + //} + + __syncthreads(); + + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int i = 0; i < preloadCases; i++) { + #pragma unroll + for (int p = 0; p < pixelCache; p++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (blockFilterIdx + threadIdx.x + f * B_X < numFilters) { + prod[c][pp + p][f] += shImages[threadIdx.y + (p + c * pixelCache) * B_Y][i] * shHidActs[threadIdx.x * filtersPerThread + f][i]; + } + } + } + } + } + + __syncthreads(); + } + } + } + } + if (scale) { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[(p * B_Y + c * filterPixels) * numFilters + f * B_X] = scaleTargets * targets[(p * B_Y + c * filterPixels) * numFilters + f * B_X] + scaleOutputs * prod[c][p][f]; + } + } + } + } + } else { + #pragma unroll + for (int p = 0; p < pixelsPerThread; p++) { + if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) { + #pragma unroll + for (int c = 0; c < numColors; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (blockFilterIdx + threadIdx.x + f * B_X < numFilters) { + targets[(p * B_Y + c * filterPixels) * numFilters + f * B_X] = scaleOutputs * prod[c][p][f]; + } + } + } + } + } + } +} + + +#define WET_ACT_C_KEPLER_SW_HEAD template __global__ void conv_weight_acts_c_kepler_sw + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler.cuh b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler.cuh new file mode 100644 index 00000000..b75ed25e --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler.cuh @@ -0,0 +1,201 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_templates.cuh" + +namespace megdnn { +namespace cuda { +/* + * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread colors and B_X * filtersPerThread filters + * threadIdx.x determines filter + * threadIdx.y determines color + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines color batch of B_Y * colorsPerThread + * blockIdx.z determines pixel in filter + * NOTE: blockIdx.z is limited to values < 2^16. This means that this routine will + * fail for filters >= 256*256. I'm assuming I won't ever use such large filters. + + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) + + * B_X * B_Y must be divisible by preloadCases + */ +template +__global__ void conv_weight_acts_mc_mf_kepler(float* images, float* hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, const int imgStride, + const int numImgColors, const int numGroups, const int partialSum, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases + __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int outputModuleIdx = blockIdx.x / numFilterBlocks; + const int moduleIdx = partialSum * outputModuleIdx; + const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * 
numFilterColors; + + images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + + hidActs += + blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + + loadX; + + targets += outputModuleIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + + blockFilterIdx + + threadIdx.x; + //if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = &shImages[loadY][loadX]; + float prod[colorsPerThread][filtersPerThread]; + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][f] = 0; + } + } + + for (int m = moduleIdx; m < moduleIdx + partialSum; m++) { + const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride; + const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image + const int pxX = imgLoadModPosX + blockPixelX; + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image + if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) { + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + // Checking this condition actually makes things faster ... :/ + // So I've removed the !checkCaseBounds flag and just check it all the time. + if (caseIdx + loadX < numImages) { + /* + * As long as B_Y * B_X is divisible by preloadCases this will loop the right + * number of times. + * + * This will load some images from filter pixels that don't exist (it'll set those to 0), + * but the code does not produce any output for those pixels (see last lines). 
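Stripped of the shared-memory staging, what this kernel accumulates per (color, filter, filter pixel) is the standard convolution weight gradient: the image value under the filter pixel times the corresponding output gradient, summed over images and over the partialSum modules owned by the block. A slow serial reference of that accumulation, useful only as a statement of intent (single group, scaleTargets == 0, targets assumed zero-initialized; not code from this tree):

    void weight_acts_reference(const float* images, const float* hidActs, float* targets,
                               int numImages, int numFilters,
                               int numModulesY, int numModulesX,
                               int imgSizeY, int imgSizeX, int filterSize,
                               int paddingStart, int moduleStride, int imgStride,
                               int numImgColors, int partialSum, float scaleOutput) {
        const int numModules   = numModulesY * numModulesX;
        const int filterPixels = filterSize * filterSize;
        const int imgPixels    = imgSizeY * imgSizeX;
        for (int m = 0; m < numModules; ++m) {
            const int chunk = m / partialSum;                  // which targets slice
            const int my = m / numModulesX, mx = m % numModulesX;
            for (int c = 0; c < numImgColors; ++c)
                for (int fy = 0; fy < filterSize; ++fy)
                    for (int fx = 0; fx < filterSize; ++fx) {
                        const int py = paddingStart + my * moduleStride + fy;
                        const int px = paddingStart + mx * moduleStride + fx;
                        if (py < 0 || py >= imgSizeY || px < 0 || px >= imgSizeX)
                            continue;                          // filter pixel falls off the image
                        for (int f = 0; f < numFilters; ++f) {
                            float sum = 0.f;
                            for (int i = 0; i < numImages; ++i) {
                                sum += images[(c * imgPixels + py * imgSizeX + px) * imgStride + i]
                                     * hidActs[(f * numModules + m) * numImages + i];
                            }
                            targets[((chunk * numImgColors + c) * filterPixels
                                     + fy * filterSize + fx) * numFilters + f] += scaleOutput * sum;
                        }
                    }
        }
    }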
+ */ + if (loadY < B_Y * colorsPerThread) { + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y*colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y*colorsPerThread) { + shImgLoad[(y) * preloadCases] = images[caseIdx + y * imgPixels * imgStride + pixIdx]; + } + } + } + + if (loadY < B_X * filtersPerThread) { + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + (y * numModules + m) * numImages]; + } + } + } + } else { + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y*colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y*colorsPerThread) { + shImgLoad[(y) * preloadCases] = 0; + } + } + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = 0; + } + } + } + + __syncthreads(); + #pragma unroll + for (int i = 0; i < preloadCases; i++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i]; + } + } + } + __syncthreads(); + } + } + } + if (scale) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][f]; + } + } + } else { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][f]; + } + } + } +} + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw.cuh b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw.cuh new file mode 100644 index 00000000..96f2f944 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw.cuh @@ -0,0 +1,280 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_templates.cuh" + +namespace megdnn { +namespace cuda { + +/* + * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread colors and B_X * filtersPerThread filters + * threadIdx.x determines filter + * threadIdx.y determines color + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines color batch of B_Y * colorsPerThread + * blockIdx.z determines pixel in filter + * NOTE: blockIdx.z is limited to values < 2^16. This means that this routine will + * fail for filters >= 256*256. I'm assuming I won't ever use such large filters. + + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/d, numFilterColors, filterPixels, numFilters) + + * B_X * B_Y must be divisible by preloadCases + * To be used when numFilterColors > 3 && numFilterColors % 16 == 0 + */ +template +__global__ void conv_weight_acts_mc_mf_kepler_sw(float* images, float* hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, const int imgStride, + const int numImgColors, const int numGroups, const int sumWidth, + const float scaleTargets, const float scaleOutputs) { + __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases + __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts + fill_shared_mem((float *)shImages, sizeof(shImages)/sizeof(float), 0); + fill_shared_mem((float *)shHidActs, sizeof(shHidActs)/sizeof(float), 0); + __syncthreads(); + + const int tidx = B_X * threadIdx.y + threadIdx.x; + const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + //const int numFilterBlocks = numFilters / (B_X * filtersPerThread); + const int numFilterBlocks = DIVUP(numFilters, (B_X * filtersPerThread)); + const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; + + const int numModuleChunksX = DIVUP(numModulesX, sumWidth); +// const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + + const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; + const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; + + const int blockModuleStartX = blockModuleChunkX * sumWidth; + const int blockModuleStartY = blockModuleChunkY * sumWidth; + + const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); + const int numModules = numModulesY * numModulesX; + 
+ const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + const int numFilterColors = numImgColors / numGroups; + + const int blockPixelOffset = blockIdx.z; // pixel idx in filter + const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; + const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; + const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; + + images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + + hidActs += + blockFilterIdx * numImages * numModules + + loadY * numImages * numModules + + loadX; + + targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors + + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + blockPixelOffset * numFilters + + blockFilterIdx + + threadIdx.x; + + //if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); + const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); + const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); + const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + +// if (mStartY == mEndY || mStartX == mEndX) { +// return; +// } + + float* shHidActLoad = &shHidActs[loadY][loadX]; + float* shImgLoad = &shImages[loadY][loadX]; + float prod[colorsPerThread][filtersPerThread]; + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + prod[c][f] = 0; + } + } + /* + * Note; iterating this way is about 1% slower and uses a few more registers than iterating + * over the modules linearly. But it's consistent with the preload routines, + * so I'm using it. + */ + for (int my = mStartY; my < mEndY; my++) { + const int imgLoadModPosY = paddingStart + my * moduleStride; + const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image + for (int mx = mStartX; mx < mEndX; mx++) { + const int m = my * numModulesX + mx; + const int imgLoadModPosX = paddingStart + mx * moduleStride; + const int pxX = imgLoadModPosX + blockPixelX; + const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image + for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { + // Checking this condition actually makes things faster ... :/ + // So I've removed the !checkCaseBounds flag and just check it all the time. + if (caseIdx + loadX < numImages) { + /* + * As long as B_Y * B_X is divisible by preloadCases this will loop the right + * number of times. + * + * This will load some images from filter pixels that don't exist (it'll set those to 0), + * but the code does not produce any output for those pixels (see last lines). 
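+                     *
+                     * For example, with B_Y = 4, B_X = 16, colorsPerThread = 4 and preloadCases = 32,
+                     * B_X * B_Y / preloadCases = 2 rows are written per iteration, so the 16 rows of
+                     * shImages are filled in 8 iterations by the threads with loadY in {0, 1}.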
+ */ + if (loadY < B_Y * colorsPerThread) { + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y*colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || + y + loadY < B_Y*colorsPerThread) { + if(y + loadY + imgColorIdx < numImgColors) { + shImgLoad[(y) * preloadCases] = images[caseIdx + y * imgPixels * imgStride + pixIdx]; + } else { + shImgLoad[(y) * preloadCases] = 0; + } + } + } + } + + if (loadY < B_X * filtersPerThread) { + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) { + if (blockFilterIdx + loadY + y < numFilters) { + shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + (y * numModules + m) * numImages]; + } else if (loadY + y < filtersPerThread * B_X) { + shHidActLoad[y * (preloadCases + 1)] = 0; + } + } + } + } + } else { + #pragma unroll + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_Y*colorsPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y*colorsPerThread) { + shImgLoad[(y) * preloadCases] = 0; + } + } + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + // Make sure number of rows in the array is divisible by number of rows filled per iteration + if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) { + shHidActLoad[y * (preloadCases + 1)] = 0; + } + } + } + + __syncthreads(); + #pragma unroll + for (int i = 0; i < preloadCases; i++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + if (blockFilterIdx + threadIdx.x + f * B_X < numFilters) { + prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i]; + } + } + } + } + __syncthreads(); + } + + } + } + if (scale) { + //#pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if (blockFilterIdx + threadIdx.x + f * B_X < numFilters) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = + scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][f]; + } + } + } + } else { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + if ((blockFilterIdx + threadIdx.x + f * B_X < numFilters) && + (c * B_Y + blockFilterColorIdx + threadIdx.y < numImgColors)) { + targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][f]; + } + } + } + } +} + +#define WET_ACT_MC_MF_KEPLER_SW_HEAD template __global__ void conv_weight_acts_mc_mf_kepler_sw +#define WET_ACT_MC_MF_KEPLER_SW_4_A(scale) \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<4,16,1,4,32,scale> (MC_MF_KEP_SW_PARAM); \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<4,16,1,8,32,scale> (MC_MF_KEP_SW_PARAM); + +#define WET_ACT_MC_MF_KEPLER_SW_4_B(scale) \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<4,16,2,4,32,scale> (MC_MF_KEP_SW_PARAM); \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<4,16,2,8,32,scale> (MC_MF_KEP_SW_PARAM); + +#define WET_ACT_MC_MF_KEPLER_SW_4_C(scale) \ + 
WET_ACT_MC_MF_KEPLER_SW_HEAD<4,16,4,4,32,scale> (MC_MF_KEP_SW_PARAM); \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<4,16,4,8,16,scale> (MC_MF_KEP_SW_PARAM); + +#define WET_ACT_MC_MF_KEPLER_SW_4_D(scale) \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<4,32,4,4,32,scale> (MC_MF_KEP_SW_PARAM); \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<4,32,4,8,16,scale> (MC_MF_KEP_SW_PARAM); + +#define WET_ACT_MC_MF_KEPLER_SW_8_A(scale) \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<8,16,1,6,32,scale> (MC_MF_KEP_SW_PARAM); \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<8,16,1,8,32,scale> (MC_MF_KEP_SW_PARAM); + +#define WET_ACT_MC_MF_KEPLER_SW_8_B(scale) \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<8,16,2,6,32,scale> (MC_MF_KEP_SW_PARAM); \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<8,16,2,8,32,scale> (MC_MF_KEP_SW_PARAM); + +#define WET_ACT_MC_MF_KEPLER_SW_8_C(scale) \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<8,16,4,6,32,scale> (MC_MF_KEP_SW_PARAM); \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<8,16,4,8,16,scale> (MC_MF_KEP_SW_PARAM); + +#define WET_ACT_MC_MF_KEPLER_SW_8_D(scale) \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<8,32,4,6,32,scale> (MC_MF_KEP_SW_PARAM); \ + WET_ACT_MC_MF_KEPLER_SW_HEAD<8,32,4,8,16,scale> (MC_MF_KEP_SW_PARAM); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_A_scale_f.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_A_scale_f.cu new file mode 100644 index 00000000..e726d090 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_A_scale_f.cu @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_A_scale_f.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ +#include "wet_act_mc_mf_kepler_sw.cuh" +namespace megdnn { +namespace cuda { + +WET_ACT_MC_MF_KEPLER_SW_4_A(false) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_B_scale_f.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_B_scale_f.cu new file mode 100644 index 00000000..5d0e752f --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_B_scale_f.cu @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_B_scale_f.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_mc_mf_kepler_sw.cuh" +namespace megdnn { +namespace cuda { + +WET_ACT_MC_MF_KEPLER_SW_4_B(false) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_C_scale_f.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_C_scale_f.cu new file mode 100644 index 00000000..66fd5843 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_C_scale_f.cu @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_C_scale_f.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_mc_mf_kepler_sw.cuh" +namespace megdnn { +namespace cuda { + +WET_ACT_MC_MF_KEPLER_SW_4_C(false) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_D_scale_f.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_D_scale_f.cu new file mode 100644 index 00000000..d6833e52 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_D_scale_f.cu @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_4_D_scale_f.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_mc_mf_kepler_sw.cuh" +namespace megdnn { +namespace cuda { + +WET_ACT_MC_MF_KEPLER_SW_4_D(false) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_A_scale_f.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_A_scale_f.cu new file mode 100644 index 00000000..a9e48071 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_A_scale_f.cu @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_A_scale_f.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_mc_mf_kepler_sw.cuh" +namespace megdnn { +namespace cuda { + +WET_ACT_MC_MF_KEPLER_SW_8_A(false) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_B_scale_f.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_B_scale_f.cu new file mode 100644 index 00000000..9ab95d72 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_B_scale_f.cu @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_B_scale_f.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ +#include "wet_act_mc_mf_kepler_sw.cuh" +namespace megdnn { +namespace cuda { + +WET_ACT_MC_MF_KEPLER_SW_8_B(false) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_C_scale_f.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_C_scale_f.cu new file mode 100644 index 00000000..902a2d0a --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_C_scale_f.cu @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_C_scale_f.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_mc_mf_kepler_sw.cuh" +namespace megdnn { +namespace cuda { + +WET_ACT_MC_MF_KEPLER_SW_8_C(false) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_D_scale_f.cu b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_D_scale_f.cu new file mode 100644 index 00000000..b896dfb3 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_D_scale_f.cu @@ -0,0 +1,38 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_mc_mf_kepler_sw_by_8_D_scale_f.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. + * -------------------------------------------------------------------------- + */ +#include "wet_act_mc_mf_kepler_sw.cuh" +namespace megdnn { +namespace cuda { + +WET_ACT_MC_MF_KEPLER_SW_8_D(false) + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_templates.cuh b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_templates.cuh new file mode 100644 index 00000000..14323838 --- /dev/null +++ b/dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_templates.cuh @@ -0,0 +1,211 @@ +/** + * \file dnn/src/cuda/local/cuda-convnet2/weight_acts/wet_act_templates.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +/** + * Copyright 2014 Google Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * -------------------------------------------------------------------------- + * * This file has been modified by Megvii ("Megvii Modifications"). + * * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
+ * -------------------------------------------------------------------------- + */ +#include "../nvmatrix.cuh" +#include "../cudaconv2.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +#define LO16(x) ((x) & 0x0000FFFF) +#define HI16(x) ((x) >> 16) + +#define WA_LOOP(r) _Pragma("unroll") \ +for (int c = 0; c < colorsPerThread; c++) { \ + _Pragma("unroll") \ + for (int f = 0; f < filtersPerThread; f++) { \ + prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * shHidActs[threadIdx.x + f * B_X][(r)]; \ + } \ +} + +#define WA_LOOP2(r) _Pragma("unroll") \ +for (int f = 0; f < filtersPerThread; f++) { \ + _Pragma("unroll") \ + for (int c = 0; c < colorsPerThread; c++) { \ + prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * shHidActs[threadIdx.x + f * B_X][(r)]; \ + } \ +} + +#define WA_IMLOAD(r) imPreload[r] = im[(r) * B_X * B_Y / preloadCases * imgPixels * imgStride]; +#define WA_IMLOAD_TX(r) imPreload[r] = tex1Dfetch(images, imgOffset2 + (r) * B_X * B_Y / preloadCases * imgPixels * imgStride); +#define WA_HALOAD(r) haPreload[r] = ha[(r) * B_X * B_Y / preloadCases * numImages * numModules]; +#define WA_HALOAD_TX(r) haPreload[r] = tex1Dfetch(hidActs, hidActsOffset2 + (r) * B_X * B_Y / preloadCases * numImages * numModules); + +#define C_KEP_PARAM float* images, float* hidActs, float* targets, \ + const int numImages, const int numFilters, \ + const int numModulesY, const int numModulesX, \ + const int imgSizeY, const int imgSizeX, \ + const int filterSize, const int paddingStart, \ + const int moduleStride, const int imgStride, \ + const int partialSum, \ + const float scaleTargets, const float scaleOutputs +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters + * threadIdx.x determines filter + * threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... + * so the compiler is messing up here somehow. It's unable to optimize that case away. 
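+ *
+ * Shape example (illustrative numbers only): 3 colors, 5x5 filters, 64 filters,
+ * 24x24 modules and partialSum = 4 give partial gradients of shape
+ * (576 / 4, 3, 25, 64) = (144, 3, 25, 64); the remaining reduction over the
+ * first (module batch) axis is left to the caller.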
+ */ +template +__global__ void conv_weight_acts_c_kepler(C_KEP_PARAM); + + + +#define MC_MF_KEP_PARAM float* images, \ + float* hidActs, float* targets, \ + const int numImages, const int numFilters, \ + const int numModulesY, const int numModulesX, \ + const int imgSizeY, const int imgSizeX, \ + const int filterSize, const int paddingStart, \ + const int moduleStride, const int imgStride, \ + const int numImgColors, const int numGroups, \ + const int partialSum, \ + const float scaleTargets, const float scaleOutputs +/* + * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread colors and B_X * filtersPerThread filters + * threadIdx.x determines filter + * threadIdx.y determines color + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines color batch of B_Y * colorsPerThread + * blockIdx.z determines pixel in filter + * NOTE: blockIdx.z is limited to values < 2^16. This means that this routine will + * fail for filters >= 256*256. I'm assuming I won't ever use such large filters. + + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) + + * B_X * B_Y must be divisible by preloadCases + */ +template +__global__ void conv_weight_acts_mc_mf_kepler(MC_MF_KEP_PARAM); + +#define MC_MF_KEP_SW_PARAM float* images, \ + float* hidActs, float* targets, \ + const int numImages, const int numFilters, \ + const int numModulesY, const int numModulesX, \ + const int imgSizeY, const int imgSizeX, const \ + int filterSize, const int paddingStart, \ + const int moduleStride, const int imgStride, \ + const int numImgColors, const int numGroups, \ + const int sumWidth, \ + const float scaleTargets, const float scaleOutputs +/* + * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread colors and B_X * filtersPerThread filters + * threadIdx.x determines filter + * threadIdx.y determines color + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines color batch of B_Y * colorsPerThread + * blockIdx.z determines pixel in filter + * NOTE: blockIdx.z is limited to values < 2^16. This means that this routine will + * fail for filters >= 256*256. I'm assuming I won't ever use such large filters. 
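+ *
+ * Here sumWidth plays the role of partialSum, but as the side length of a 2-D
+ * window: each block accumulates over a chunk of at most sumWidth x sumWidth
+ * adjacent modules before writing its partial result.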
+ + * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters) + + * B_X * B_Y must be divisible by preloadCases + */ +template +__global__ void conv_weight_acts_mc_mf_kepler_sw(MC_MF_KEP_SW_PARAM); + + + + +#define C_KEP_SW_PARAM float* images, \ + float* hidActs, float* targets, \ + const int numImages, const int numFilters, \ + const int numModulesY, const int numModulesX, \ + const int imgSizeY, const int imgSizeX, \ + const int filterSize, const int paddingStart, \ + const int moduleStride, const int imgStride, \ + const int sumWidth, \ + const float scaleTargets, const float scaleOutputs +/* + * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters + * threadIdx.x determines filter + * threadIdx.y determines pixel in filter + * + * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum + * blockIdx.y determines pixel batch of B_Y * pixelsPerThread + * + * Number of filters must be divisible by B_X * filtersPerThread + * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false. + * + * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given + * hidActs: (numFilters, numModulesY, numModulesX, numImages) + * + * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters) + * + * B_Y * B_X should be divisible by preloadCases. + * preloadCases one of 16, 32. + * B_X one of 4, 8, 16, 32 + * B_Y arbitrary (satisfying divisibility constraints) + * numModules must be divisible by partialSum + * pixelsPerThread must be divisible by pixelCache + * + * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)... + * so the compiler is messing up here somehow. It's unable to optimize that case away. + */ +template +__global__ void conv_weight_acts_c_kepler_sw(float* images, float* hidActs, float* targets, + const int numImages, const int numFilters, + const int numModulesY, const int numModulesX, + const int imgSizeY, const int imgSizeX, const int filterSize, + const int paddingStart, const int moduleStride, const int imgStride, + const int sumWidth, + const float scaleTargets, const float scaleOutputs); + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/local/forward.cpp b/dnn/src/cuda/local/forward.cpp new file mode 100644 index 00000000..24b2af6c --- /dev/null +++ b/dnn/src/cuda/local/forward.cpp @@ -0,0 +1,165 @@ +/** + * \file dnn/src/cuda/local/forward.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/local/opr_impl.h" + +#include "src/cuda/local/local.cuh" +#include "src/cuda/utils.h" +#include "src/cuda/handle.h" + +namespace megdnn { +namespace cuda { +namespace local { + +void check_input(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW, + bool is_xcorr) +{ + megdnn_ignore(N); + megdnn_ignore(IC); + megdnn_ignore(IH); + megdnn_ignore(IW); + megdnn_ignore(OC); + megdnn_ignore(OH); + megdnn_ignore(OW); + megdnn_ignore(FH); + megdnn_ignore(FW); + megdnn_ignore(INs); + megdnn_ignore(ONs); + megdnn_ignore(PH); + megdnn_ignore(PW); + megdnn_ignore(SH); + megdnn_ignore(SW); + megdnn_ignore(is_xcorr); + // shared memory constraint + megdnn_assert(IH*IW <= 768, "spatial size should not be larger than 768."); + // megdnn_assert(4 * 4 * 4 * IH * IW <= 49152); +} + +} // namespace local +} // namespace cuda +} // namespace megdnn + +namespace megdnn { +namespace cuda { + +void LocalForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) +{ + megdnn_assert(src.layout.dtype == dtype::Float32(), + "cuda do not support fp16 local operator"); + check_exec(src.layout, filter.layout, dst.layout, workspace.size); + bool is_xcorr = param().mode == Mode::CROSS_CORRELATION; + auto N = src.layout.shape[0], + IC = src.layout.shape[1], + IH = src.layout.shape[2], + IW = src.layout.shape[3]; + auto OC = dst.layout.shape[1], + OH = dst.layout.shape[2], + OW = dst.layout.shape[3]; + auto FH = filter.layout.shape[3], + FW = filter.layout.shape[4]; + auto handle = concrete_handle(this->handle()); + auto stream = cuda_stream(this->handle()); + auto cublas = cublas_handle(this->handle()); + auto one = handle->one_device(); + auto zero = handle->zero_device(); + if (use_cuda_convnet(src.layout, filter.layout, dst.layout)) { + local::forward_proxy_convnet(src.ptr(), + filter.ptr(), + dst.ptr(), + reinterpret_cast(workspace.raw_ptr), + N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + param().pad_h, param().pad_w, + param().stride_h, param().stride_w, + cublas, stream, + one, zero); + } else { + local::check_input(N, IC, IH, IW, OC, OH, OW, FH, FW, + IC*IH*IW, OC*OH*OW, + param().pad_h, param().pad_w, + param().stride_h, param().stride_w, + is_xcorr); + local::forward_proxy_weiming(src.ptr(), + filter.ptr(), + dst.ptr(), + N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + param().pad_h, param().pad_w, + param().stride_h, param().stride_w, + is_xcorr, + stream); + } +} + +size_t LocalForwardImpl::get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) +{ + size_t res = 0u; + auto N = src.shape[0], + IC = src.shape[1], IH = src.shape[2], IW = src.shape[3], + OC = dst.shape[1], OH = dst.shape[2], OW = dst.shape[3], + FH = filter.shape[3], FW = filter.shape[4]; + auto PH = param().pad_h, PW = param().pad_w, + SH = param().stride_h, SW = param().stride_w; + if (use_cuda_convnet(src, filter, dst)) { + res = local::get_workspace_in_floats_forward_proxy_convnet(N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + PH, PW, + SH, SW) * sizeof(dt_float32); + } else { + res = 0u; + } + return res; +} + +bool LocalForwardImpl::use_cuda_convnet(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) +{ + auto N = src.shape[0], + IC = src.shape[1], IH = src.shape[2], IW = src.shape[3], + OC = 
dst.shape[1], OH = dst.shape[2], OW = dst.shape[3], + FH = filter.shape[3], FW = filter.shape[4]; + auto PH = param().pad_h, PW = param().pad_w, + SH = param().stride_h, SW = param().stride_w; + return param().mode == Mode::CROSS_CORRELATION && + local::can_forward_proxy_convnet(N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + IC*IH*IW, OC*OH*OW, + PH, PW, + SH, SW); +} + +} // namespace cuda +} // namespace megdnn + + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local/forward.cu b/dnn/src/cuda/local/forward.cu new file mode 100644 index 00000000..d5cc696b --- /dev/null +++ b/dnn/src/cuda/local/forward.cu @@ -0,0 +1,210 @@ +/** + * \file dnn/src/cuda/local/forward.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/local/local.cuh" + +#include "src/cuda/utils.cuh" +#include "src/cuda/local/cuda-convnet2/nvmatrix.cuh" +#include "src/cuda/local/cuda-convnet2/cudaconv2.cuh" + +namespace megdnn { +namespace cuda { +namespace local { + +// blockIdx.y is OC*OH*OW/1024 +// blockIdx.x is N/4 +// threadIdx.x is [0, 1024) +template +__global__ void forward_kernel(const float * __restrict__ src, + const float * __restrict__ filter, + float * __restrict__ dst, + uint32_t N, + uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t OC, uint32_t OH, uint32_t OW, + uint32_t FH, uint32_t FW, + uint32_t INs, size_t ONs, + uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW) +{ + // Ns*ICs*sizeof(float)*IH*IW + extern __shared__ float shared_mem[]; + float *src_cache = shared_mem; + uint32_t tid = threadIdx.x; + uint32_t tstride = blockDim.x; + uint32_t oid = tid + blockIdx.y * tstride; + src += blockIdx.x*Ns * INs; + dst += blockIdx.x*Ns * ONs; + uint32_t op = oid / OC; + uint32_t oc = oid % OC; + uint32_t oh = op / OW; + uint32_t ow = op % OW; + float dst_reg[Ns]; + for (uint32_t no = 0; no < Ns; ++no) dst_reg[no] = 0.0f; + uint32_t Nb = min(N-blockIdx.x*Ns, Ns); + for (uint32_t ic = 0; ic < IC; ic += ICs) { + // read ICs-channel src + // (Ns, ICs, IHs, IWs) + uint32_t ICb = min(ICs, IC-ic); + for (uint32_t i = tid; i < Nb*ICs*IH*IW; i += tstride) { + uint32_t ip = i % (IH*IW); + uint32_t ico = i / (IH*IW) % ICs; + uint32_t no = i / (IH*IW) / ICs; + src_cache[i] = + (ico < ICb) * src[no*INs + min(IC-1, (ic+ico))*IH*IW + ip]; + } + __syncthreads(); + if (oid < OC*OH*OW) + for (uint32_t fh = 0; fh < FH; ++fh) + { + uint32_t ih; + if (is_xcorr) ih = oh*SH + fh - PH; else ih = oh*SH + (FH-fh-1) - PH; + if (ih < IH) + for (uint32_t fw = 0; fw < FW; ++fw) + { + uint32_t iw; + if (is_xcorr) iw = ow*SW + fw - PW; else iw = ow*SW + (FW-fw-1) - PW; + if (iw < IW) + for (uint32_t ico = 0; ico < ICb; ++ico) { + uint32_t fid = op*IC*FH*FW*OC + (ic+ico)*FH*FW*OC + + fh*FW*OC + fw*OC + oc; + float fval = filter[fid]; + float src_reg[Ns]; +#pragma unroll + for (uint32_t no = 0; no < Ns; ++no) { + src_reg[no] = src_cache[no*ICs*IH*IW + ico*IH*IW + ih*IW + iw]; + } +#pragma unroll + for (uint32_t no = 0; no < Ns; ++no) { + dst_reg[no] += src_reg[no]*fval; + } + } + } + } + __syncthreads(); + } + if (oid < OC*OH*OW) { + for (uint32_t no = 0; no < Nb; ++no) { + dst[no*ONs + oc*OH*OW + op] = dst_reg[no]; + } + } +} + +void forward_proxy_weiming(const float *src, const float *filter, float 
*dst, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW, + bool is_xcorr, + cudaStream_t stream) +{ + size_t threads = 256; + const size_t Ns = 4, ICs = 4; + dim3 blocks = dim3(DIVUP(N, Ns), DIVUP(OC*OH*OW, threads)); + if (is_xcorr) { + forward_kernel<<>>(src, filter, dst, + N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + INs, ONs, + PH, PW, + SH, SW); + } else { + forward_kernel<<>>(src, filter, dst, + N, + IC, IH, IW, + OC, OH, OW, + FH, FW, + INs, ONs, + PH, PW, + SH, SW); + } + after_kernel_launch(); +} + +bool can_forward_proxy_convnet(size_t N, + size_t IC, size_t /* IH */, size_t /* IW */, + size_t /*OC*/, size_t /* OH */, size_t /* OW */, + size_t FH, size_t FW, + size_t /* INs */, size_t /* ONs */, + size_t PH, size_t PW, + size_t SH, size_t SW) +{ + bool flag = true; + // check pad + flag &= (PH == PW); + // check stride + flag &= (SH == SW); + // megdnn_assert(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 4 == 0))); + flag &= (IC <= 3 || IC % 4 == 0); + // megdnn_assert(numFilters % (16 * numGroups) == 0); + //flag &= (OC % 16 == 0); + // megdnn_assert(filterSize * filterSize == filterPixels); + flag &= (FH == FW); + flag &= (SH <= FH); + flag &= (N % 32 == 0); + return flag; +} + +size_t get_workspace_in_floats_forward_proxy_convnet(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t /* FH */, size_t /* FW */, + size_t /* INs */, size_t /* ONs */, + size_t /* PH */, size_t /* PW */, + size_t /* SH */, size_t /* SW */) +{ + return N*IC*IH*IW + N*OC*OH*OW; +} + +void forward_proxy_convnet(const float *src, const float *filter, float *dst, + float *workspace, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, // IN stride and ON stride + size_t PH, size_t /* PW */, + size_t SH, size_t /* SW */, + cublasHandle_t cublas_handle, + cudaStream_t stream, + float *one, float *zero) + +{ + MemorySegment msrc_n(const_cast(src)), + mdst_n(dst), + mfilter(const_cast(filter)), + msrc_t(workspace+0), + mdst_t(workspace+N*IC*IH*IW); + NVMatrix nvimage_n(&msrc_n, N, IC*IH*IW, INs); + NVMatrix nvtarget_n(&mdst_n, N, OC*OH*OW, ONs); + NVMatrix nvimage_t(&msrc_t, IC*IH*IW, N); + NVMatrix nvfilter(&mfilter, OH*OW*IC*FH*FW, OC); + NVMatrix nvtarget_t(&mdst_t, OC*OH*OW, N); + + nvimage_n.transpose(nvimage_t, cublas_handle, one, zero); + + localFilterActs(stream, nvimage_t, nvfilter, nvtarget_t, + IH, OH, OW, -static_cast(PH), SH, IC, 1); + after_kernel_launch(); + + nvtarget_t.transpose(nvtarget_n, cublas_handle, one, zero); +} + +} // namespace local +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local/local.cuh b/dnn/src/cuda/local/local.cuh new file mode 100644 index 00000000..5ec7c443 --- /dev/null +++ b/dnn/src/cuda/local/local.cuh @@ -0,0 +1,141 @@ +/** + * \file dnn/src/cuda/local/local.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include +#include + +namespace megdnn { +namespace cuda { +namespace local { + +void check_input(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW, + bool is_xcorr); + +void forward_proxy_weiming(const float *src, const float *filter, float *dst, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW, + bool is_xcorr, + cudaStream_t stream); + +/// forward + +bool can_forward_proxy_convnet(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW); + +void forward_proxy_convnet(const float *src, const float *filter, float *dst, + float *workspace, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, // IN stride and ON stride + size_t PH, size_t PW, + size_t SH, size_t SW, + cublasHandle_t cublas_handle, + cudaStream_t stream, + float *one, float *zero); + +size_t get_workspace_in_floats_forward_proxy_convnet(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW); + +/// bwd data + +bool can_backward_data_proxy_convnet(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW); + +void backward_data_proxy_convnet(const float *filter, + const float *diff, + float *grad, + float *workspace, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, // IN stride and ON stride + size_t PH, size_t PW, + size_t SH, size_t SW, + cublasHandle_t cublas_handle, + cudaStream_t stream, + float *one, float *zero); + +size_t get_workspace_in_floats_backward_data_proxy_convnet(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW); + +/// bwd filter + +bool can_backward_filter_proxy_convnet(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW); + +void backward_filter_proxy_convnet(const float *src, + const float *diff, + float *grad, + float *workspace, + size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, // IN stride and ON stride + size_t PH, size_t PW, + size_t SH, size_t SW, + cublasHandle_t cublas_handle, + cudaStream_t stream, + float *one, float *zero); + +size_t get_workspace_in_floats_backward_filter_proxy_convnet(size_t N, + size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, + size_t FH, size_t FW, + size_t INs, size_t ONs, + size_t PH, size_t PW, + size_t SH, size_t SW); + +} // namespace local +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local/opr_impl.h b/dnn/src/cuda/local/opr_impl.h new file mode 100644 index 00000000..81d76967 --- /dev/null +++ b/dnn/src/cuda/local/opr_impl.h @@ -0,0 +1,70 @@ +/** + * \file 
dnn/src/cuda/local/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +#include "src/cuda/cudnn_wrapper.h" + +namespace megdnn { +namespace cuda { + +class LocalForwardImpl final: public LocalForward { + public: + using LocalForward::LocalForward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst) override; + private: + bool use_cuda_convnet(const TensorLayout &src, + const TensorLayout &filter, + const TensorLayout &dst); +}; + +class LocalBackwardDataImpl final: public LocalBackwardData { + public: + using LocalBackwardData::LocalBackwardData; + void exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad) override; + private: + bool use_cuda_convnet(const TensorLayout &filter, + const TensorLayout &diff, + const TensorLayout &grad); +}; + +class LocalBackwardFilterImpl final: public LocalBackwardFilter { + public: + using LocalBackwardFilter::LocalBackwardFilter; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_in grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad) override; + private: + bool use_cuda_convnet(const TensorLayout &src, + const TensorLayout &diff, + const TensorLayout &grad); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_data/algo.cpp b/dnn/src/cuda/local_share/backward_data/algo.cpp new file mode 100644 index 00000000..0e3f26b8 --- /dev/null +++ b/dnn/src/cuda/local_share/backward_data/algo.cpp @@ -0,0 +1,54 @@ +/** + * \file dnn/src/cuda/local_share/backward_data/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +LocalShareBackwardDataImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&implicit_gemm); + all_algos.push_back(&batched_matmul); +} + +LocalShareBackwardDataImpl::AlgoPack LocalShareBackwardDataImpl::sm_algo_pack; + +LocalShareBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( + LocalShareBackwardDataImpl* o, const TensorLayout& filter, + const TensorLayout& diff, const TensorLayout& grad) + : opr{o}, filter_layout{filter}, diff_layout{diff}, grad_layout{grad} {} + +LocalShareBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs(LocalShareBackwardDataImpl* opr, + _megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) + : SizeArgs(opr, filter.layout, diff.layout, grad.layout), + filter_tensor{&filter}, + diff_tensor{&diff}, + grad_tensor{&grad}, + workspace{workspace} {} + +std::string LocalShareBackwardDataImpl::AlgoBase::SizeArgs::to_string() const { + auto&& param = opr->param(); + MEGDNN_MARK_USED_VAR(param); + return megdnn_mangle(ssprintf( + "filter=%s, diff=%s, grad=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s->%s", + filter_layout.to_string().c_str(), diff_layout.to_string().c_str(), + grad_layout.to_string().c_str(), param.pad_h, param.pad_w, + param.stride_h, param.stride_w, param.dilate_h, param.dilate_w, + static_cast(param.mode), filter_layout.dtype.name(), + diff_layout.dtype.name(), grad_layout.dtype.name())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_data/algo.h b/dnn/src/cuda/local_share/backward_data/algo.h new file mode 100644 index 00000000..7c5f2e8a --- /dev/null +++ b/dnn/src/cuda/local_share/backward_data/algo.h @@ -0,0 +1,113 @@ +/** + * \file dnn/src/cuda/local_share/backward_data/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +#include "src/cuda/handle.h" +#include "src/cuda/local_share/opr_impl.h" + +namespace megdnn { +namespace cuda { + +class LocalShareBackwardDataImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + LocalShareBackwardDataImpl* opr; + TensorLayout filter_layout, diff_layout, grad_layout; + + std::string to_string() const; + SizeArgs(LocalShareBackwardDataImpl* opr, const TensorLayout& filter, + const TensorLayout& diff, const TensorLayout& grad); + }; + struct ExecArgs : public SizeArgs { + const TensorND *filter_tensor, *diff_tensor, *grad_tensor; + Workspace workspace; + + ExecArgs(LocalShareBackwardDataImpl* opr, _megdnn_tensor_in filter, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "local share conv fwd algo %s: required workspace %zu " + "bytes, got %zu", + name(), req, workspace.size); + return *this; + } +}; + +class LocalShareBackwardDataImpl::AlgoImplicitGemm final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { + return "LOCAL_SHARE_IMPLICIT_GEMM"; + } +}; + +class LocalShareBackwardDataImpl::AlgoBatchedMatMul final + : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { + return "LOCAL_SHARE_BATCHED_MATMUL"; + } +}; + +class LocalShareBackwardDataImpl::AlgoPack { + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + + AlgoImplicitGemm implicit_gemm; + AlgoBatchedMatMul batched_matmul; + + std::vector all_algos; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_data/batched_matmul.cpp b/dnn/src/cuda/local_share/backward_data/batched_matmul.cpp new file mode 100644 index 00000000..afff3129 --- /dev/null +++ b/dnn/src/cuda/local_share/backward_data/batched_matmul.cpp @@ -0,0 +1,145 @@ +/** + * \file dnn/src/cuda/local_share/backward_data/batched_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./algo.h" +#include "src/cuda/local_share/im2col.cuh" +#include "src/cuda/local_share/opr_impl.h" + +#include +#include "src/common/utils.h" + +using namespace megdnn; +using namespace cuda; + +bool LocalShareBackwardDataImpl::AlgoBatchedMatMul::is_available( + const SizeArgs& args) const { + using Param = LocalShare::Param; + using Format = Param::Format; + using Mode = Param::Mode; + auto&& param = args.opr->param(); + auto format = param.format; + auto mode = param.mode; + bool available = true; + // format must be nchw + available &= (format == Format::NCHW); + // mode must be cross correlation + available &= (mode == Mode::CROSS_CORRELATION); + auto filter_dtype = args.filter_layout.dtype, + diff_dtype = args.diff_layout.dtype, + grad_dtype = args.grad_layout.dtype; + // only support float32 + available &= (filter_dtype == diff_dtype && filter_dtype == grad_dtype && + filter_dtype == dtype::Float32()); + // do not support dilate conv + size_t dh = param.dilate_h, dw = param.dilate_w; + available &= (dh == 1 && dw == 1); + return available; +} + +WorkspaceBundle +LocalShareBackwardDataImpl::AlgoBatchedMatMul::get_workspace_bundle( + dt_byte* raw_ptr, const SizeArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.grad_layout, args.filter_layout, + args.diff_layout, param); + using Param = LocalShare::Param; + using Sparse = Param::Sparse; + size_t groups = 1; + if (param.sparse == Sparse::GROUP) { + groups = args.filter_layout.shape[0]; + } + size_t icpg = ci / groups, ocpg = co / groups; + size_t ws_pretranspose = n * co * ho * wo * args.diff_layout.dtype.size(); + size_t ws_col2im = + n * ci * ho * wo * fh * fw * args.grad_layout.dtype.size(); + auto&& matmul_opr = args.opr->handle()->create_operator(); + TensorLayout A{{groups * sgh * sgw, icpg * fh * fw, ocpg}, + dtype::Float32()}; + TensorLayout B{{groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, + dtype::Float32()}; + TensorLayout C{ + {groups * sgh * sgw, icpg * fh * fw, ho / sgh * wo / sgw * n}, + dtype::Float32()}; + size_t ws_matmul = matmul_opr->get_workspace_in_bytes(A, B, C); + WorkspaceBundle ws{raw_ptr, {ws_pretranspose, ws_col2im, ws_matmul}}; + return ws; +} + +size_t LocalShareBackwardDataImpl::AlgoBatchedMatMul::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void LocalShareBackwardDataImpl::AlgoBatchedMatMul::exec( + const ExecArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.grad_layout, args.filter_layout, + args.diff_layout, param); + using Param = LocalShare::Param; + using Sparse = Param::Sparse; + size_t groups = 1; + if (param.sparse == Sparse::GROUP) { + groups = args.filter_layout.shape[0]; + } + size_t icpg = ci / groups, ocpg = co / groups; + local_share::Param kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ph = ph, + kern_param.pw = pw, kern_param.grp_ho = ho / sgh, + kern_param.grp_wo = wo / sgw, kern_param.sgh = sgh, kern_param.sgw = sgw; + + auto ws = get_workspace_bundle(args.workspace.raw_ptr, args); + auto ws_pretranspose = ws.get(0); + auto ws_col2im = ws.get(1); + auto ws_matmul = ws.get(2); + + { + TensorLayout B1{{groups, sgh, 
sgw, ocpg, ho / sgh, wo / sgw, n}, + dtype::Float32()}; + B1.stride[0] = wo * ho * ocpg; + B1.stride[1] = wo * ho / sgh; + B1.stride[2] = wo / sgw; + B1.stride[3] = wo * ho; + B1.stride[4] = wo; + B1.stride[5] = 1; + B1.stride[6] = co * ho * wo; + TensorND ts_B1{args.diff_tensor->raw_ptr, B1}; + TensorLayout B2{{groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, + dtype::Float32()}; + B2.init_contiguous_stride(); + TensorND ts_B2{ws_pretranspose, B2}; + auto&& relayout_opr = args.opr->handle()->create_operator(); + relayout_opr->exec(ts_B1, ts_B2); + } + + auto&& matmul_opr = args.opr->handle()->create_operator(); + TensorLayout A{{groups * sgh * sgw, icpg * fh * fw, ocpg}, + dtype::Float32()}; + TensorLayout B{{groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, + dtype::Float32()}; + TensorLayout C{ + {groups * sgh * sgw, icpg * fh * fw, ho / sgh * wo / sgw * n}, + dtype::Float32()}; + TensorND ts_A{args.filter_tensor->raw_ptr, A}; + TensorND ts_B{ws_pretranspose, B}; + TensorND ts_C{ws_col2im, C}; + Workspace ws_wrapper; + ws_wrapper.raw_ptr = reinterpret_cast(ws_matmul); + ws_wrapper.size = ws.get_size(2); + matmul_opr->exec(ts_A, ts_B, ts_C, ws_wrapper); + + auto&& stream = cuda_stream(args.opr->handle()); + local_share::_do_local_share_col2im( + reinterpret_cast(ws_col2im), + args.grad_tensor->ptr(), fh, fw, sh, sw, groups, + kern_param, stream); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_data/implicit_gemm.cpp b/dnn/src/cuda/local_share/backward_data/implicit_gemm.cpp new file mode 100644 index 00000000..e2ab50fa --- /dev/null +++ b/dnn/src/cuda/local_share/backward_data/implicit_gemm.cpp @@ -0,0 +1,92 @@ +/** + * \file dnn/src/cuda/local_share/backward_data/implicit_gemm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
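// A worked shape example (sample sizes, not from the commit) for the batched-matmul
// backward-data path above: relayout diff into (groups*sgh*sgw, ocpg, grp_ho*grp_wo*n),
// run the batched matmul against the filter, then col2im the result back to
// (n, ci, hi, wi). With groups = 1, icpg = 16, ocpg = 64, fh = fw = 3, sgh = sgw = 2,
// n = 32 and grp_ho = grp_wo = 12:
#include <cstdio>
int main() {
    const int groups = 1, sgh = 2, sgw = 2, icpg = 16, ocpg = 64, fh = 3, fw = 3;
    const int n = 32, grp_ho = 12, grp_wo = 12;
    const int batch = groups * sgh * sgw;  // one matmul per spatial group
    std::printf("A (filter)   : (%d, %d, %d)\n", batch, icpg * fh * fw, ocpg);
    std::printf("B (diff)     : (%d, %d, %d)\n", batch, ocpg, grp_ho * grp_wo * n);
    std::printf("C (col2im in): (%d, %d, %d)\n", batch, icpg * fh * fw,
                grp_ho * grp_wo * n);  // (4, 144, 4608)
}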
+ */ +#include "./algo.h" +#include "./local_share_bwd_data.cuh" +#include "src/cuda/local_share/opr_impl.h" + +#include +#include "src/common/utils.h" + +using namespace megdnn; +using namespace cuda; + +bool LocalShareBackwardDataImpl::AlgoImplicitGemm::is_available( + const SizeArgs& args) const { + using Param = LocalShare::Param; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + auto&& param = args.opr->param(); + auto format = param.format; + auto sparse = param.sparse; + auto mode = param.mode; + bool available = true; + // format must be nchw + available &= (format == Format::NCHW); + // only support dense conv + available &= (sparse == Sparse::DENSE); + // mode must be cross correlation + available &= (mode == Mode::CROSS_CORRELATION); + unpack_local_share_params(args.grad_layout, args.filter_layout, + args.diff_layout, param); + available &= (ho % sgh == 0 && wo % sgw == 0); + // not support dilated convolution + available &= (dh == 1 && dw == 1); + available &= (co % 4 == 0); + auto filter_dtype = args.filter_layout.dtype, + diff_dtype = args.diff_layout.dtype, + grad_dtype = args.grad_layout.dtype; + // only support float32 + available &= (filter_dtype == diff_dtype && filter_dtype == grad_dtype && + filter_dtype == dtype::Float32()); + // only support sm_60 or later + available &= is_compute_capability_required(6, 0); + + return available; +} + +size_t +LocalShareBackwardDataImpl::AlgoImplicitGemm::get_workspace_in_bytes( + const SizeArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.grad_layout, args.filter_layout, + args.diff_layout, param); + size_t ws_size_grad = n * ci * hi * wi * args.grad_layout.dtype.size(); + size_t ws_size_diff = n * co * ho * wo * args.diff_layout.dtype.size(); + return ws_size_grad + ws_size_diff; +} + +void LocalShareBackwardDataImpl::AlgoImplicitGemm::exec( + const ExecArgs& args) const { + local_share::Param kern_param; + auto&& param = args.opr->param(); + unpack_local_share_params(args.grad_layout, args.filter_layout, + args.diff_layout, param); + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ph = ph, + kern_param.pw = pw, kern_param.grp_ho = ho / sgh, + kern_param.grp_wo = wo / sgw, kern_param.sgh = sgh, kern_param.sgw = sgw; + auto&& handle = concrete_handle(args.opr->handle()); + auto&& cublas_hdl = cublas_handle(args.opr->handle()); + auto&& stream = cuda_stream(args.opr->handle()); + + auto one = handle->one_device(); + auto zero = handle->zero_device(); + + local_share_bwd_data::_do_local_share_bwd_data_implicit_gemm( + args.filter_tensor->ptr(), + args.diff_tensor->ptr(), + args.grad_tensor->ptr(), + reinterpret_cast(args.workspace.raw_ptr), fh, fw, sh, sw, + kern_param, cublas_hdl, stream, one, zero); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_data/local_share_bwd_data.cuh b/dnn/src/cuda/local_share/backward_data/local_share_bwd_data.cuh new file mode 100644 index 00000000..45ba4148 --- /dev/null +++ b/dnn/src/cuda/local_share/backward_data/local_share_bwd_data.cuh @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/local_share/backward_data/local_share_bwd_data.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
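// A minimal sketch (float32 only, struct and helper names are illustrative) of the
// workspace layout behind get_workspace_in_bytes() above: the implicit-gemm kernel
// works on batch-innermost copies of grad and diff, so the algorithm reserves a
// (ci, hi, wi, n) sized buffer followed by a (co, ho, wo, n) sized buffer.
#include <cstddef>
struct BwdDataWorkspaceSketch {
    size_t grad_bytes;  // n * ci * hi * wi * sizeof(float), grad in CHWN order
    size_t diff_bytes;  // n * co * ho * wo * sizeof(float), diff in CHWN order
    size_t total() const { return grad_bytes + diff_bytes; }
};
inline BwdDataWorkspaceSketch bwd_data_workspace_sketch(
        size_t n, size_t ci, size_t hi, size_t wi,
        size_t co, size_t ho, size_t wo) {
    return {n * ci * hi * wi * sizeof(float), n * co * ho * wo * sizeof(float)};
}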
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "src/cuda/local_share/helper.cuh"
+
+namespace megdnn {
+namespace cuda {
+namespace local_share_bwd_data {
+
+void _do_local_share_bwd_data_implicit_gemm(
+        const float* d_filter, const float* d_diff, float* d_grad,
+        float* workspace, int fh, int fw, int sh, int sw,
+        const local_share::Param& param, cublasHandle_t cublas_handle,
+        cudaStream_t stream, float* one, float* zero);
+
+} // namespace local_share_bwd_data
+} // namespace cuda
+} // namespace megdnn
+
+// vim: syntax=cuda.doxygen
diff --git a/dnn/src/cuda/local_share/backward_data/local_share_bwd_data_f32_implicit_gemm.cu b/dnn/src/cuda/local_share/backward_data/local_share_bwd_data_f32_implicit_gemm.cu
new file mode 100644
index 00000000..e4a62462
--- /dev/null
+++ b/dnn/src/cuda/local_share/backward_data/local_share_bwd_data_f32_implicit_gemm.cu
@@ -0,0 +1,600 @@
+/**
+ * \file dnn/src/cuda/local_share/backward_data/local_share_bwd_data_f32_implicit_gemm.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "./local_share_bwd_data.cuh"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace local_share;
+
+namespace {
+template <int unroll_ci_, int unroll_co_, int unroll_n_>
+struct UnrollConfig {
+    static int const unroll_ci = unroll_ci_;
+    static int const unroll_co = unroll_co_;
+    static int const unroll_n = unroll_n_;
+};
+
+template <int thread_x, int thread_y>
+struct ThreadConfig {
+    static int const nr_thread_x = thread_x;
+    static int const nr_thread_y = thread_y;
+    static int const nr_threads = nr_thread_x * nr_thread_y;
+};
+
+template <typename UnrollConfig, typename ThreadConfig>
+struct DiffTileCount {
+    static int const tile_batch =
+            UnrollConfig::unroll_n * ThreadConfig::nr_thread_x;
+
+    static int const load_x = tile_batch > 32 ? 32 : tile_batch;
+    static int const load_y = ThreadConfig::nr_threads / load_x;
+
+    static int const smem_h = UnrollConfig::unroll_co;
+    static int const smem_w = tile_batch;
+    static int const smem_stride = smem_w % 2 == 0 ? smem_w + 1 : smem_w;
+    static int const smem_tot = smem_h * smem_stride;
+
+    static int const reg_row = (smem_h + load_y - 1) / load_y;
+    static int const reg_col = (smem_w + load_x - 1) / load_x;
+    static bool const check_sh_bounds = smem_w % load_x != 0;
+};
+
+template <typename UnrollConfig, typename ThreadConfig>
+struct FilterTileCount {
+    static int const tile_ci =
+            ThreadConfig::nr_thread_y * UnrollConfig::unroll_ci;
+    static int const smem_h = tile_ci;
+    static int const smem_w = UnrollConfig::unroll_co;
+    static int const smem_stride = smem_w % 2 == 0 ? smem_w + 1 : smem_w;
+    static int const smem_tot = smem_h * smem_stride;
+
+    static int const load_x =
+            UnrollConfig::unroll_co > 32 ?
32 : UnrollConfig::unroll_co; + static int const load_y = ThreadConfig::nr_threads / load_x; + + static int const reg_row = (smem_h + load_y - 1) / load_y; + static int const reg_col = (smem_w + load_x - 1) / load_x; + static bool const check_bounds_h = smem_h % load_y != 0; + static bool const check_bounds_w = smem_w % load_x != 0; +}; + +template +struct DiffGlobal2ShareMemVisitor { + typedef DiffTileCount TileCount; + typedef float copy_t; + float* smem; + const copy_t* g_ptr; + int stride; + int remain; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_row][TileCount::reg_col]; + + __device__ DiffGlobal2ShareMemVisitor(copy_t* smem, int stride, int remain) + : smem{smem}, stride{stride}, remain{remain} {} + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_sh_bounds && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr(h_idx, w_idx)) = val; + } else { + *(sh_ptr(h_idx, w_idx)) = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_sh_bounds && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_sh_bounds && w_idx >= TileCount::smem_w) + continue; + *(sh_ptr(h_idx, w_idx)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ float* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += UnrollConfig::unroll_co * stride; + } +}; + +template +struct FilterGlobal2ShareMemVisitor { + typedef FilterTileCount TileCount; + typedef float copy_t; + float* smem; + const copy_t* g_ptr; + int stride; + int remain; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_row][TileCount::reg_col]; + + __device__ FilterGlobal2ShareMemVisitor(copy_t* smem, int stride, + int remain) + : smem{smem}, stride{stride}, remain{remain} {} + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = 
gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_bounds_w && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (h_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr(h_idx, w_idx)) = val; + } else { + *(sh_ptr(h_idx, w_idx)) = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_bounds_w && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (h_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_bounds_w && w_idx >= TileCount::smem_w) + continue; + *(sh_ptr(h_idx, w_idx)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ float* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += UnrollConfig::unroll_co; + } +}; + +template +__device__ __forceinline__ void consume_block( + DiffGlobal2ShareMemVisitor& + diff_gl2sh_visitor, + FilterGlobal2ShareMemVisitor& + filter_gl2sh_visitor, + float r_diff[UnrollConfig::unroll_n], + float r_filter[UnrollConfig::unroll_ci], + float r_grad[UnrollConfig::unroll_ci][UnrollConfig::unroll_n]) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + +#pragma unroll + for (int co_inner = 0; co_inner < UnrollConfig::unroll_co; ++co_inner) { +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_n; ++i) { + r_diff[i] = *(diff_gl2sh_visitor.sh_ptr( + co_inner, tidx + i * ThreadConfig::nr_thread_x)); + } +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_ci; ++j) { + r_filter[j] = *(filter_gl2sh_visitor.sh_ptr( + tidy + j * ThreadConfig::nr_thread_y, co_inner)); + } +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ci; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) { + r_grad[i][j] += r_diff[j] * r_filter[i]; + } + } + } +} + +template +__global__ void local_share_bwd_data_device_template_f32( + const float* __restrict__ filter, const float* __restrict__ diff, + float* __restrict__ grad, Param param, int fh, int fw, int sh, int sw) { + typedef DiffTileCount DiffTileCount; + typedef FilterTileCount FilterTileCount; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + const int b_hi = bidx / param.wi; + const int b_wi = bidx - param.wi * b_hi; + + const int b_batch = bidy * DiffTileCount::tile_batch; + const int b_ci = bidz * FilterTileCount::tile_ci; + const int t_batch = tidx + b_batch; + const int t_ci = 
tidy + b_ci; + + const int ho = param.sgh * param.grp_ho; + const int wo = param.sgw * param.grp_wo; + + extern __shared__ float smem[]; + float* sh_diff = smem; + float* sh_filter = smem + DiffTileCount::smem_tot; + + const float* __restrict__ g_ptr_diff = diff + b_batch; + const float* __restrict__ g_ptr_filter = + filter + b_ci * fh * fw * param.co; // input channel stride + float* __restrict__ g_ptr_grad = + grad + t_ci * param.hi * param.wi * param.n // input channel stride + + (b_hi * param.wi + b_wi) * param.n // spatial stride + + t_batch; // batch stride + + DiffGlobal2ShareMemVisitor + diff_gl2sh_visitor{sh_diff, ho * wo * param.n, param.n - b_batch}; + FilterGlobal2ShareMemVisitor + filter_gl2sh_visitor{sh_filter, param.co * fh * fw, + param.ci - b_ci}; + + float r_diff[UnrollConfig::unroll_n]; + float r_filter[UnrollConfig::unroll_ci]; + float r_grad[UnrollConfig::unroll_ci][UnrollConfig::unroll_n]; + +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ci; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) { + r_grad[i][j] = 0.f; + } + } + + int height_start = b_hi + param.ph - fh + sh; + int width_start = b_wi + param.pw - fw + sw; + height_start = height_start >= 0 ? height_start / sh : 0; + width_start = width_start >= 0 ? width_start / sw : 0; + int height_end = (b_hi + param.ph) / sh; + int width_end = (b_wi + param.pw) / sw; + height_end = height_end < ho ? height_end : ho - 1; + width_end = width_end < wo ? width_end : wo - 1; + int nr_elems_per_filter_grp = param.ci * param.co * fh * fw; + const int co_blks = + (param.co + UnrollConfig::unroll_co - 1) / UnrollConfig::unroll_co; + + int kh = b_hi + param.ph - height_start * sh; + int kw = b_wi + param.pw - width_start * sw; + int sgh_idx = height_start / param.grp_ho; + int sgw_idx = width_start / param.grp_wo; + diff_gl2sh_visitor.g_ptr = + g_ptr_diff + (height_start * wo + width_start) * param.n; + filter_gl2sh_visitor.g_ptr = + g_ptr_filter + + (sgh_idx * param.sgw + sgw_idx) * nr_elems_per_filter_grp + + (kh * fw + kw) * param.co; + + if (height_start <= height_end && width_start <= width_end) { + diff_gl2sh_visitor.first_copy(); + filter_gl2sh_visitor.first_copy(); + __syncthreads(); + } + + for (int h = height_start; h <= height_end; ++h) { + for (int w = width_start; w <= width_end; ++w) { + for (int co_outer = 0; co_outer < co_blks; co_outer++) { + if (co_outer == co_blks - 1) { + // not last tile + if (!(h == height_end && w == width_end)) { + int w_next = w == width_end ? width_start : w + 1; + int h_next = w == width_end ? 
h + 1 : h; + int kh = b_hi + param.ph - h_next * sh; + int kw = b_wi + param.pw - w_next * sw; + + int sgh_idx = h_next / param.grp_ho; + int sgw_idx = w_next / param.grp_wo; + diff_gl2sh_visitor.g_ptr = + g_ptr_diff + (h_next * wo + w_next) * param.n; + filter_gl2sh_visitor.g_ptr = + g_ptr_filter + + (sgh_idx * param.sgw + sgw_idx) * + nr_elems_per_filter_grp + + (kh * fw + kw) * param.co; + diff_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + } else { + diff_gl2sh_visitor.move_forward(); + filter_gl2sh_visitor.move_forward(); + diff_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + + consume_block( + diff_gl2sh_visitor, filter_gl2sh_visitor, r_diff, + r_filter, r_grad); + + // last tile + if (!(h == height_end && w == width_end && + co_outer == co_blks - 1)) { + __syncthreads(); + diff_gl2sh_visitor.commit(); + filter_gl2sh_visitor.commit(); + __syncthreads(); + } + } + } + } + + const int ci_stride = param.hi * param.wi * param.n; + // store +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ci; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) { + if (check_bounds && + (t_batch + j * ThreadConfig::nr_thread_x >= param.n || + t_ci + i * ThreadConfig::nr_thread_y >= param.ci)) { + } else { + g_ptr_grad[j * ThreadConfig::nr_thread_x + + i * ThreadConfig::nr_thread_y * ci_stride] = + r_grad[i][j]; + } + } + } +} + +void (*get_kern(const Param& param, LaunchConfig& launch_config))( + const float* __restrict__, const float* __restrict__, + float* __restrict__, Param, int, int, int, int) { + void (*kern)(const float* __restrict__, const float* __restrict__, + float* __restrict__, Param, int, int, int, int); + kern = nullptr; +#define CHK3(n_, ci_, co_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.ci >= ci_) { \ + if (param.co % co_ == 0) { \ + static constexpr int unroll_ci = (ci_ + ty_ - 1) / ty_; \ + static constexpr int unroll_co = co_; \ + static constexpr int unroll_n = (n_ + tx_ - 1) / tx_; \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef UnrollConfig \ + UnrollConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef DiffTileCount \ + DiffTileCount; \ + typedef FilterTileCount \ + FilterTileCount; \ + kern = local_share_bwd_data_device_template_f32< \ + true, UnrollConfig, ThreadConfig>; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.hi * param.wi; \ + launch_config.nr_blocks_y = \ + DIVUP(param.n, DiffTileCount::tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.ci, FilterTileCount::tile_ci); \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + (DiffTileCount::smem_tot + FilterTileCount::smem_tot); \ + } \ + } \ + } +#define CHK2(n_, ci_) \ + CHK3(n_, ci_, 4, 8, 16) \ + CHK3(n_, ci_, 8, 8, 16) +#define CHK2_(n_, ci_) \ + CHK3(n_, ci_, 4, 8, 8) \ + CHK3(n_, ci_, 8, 8, 8) +#define CHK(n_) \ + CHK2_(n_, 1) \ + CHK2_(n_, 8) CHK2_(n_, 16) CHK2_(n_, 32) CHK2_(n_, 64) CHK2(n_, 128) + CHK(1) + CHK(8); + CHK(16); + CHK(32); + CHK(64); +#undef CHK +#undef CHK2 +#undef CHK2_ +#undef CHK3 +#define CHK3(n_, ci_, co_, tx_, ty_) \ + if (param.n % n_ == 0) { \ + if (param.ci % ci_ == 0) { \ + if (param.co % co_ == 0) { \ + static constexpr int unroll_ci = (ci_) / (ty_); \ + static constexpr int unroll_co = co_; \ + static constexpr int unroll_n = (n_) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef UnrollConfig \ + 
UnrollConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef DiffTileCount \ + DiffTileCount; \ + typedef FilterTileCount \ + FilterTileCount; \ + kern = local_share_bwd_data_device_template_f32< \ + false, UnrollConfig, ThreadConfig>; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = param.hi * param.wi; \ + launch_config.nr_blocks_y = \ + DIVUP(param.n, DiffTileCount::tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.ci, FilterTileCount::tile_ci); \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + (DiffTileCount::smem_tot + FilterTileCount::smem_tot); \ + } \ + } \ + } +#define CHK2(n_, ci_) CHK3(n_, ci_, 4, 8, 8) CHK3(n_, ci_, 8, 8, 8) CHK3(n_, ci_, 16, 8, 8) +#define CHK(n_) \ + CHK2(n_, 8) \ + CHK2(n_, 16) \ + CHK2(n_, 32) CHK2(n_, 64) CHK3(n_, 128, 4, 8, 16) CHK3(n_, 128, 8, 8, 16) CHK3(n_, 128, 16, 8, 16) + CHK(8); + CHK(16); + CHK(32); + CHK(64); +#undef CHK +#undef CHK2 +#undef CHK3 + megdnn_assert(kern != nullptr, + "no usable kernel implementation for local share " + "backward data (batch,co,ci)=(%d,%d,%d)", + param.n, param.co, param.ci); + return kern; +} +} // namespace + +void megdnn::cuda::local_share_bwd_data::_do_local_share_bwd_data_implicit_gemm( + const float* d_filter, const float* d_diff, float* d_grad, + float* workspace, int fh, int fw, int sh, int sw, const Param& param, + cublasHandle_t cublas_handle, cudaStream_t stream, float* one, + float* zero) { + int ho = param.grp_ho * param.sgh, wo = param.grp_wo * param.sgw; + size_t nr_grad_total = param.n * param.ci * param.hi * param.wi; + float* ws_grad = workspace; + float* ws_diff = workspace + nr_grad_total; + // tensor reformat from (n, c, h, w) -> (c, h, w, n) + { + int m = param.n, n = param.co * ho * wo; + int lda, ldb; + lda = ldb = param.co * ho * wo; + int ldc = param.n; + cublas_check(cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, + one, d_diff, lda, zero, d_diff, ldb, ws_diff, + ldc)); + } + + { + void (*kern)(const float* __restrict__, const float* __restrict__, + float* __restrict__, Param, int, int, int, int); + LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + _check_launch_config(launch_config); + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + kern<<>>( + d_filter, ws_diff, ws_grad, param, fh, fw, sh, sw); + after_kernel_launch(); + } + + // tensor reformat form (c, h, w, n) -> (n, c, h, w) + { + int m = param.ci * param.hi * param.wi, n = param.n; + int lda, ldb; + lda = ldb = param.n; + int ldc = param.ci * param.hi * param.wi; + cublas_check(cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, + one, ws_grad, lda, zero, ws_grad, ldb, d_grad, + ldc)); + } +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/backward_filter/algo.cpp b/dnn/src/cuda/local_share/backward_filter/algo.cpp new file mode 100644 index 00000000..0513aeee --- /dev/null +++ b/dnn/src/cuda/local_share/backward_filter/algo.cpp @@ -0,0 +1,55 @@ +/** + * \file dnn/src/cuda/local_share/backward_filter/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * 
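// A stripped-down CUDA sketch, not from the commit, of the software-pipelining
// pattern the *Global2ShareMemVisitor helpers in the kernel above implement:
// first_copy() fills shared memory with tile 0, then each iteration prefetches the
// next tile into registers (copy()), consumes the tile currently in shared memory,
// and only afterwards publishes the registers (commit()) between two __syncthreads(),
// so global-memory latency overlaps with the math. Assumes blockDim.x == TILE and a
// trivial "sum the tile" computation for brevity.
template <int TILE>
__global__ void pipelined_tile_sum(const float* __restrict__ in,
                                   float* __restrict__ out, int nr_tiles) {
    __shared__ float smem[TILE];
    const int tid = threadIdx.x;
    float staging;                                   // register staging, like reg[][]
    float acc = 0.f;
    smem[tid] = in[tid];                             // first_copy(): tile 0 -> smem
    __syncthreads();
    for (int t = 0; t < nr_tiles; ++t) {
        const bool last = (t + 1 == nr_tiles);
        if (!last)
            staging = in[(t + 1) * TILE + tid];      // copy(): prefetch next tile
        acc += smem[tid];                            // consume current tile from smem
        if (!last) {
            __syncthreads();
            smem[tid] = staging;                     // commit(): publish prefetched tile
            __syncthreads();
        }
    }
    out[blockIdx.x * blockDim.x + tid] = acc;
}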
Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +LocalShareBackwardFilterImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&implicit_gemm); + all_algos.push_back(&batched_matmul); +} + +LocalShareBackwardFilterImpl::AlgoPack LocalShareBackwardFilterImpl::sm_algo_pack; + +LocalShareBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( + LocalShareBackwardFilterImpl* o, const TensorLayout& src, + const TensorLayout& diff, const TensorLayout& grad) + : opr{o}, src_layout{src}, diff_layout{diff}, grad_layout{grad} {} + +LocalShareBackwardFilterImpl::AlgoBase::ExecArgs::ExecArgs(LocalShareBackwardFilterImpl* opr, + _megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) + : SizeArgs(opr, src.layout, diff.layout, grad.layout), + src_tensor{&src}, + diff_tensor{&diff}, + grad_tensor{&grad}, + workspace{workspace} {} + +std::string LocalShareBackwardFilterImpl::AlgoBase::SizeArgs::to_string() + const { + auto&& param = opr->param(); + MEGDNN_MARK_USED_VAR(param); + return megdnn_mangle(ssprintf( + "src=%s, diff=%s, grad=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s->%s", + src_layout.to_string().c_str(), diff_layout.to_string().c_str(), + grad_layout.to_string().c_str(), param.pad_h, param.pad_w, + param.stride_h, param.stride_w, param.dilate_h, param.dilate_w, + static_cast(param.mode), src_layout.dtype.name(), + diff_layout.dtype.name(), grad_layout.dtype.name())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_filter/algo.h b/dnn/src/cuda/local_share/backward_filter/algo.h new file mode 100644 index 00000000..634f1203 --- /dev/null +++ b/dnn/src/cuda/local_share/backward_filter/algo.h @@ -0,0 +1,108 @@ +/** + * \file dnn/src/cuda/local_share/backward_filter/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
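// A host-side restatement (illustrative helper, derived from the pointer arithmetic
// in local_share_bwd_data_f32_implicit_gemm.cu above) of the dense local-share filter
// layout (sgh, sgw, ci, fh, fw, co): output pixel (oh, ow) reads its weights from
// spatial group (oh / grp_ho, ow / grp_wo).
#include <cstddef>
inline size_t dense_filter_offset(size_t oh, size_t ow, size_t ci_idx, size_t kh,
                                  size_t kw, size_t co_idx, size_t grp_ho,
                                  size_t grp_wo, size_t sgw, size_t ci, size_t co,
                                  size_t fh, size_t fw) {
    size_t sgh_idx = oh / grp_ho, sgw_idx = ow / grp_wo;
    size_t nr_elems_per_filter_grp = ci * co * fh * fw;
    return (sgh_idx * sgw + sgw_idx) * nr_elems_per_filter_grp  // spatial group
           + ci_idx * fh * fw * co                              // input channel
           + (kh * fw + kw) * co                                // kernel spatial
           + co_idx;                                            // output channel
}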
+ */ + +#pragma once + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +#include "src/cuda/handle.h" +#include "src/cuda/local_share/opr_impl.h" + +namespace megdnn { +namespace cuda { + +class LocalShareBackwardFilterImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + LocalShareBackwardFilterImpl* opr; + TensorLayout src_layout, diff_layout, grad_layout; + + std::string to_string() const; + SizeArgs(LocalShareBackwardFilterImpl* opr, const TensorLayout& src, + const TensorLayout& diff, const TensorLayout& grad); + }; + struct ExecArgs : public SizeArgs { + const TensorND *src_tensor, *diff_tensor, *grad_tensor; + Workspace workspace; + + ExecArgs(LocalShareBackwardFilterImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "local share conv fwd algo %s: required workspace %zu " + "bytes, got %zu", + name(), req, workspace.size); + return *this; + } +}; + +class LocalShareBackwardFilterImpl::AlgoImplicitGemm final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { return "LOCAL_SHARE_IMPLICIT_GEMM"; } +}; + +class LocalShareBackwardFilterImpl::AlgoBatchedMatMul final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { return "LOCAL_SHARE_BATCHED_MATMUL"; } +}; + +class LocalShareBackwardFilterImpl::AlgoPack { + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + + AlgoImplicitGemm implicit_gemm; + AlgoBatchedMatMul batched_matmul; + + std::vector all_algos; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp b/dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp new file mode 100644 index 00000000..ed73d039 --- /dev/null +++ b/dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp @@ -0,0 +1,147 @@ +/** + * \file dnn/src/cuda/local_share/backward_filter/batched_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
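// An illustrative sketch, not part of the commit, of how an opr-level exec() would
// glue the AlgoBase interface above together: wrap the tensors into ExecArgs, let the
// chosen algorithm verify the caller-provided workspace, then run it. The free
// function name is hypothetical; the real entry point lives in the opr implementation.
static void run_bwd_filter_with(LocalShareBackwardFilterImpl* opr,
                                LocalShareBackwardFilterImpl::AlgoBase* algo,
                                _megdnn_tensor_in src, _megdnn_tensor_in diff,
                                _megdnn_tensor_out grad,
                                _megdnn_workspace workspace) {
    LocalShareBackwardFilterImpl::AlgoBase::ExecArgs args(opr, src, diff, grad,
                                                          workspace);
    // check_workspace() asserts workspace.size >= get_workspace_in_bytes(args)
    algo->check_workspace(args, workspace).exec(args);
}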
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./algo.h" +#include "src/cuda/local_share/im2col.cuh" +#include "src/cuda/local_share/opr_impl.h" + +#include +#include "src/common/utils.h" + +using namespace megdnn; +using namespace cuda; + +bool LocalShareBackwardFilterImpl::AlgoBatchedMatMul::is_available( + const SizeArgs& args) const { + using Param = LocalShare::Param; + using Format = Param::Format; + using Mode = Param::Mode; + auto&& param = args.opr->param(); + auto format = param.format; + auto mode = param.mode; + bool available = true; + // format must be nchw + available &= (format == Format::NCHW); + // mode must be cross correlation + available &= (mode == Mode::CROSS_CORRELATION); + auto src_dtype = args.src_layout.dtype, diff_dtype = args.diff_layout.dtype, + grad_dtype = args.grad_layout.dtype; + // only support float32 + available &= (src_dtype == diff_dtype && src_dtype == grad_dtype && + src_dtype == dtype::Float32()); + // do not support dilate conv + size_t dh = param.dilate_h, dw = param.dilate_w; + available &= (dh == 1 && dw == 1); + return available; +} + +WorkspaceBundle +LocalShareBackwardFilterImpl::AlgoBatchedMatMul::get_workspace_bundle( + dt_byte* raw_ptr, const SizeArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.grad_layout, + args.diff_layout, param); + using Param = LocalShare::Param; + using Sparse = Param::Sparse; + size_t groups = 1; + if (param.sparse == Sparse::GROUP) { + groups = args.grad_layout.shape[0]; + } + size_t icpg = ci / groups, ocpg = co / groups; + size_t ws_im2col = + n * ci * ho * wo * fh * fw * args.src_layout.dtype.size(); + size_t ws_pretranspose = n * co * ho * wo * args.diff_layout.dtype.size(); + auto&& matmul_opr = args.opr->handle()->create_operator(); + matmul_opr->param().transposeA = true; + matmul_opr->param().transposeB = true; + TensorLayout A{ + {groups * sgh * sgw, ho / sgh * wo / sgw * n, icpg * fh * fw}, + dtype::Float32()}; + TensorLayout B{{groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, + dtype::Float32()}; + TensorLayout C{{groups * sgh * sgw, icpg * fh * fw, ocpg}, + dtype::Float32()}; + size_t ws_matmul = matmul_opr->get_workspace_in_bytes(A, B, C); + WorkspaceBundle ws{raw_ptr, {ws_im2col, ws_pretranspose, ws_matmul}}; + return ws; +} + +size_t LocalShareBackwardFilterImpl::AlgoBatchedMatMul::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void LocalShareBackwardFilterImpl::AlgoBatchedMatMul::exec( + const ExecArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.grad_layout, + args.diff_layout, param); + using Param = LocalShare::Param; + using Sparse = Param::Sparse; + size_t groups = 1; + if (param.sparse == Sparse::GROUP) { + groups = args.grad_layout.shape[0]; + } + size_t icpg = ci / groups, ocpg = co / groups; + local_share::Param kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ph = ph, + kern_param.pw = pw, kern_param.grp_ho = ho / sgh, + kern_param.grp_wo = wo / sgw, kern_param.sgh = sgh, kern_param.sgw = sgw; + + auto ws = get_workspace_bundle(args.workspace.raw_ptr, args); + auto ws_im2col = ws.get(0); + auto ws_pretranspose = ws.get(1); + auto 
ws_matmul = ws.get(2); + auto&& stream = cuda_stream(args.opr->handle()); + local_share::_do_local_share_im2col( + args.src_tensor->ptr(), + reinterpret_cast(ws_im2col), fh, fw, sh, sw, groups, + kern_param, stream); + + { + TensorLayout B1{{groups, sgh, sgw, ocpg, n, ho / sgh, wo / sgw}, + dtype::Float32()}; + B1.stride[0] = wo * ho * ocpg; + B1.stride[1] = wo * ho / sgh; + B1.stride[2] = wo / sgw; + B1.stride[3] = ho * wo; + B1.stride[4] = co * ho * wo; + B1.stride[5] = wo; + B1.stride[6] = 1; + TensorND ts_B1{args.diff_tensor->raw_ptr, B1}; + TensorLayout B2{{groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, + dtype::Float32()}; + B2.init_contiguous_stride(); + TensorND ts_B2{ws_pretranspose, B2}; + auto&& relayout_opr = args.opr->handle()->create_operator(); + relayout_opr->exec(ts_B1, ts_B2); + } + + auto&& matmul_opr = args.opr->handle()->create_operator(); + matmul_opr->param().transposeA = true; + matmul_opr->param().transposeB = true; + TensorLayout A{ + {groups * sgh * sgw, ho / sgh * wo / sgw * n, icpg * fh * fw}, + dtype::Float32()}; + TensorLayout B{{groups * sgh * sgw, ocpg, ho / sgh * wo / sgw * n}, + dtype::Float32()}; + TensorLayout C{{groups * sgh * sgw, icpg * fh * fw, ocpg}, + dtype::Float32()}; + TensorND ts_A{ws_im2col, A}; + TensorND ts_B{ws_pretranspose, B}; + TensorND ts_C{args.grad_tensor->raw_ptr, C}; + Workspace ws_wrapper; + ws_wrapper.raw_ptr = reinterpret_cast(ws_matmul); + ws_wrapper.size = ws.get_size(2); + matmul_opr->exec(ts_A, ts_B, ts_C, ws_wrapper); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_filter/implicit_gemm.cpp b/dnn/src/cuda/local_share/backward_filter/implicit_gemm.cpp new file mode 100644 index 00000000..eca5aead --- /dev/null +++ b/dnn/src/cuda/local_share/backward_filter/implicit_gemm.cpp @@ -0,0 +1,90 @@ +/** + * \file dnn/src/cuda/local_share/backward_filter/implicit_gemm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
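// Shape bookkeeping for the backward-filter batched matmul above, using the same
// sample sizes as the backward-data example earlier (groups = 1, sgh = sgw = 2,
// icpg = 16, ocpg = 64, fh = fw = 3, n = 32, grp_ho = grp_wo = 12). With
// transposeA = transposeB = true, each spatial group g computes
//     grad[g] = A[g]^T * B[g]^T,            (icpg*fh*fw, ocpg) = (144, 64)
// from
//     A[g]^T : (icpg*fh*fw, grp_ho*grp_wo*n) = (144, 4608)   // im2col'ed src
//     B[g]^T : (grp_ho*grp_wo*n, ocpg)       = (4608, 64)    // relayouted diff
// so every filter-gradient element reduces over the 4608 (output position, batch)
// pairs owned by its spatial group.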
+ */ +#include "./algo.h" +#include "./local_share_bwd_filter.cuh" +#include "src/cuda/local_share/opr_impl.h" + +#include +#include "src/common/utils.h" + +using namespace megdnn; +using namespace cuda; + +bool LocalShareBackwardFilterImpl::AlgoImplicitGemm::is_available( + const SizeArgs& args) const { + using Param = LocalShare::Param; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + auto&& param = args.opr->param(); + auto format = param.format; + auto sparse = param.sparse; + auto mode = param.mode; + bool available = true; + // format must be nchw + available &= (format == Format::NCHW); + // only support dense conv + available &= (sparse == Sparse::DENSE); + // mode must be cross correlation + available &= (mode == Mode::CROSS_CORRELATION); + unpack_local_share_params(args.src_layout, args.grad_layout, + args.diff_layout, param); + available &= (ho % sgh == 0 && wo % sgw == 0); + // not support dilated convolution + available &= (dh == 1 && dw == 1); + available &= (n % 4 == 0); + auto src_dtype = args.src_layout.dtype, diff_dtype = args.diff_layout.dtype, + grad_dtype = args.grad_layout.dtype; + // only support float32 + available &= (src_dtype == diff_dtype && src_dtype == grad_dtype && + src_dtype == dtype::Float32()); + // only support sm_60 or later + available &= is_compute_capability_required(6, 0); + + return available; +} + +size_t LocalShareBackwardFilterImpl::AlgoImplicitGemm::get_workspace_in_bytes( + const SizeArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.grad_layout, + args.diff_layout, param); + size_t ws_size_src = n * ci * hi * wi * args.grad_layout.dtype.size(); + size_t ws_size_diff = n * co * ho * wo * args.diff_layout.dtype.size(); + return ws_size_src + ws_size_diff; +} + +void LocalShareBackwardFilterImpl::AlgoImplicitGemm::exec( + const ExecArgs& args) const { + local_share::Param kern_param; + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.grad_layout, + args.diff_layout, param); + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ph = ph, + kern_param.pw = pw, kern_param.grp_ho = ho / sgh, + kern_param.grp_wo = wo / sgw, kern_param.sgh = sgh, kern_param.sgw = sgw; + auto&& handle = concrete_handle(args.opr->handle()); + auto&& cublas_hdl = cublas_handle(args.opr->handle()); + auto&& stream = cuda_stream(args.opr->handle()); + + auto one = handle->one_device(); + auto zero = handle->zero_device(); + + local_share_bwd_filter::_do_local_share_bwd_filter_implicit_gemm( + args.src_tensor->ptr(), + args.diff_tensor->ptr(), + args.grad_tensor->ptr(), + reinterpret_cast(args.workspace.raw_ptr), fh, fw, sh, sw, + kern_param, cublas_hdl, stream, one, zero); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter.cuh b/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter.cuh new file mode 100644 index 00000000..7fac515d --- /dev/null +++ b/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter.cuh @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
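// A self-contained sketch (host pointer mode, the name nchw_to_chwn is illustrative)
// of the cublasSgeam trick the *_implicit_gemm.cu wrappers in this directory use to
// turn an (n, c*h*w) row-major tensor into (c*h*w, n): C = alpha*op(A) + beta*op(B)
// with alpha = 1, beta = 0 and both operands transposed. The real code keeps alpha
// and beta in device memory (handle->one_device() / zero_device()) and wraps the call
// in cublas_check().
#include <cublas_v2.h>
static cublasStatus_t nchw_to_chwn(cublasHandle_t handle, const float* d_src,
                                   float* d_dst, int batch, int chw) {
    const float alpha = 1.f, beta = 0.f;  // assumes CUBLAS_POINTER_MODE_HOST
    // column-major view: op(A) = A^T is (batch x chw) and is written with ldc = batch,
    // so element (k, b) lands at d_dst[k * batch + b], i.e. batch becomes innermost.
    return cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, batch, chw, &alpha, d_src,
                       chw, &beta, d_src, chw, d_dst, batch);
}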
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/local_share/helper.cuh" + +namespace megdnn { +namespace cuda { +namespace local_share_bwd_filter { + +void _do_local_share_bwd_filter_implicit_gemm( + const float* d_src, const float* d_diff, float* d_grad, + float* workspace, int fh, int fw, int sh, int sw, + const local_share::Param& param, cublasHandle_t cublas_handle, + cudaStream_t stream, float* one, float* zero); + +} // namespace local_share_bwd_filter +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter_f32_implicit_gemm.cu b/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter_f32_implicit_gemm.cu new file mode 100644 index 00000000..bd872ce0 --- /dev/null +++ b/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter_f32_implicit_gemm.cu @@ -0,0 +1,526 @@ +/** + * \file dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter_f32_implicit_gemm.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./local_share_bwd_filter.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace local_share; + +namespace { +template +struct UnrollConfig { + static int const unroll_ci = unroll_ci_; + static int const unroll_co = unroll_co_; + static int const unroll_n = unroll_n_; +}; + +template +struct ThreadConfig { + static int const nr_thread_x = thread_x; + static int const nr_thread_y = thread_y; + static int const nr_threads = nr_thread_x * nr_thread_y; +}; + +template +struct DiffTileCount { + static int const tile_batch = UnrollConfig::unroll_n; + static int const tile_co = + UnrollConfig::unroll_co * ThreadConfig::nr_thread_x; + + static int const load_x = tile_batch > 32 ? 32 : tile_batch; + static int const load_y = ThreadConfig::nr_threads / load_x; + + static int const smem_h = tile_co; + static int const smem_w = tile_batch; + static int const smem_stride = smem_w % 2 == 0 ? smem_w + 1 : smem_w; + static int const smem_tot = smem_h * smem_stride; + + static int const reg_row = (smem_h + load_y - 1) / load_y; + static int const reg_col = (smem_w + load_x - 1) / load_x; + static bool const check_bounds_h = smem_h % load_y != 0; + static bool const check_bounds_w = smem_w % load_x != 0; +}; + +template +struct DataTileCount { + static int const tile_batch = UnrollConfig::unroll_n; + static int const tile_ci = + ThreadConfig::nr_thread_y * UnrollConfig::unroll_ci; + + static int const load_x = tile_batch > 32 ? 32 : tile_batch; + static int const load_y = ThreadConfig::nr_threads / load_x; + + static int const smem_h = tile_ci; + static int const smem_w = tile_batch; + static int const smem_stride = smem_w % 2 == 0 ? 
smem_w + 1 : smem_w; + static int const smem_tot = smem_h * smem_stride; + + static int const reg_row = (smem_h + load_y - 1) / load_y; + static int const reg_col = (smem_w + load_x - 1) / load_x; + static bool const check_bounds_h = smem_h % load_y != 0; + static bool const check_bounds_w = smem_w % load_x != 0; +}; + +template +struct Global2ShareMemVisitor { + typedef float copy_t; + float* smem; + const copy_t* g_ptr; + int stride; + int remain; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * blockDim.x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_row][TileCount::reg_col]; + + __device__ Global2ShareMemVisitor(copy_t* smem, int stride, int remain) + : smem{smem}, stride{stride}, remain{remain} {} + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_bounds_w && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (h_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr(h_idx, w_idx)) = val; + } else { + *(sh_ptr(h_idx, w_idx)) = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_bounds_w && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (h_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (TileCount::check_bounds_h && h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_bounds_w && w_idx >= TileCount::smem_w) + continue; + *(sh_ptr(h_idx, w_idx)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ float* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += TileCount::tile_batch; + } +}; + +template +__device__ __forceinline__ void consume_block( + Global2ShareMemVisitor>& + src_gl2sh_visitor, + Global2ShareMemVisitor>& + diff_gl2sh_visitor, + float r_src[UnrollConfig::unroll_ci], + float r_diff[UnrollConfig::unroll_co], + float r_grad[UnrollConfig::unroll_ci][UnrollConfig::unroll_co]) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + +#pragma unroll + for (int b_inner = 0; b_inner < UnrollConfig::unroll_n; ++b_inner) { +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ci; ++i) { + r_src[i] = *(src_gl2sh_visitor.sh_ptr( + tidy + i * ThreadConfig::nr_thread_y, b_inner)); + } +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_co; ++j) { + r_diff[j] = *(diff_gl2sh_visitor.sh_ptr( 
+ tidx + j * ThreadConfig::nr_thread_x, b_inner)); + } +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ci; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_co; ++j) { + r_grad[i][j] += r_src[i] * r_diff[j]; + } + } + } +} + +template +__global__ void local_share_bwd_filter_device_template_f32( + const float* __restrict__ src, const float* __restrict__ diff, + float* __restrict__ grad, Param param, int fh, int fw, int sh, int sw) { + typedef DiffTileCount DiffTileCount; + typedef DataTileCount DataTileCount; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + const int filter_sizes = fh * fw; + const int sp_grp_idx = bidx / filter_sizes; + const int kern_spatial = bidx - sp_grp_idx * filter_sizes; + const int sgh_idx = sp_grp_idx / param.sgw; + const int sgw_idx = sp_grp_idx - sgh_idx * param.sgw; + const int kh = kern_spatial / fw; + const int kw = kern_spatial - kh * fw; + + const int b_co = bidy * DiffTileCount::tile_co; + const int b_ci = bidz * DataTileCount::tile_ci; + + const int t_co = tidx + b_co; + const int t_ci = tidy + b_ci; + + const int ho = param.sgh * param.grp_ho; + const int wo = param.sgw * param.grp_wo; + + extern __shared__ float smem[]; + float* sh_src = smem; + float* sh_diff = smem + DataTileCount::smem_tot; + + const float* __restrict__ g_ptr_src = + src + b_ci * param.hi * param.wi * param.n; // input channel stride + const float* __restrict__ g_ptr_diff = diff + b_co * ho * wo * param.n; + float* __restrict__ g_ptr_grad = + grad + + sp_grp_idx * filter_sizes * param.co * + param.ci // spatial group stride + + t_ci * filter_sizes * param.co // input channel stride + + kern_spatial * param.co // kernel spatial stride + + t_co; // output channel stride + + Global2ShareMemVisitor src_gl2sh_visitor{ + sh_src, param.hi * param.wi * param.n, param.ci - b_ci}; + Global2ShareMemVisitor diff_gl2sh_visitor{ + sh_diff, ho * wo * param.n, param.co - b_co}; + + float r_src[UnrollConfig::unroll_ci]; + float r_diff[UnrollConfig::unroll_co]; + float r_grad[UnrollConfig::unroll_ci][UnrollConfig::unroll_co]; + +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ci; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_co; ++j) { + r_grad[i][j] = 0.f; + } + } + + int sp_grp_h_start = sgh_idx * param.grp_ho; + int sp_grp_h_end = sgh_idx * param.grp_ho + param.grp_ho - 1; + int sp_grp_w_start = sgw_idx * param.grp_wo; + int sp_grp_w_end = sgw_idx * param.grp_wo + param.grp_wo - 1; + int height_start = (param.ph - kh + sh - 1) / sh; + height_start = + sp_grp_h_start >= height_start ? sp_grp_h_start : height_start; + int width_start = (param.pw - kw + sw - 1) / sw; + width_start = sp_grp_w_start >= width_start ? sp_grp_w_start : width_start; + int height_end = (param.hi - 1 + param.ph - kh) / sh; + height_end = sp_grp_h_end <= height_end ? sp_grp_h_end : height_end; + int width_end = (param.wi - 1 + param.pw - kw) / sw; + width_end = sp_grp_w_end <= width_end ? 
sp_grp_w_end : width_end; + + const int b_blks = + (param.n + UnrollConfig::unroll_n - 1) / UnrollConfig::unroll_n; + + int ih_idx = height_start * sh - param.ph + kh; + int iw_idx = width_start * sw - param.pw + kw; + src_gl2sh_visitor.g_ptr = + g_ptr_src + (ih_idx * param.wi + iw_idx) * param.n; + diff_gl2sh_visitor.g_ptr = + g_ptr_diff + (height_start * wo + width_start) * param.n; + + if (height_start <= height_end && width_start <= width_end) { + src_gl2sh_visitor.first_copy(); + diff_gl2sh_visitor.first_copy(); + __syncthreads(); + } + + for (int h = height_start; h <= height_end; ++h) { + for (int w = width_start; w <= width_end; ++w) { + for (int b_outer = 0; b_outer < b_blks; b_outer++) { + if (b_outer == b_blks - 1) { + // not last tile + if (!(h == height_end && w == width_end)) { + int w_next = w == width_end ? width_start : w + 1; + int h_next = w == width_end ? h + 1 : h; + + int ih_idx = h_next * sh - param.ph + kh; + int iw_idx = w_next * sw - param.pw + kw; + + src_gl2sh_visitor.g_ptr = + g_ptr_src + + (ih_idx * param.wi + iw_idx) * param.n; + diff_gl2sh_visitor.g_ptr = + g_ptr_diff + (h_next * wo + w_next) * param.n; + src_gl2sh_visitor.copy(); + diff_gl2sh_visitor.copy(); + } + } else { + src_gl2sh_visitor.move_forward(); + diff_gl2sh_visitor.move_forward(); + src_gl2sh_visitor.copy(); + diff_gl2sh_visitor.copy(); + } + + consume_block( + src_gl2sh_visitor, diff_gl2sh_visitor, r_src, r_diff, + r_grad); + + // last tile + if (!(h == height_end && w == width_end && + b_outer == b_blks - 1)) { + __syncthreads(); + src_gl2sh_visitor.commit(); + diff_gl2sh_visitor.commit(); + __syncthreads(); + } + } + } + } + + const int ci_stride = fh * fw * param.co; + // store +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ci; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_co; ++j) { + if (check_bounds && + (t_co + j * ThreadConfig::nr_thread_x >= param.co || + t_ci + i * ThreadConfig::nr_thread_y >= param.ci)) { + } else { + g_ptr_grad[j * ThreadConfig::nr_thread_x + + i * ThreadConfig::nr_thread_y * ci_stride] = + r_grad[i][j]; + } + } + } +} + +void (*get_kern(const Param& param, const int filter_sizes, + LaunchConfig& launch_config))(const float* __restrict__, + const float* __restrict__, + float* __restrict__, Param, int, + int, int, int) { + void (*kern)(const float* __restrict__, const float* __restrict__, + float* __restrict__, Param, int, int, int, int); + kern = nullptr; +#define CHK3(ci_, co_, n_, tx_, ty_) \ + if (param.ci >= ci_) { \ + if (param.co >= co_) { \ + if (param.n % n_ == 0) { \ + static constexpr int unroll_ci = (ci_ + ty_ - 1) / ty_; \ + static constexpr int unroll_co = (co_ + tx_ - 1) / tx_; \ + static constexpr int unroll_n = n_; \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef UnrollConfig \ + UnrollConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef DataTileCount \ + DataTileCount; \ + typedef DiffTileCount \ + DiffTileCount; \ + kern = local_share_bwd_filter_device_template_f32< \ + true, UnrollConfig, ThreadConfig>; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.sgh * param.sgw * filter_sizes; \ + launch_config.nr_blocks_y = \ + DIVUP(param.co, DiffTileCount::tile_co); \ + launch_config.nr_blocks_z = \ + DIVUP(param.ci, DataTileCount::tile_ci); \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + (DataTileCount::smem_tot + DiffTileCount::smem_tot); \ + 
} \ + } \ + } +#define CHK2(ci_, co_) \ + CHK3(ci_, co_, 4, 16, 8) \ + CHK3(ci_, co_, 8, 16, 8) +#define CHK2_(ci_, co_) \ + CHK3(ci_, co_, 4, 8, 8) \ + CHK3(ci_, co_, 8, 8, 8) +#define CHK(ci_) \ + CHK2_(ci_, 1) \ + CHK2_(ci_, 8) CHK2_(ci_, 16) CHK2_(ci_, 32) CHK2_(ci_, 64) CHK2(ci_, 128) + CHK(1) + CHK(8); + CHK(16); + CHK(32); + CHK(64); + CHK(128); +#undef CHK +#undef CHK2 +#undef CHK2_ +#undef CHK3 +#define CHK3(ci_, co_, n_, tx_, ty_) \ + if (param.ci % ci_ == 0) { \ + if (param.co % co_ == 0) { \ + if (param.n % n_ == 0) { \ + static constexpr int unroll_ci = (ci_) / (ty_); \ + static constexpr int unroll_co = (co_) / (tx_); \ + static constexpr int unroll_n = n_; \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef UnrollConfig \ + UnrollConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef DataTileCount \ + DataTileCount; \ + typedef DiffTileCount \ + DiffTileCount; \ + kern = local_share_bwd_filter_device_template_f32< \ + false, UnrollConfig, ThreadConfig>; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.sgh * param.sgw * filter_sizes; \ + launch_config.nr_blocks_y = \ + DIVUP(param.co, DiffTileCount::tile_co); \ + launch_config.nr_blocks_z = \ + DIVUP(param.ci, DataTileCount::tile_ci); \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + (DataTileCount::smem_tot + DiffTileCount::smem_tot); \ + } \ + } \ + } +#define CHK2(ci_, co_) \ + CHK3(ci_, co_, 4, 8, 8) CHK3(ci_, co_, 8, 8, 8) +#define CHK(ci_) \ + CHK2(ci_, 8) \ + CHK2(ci_, 16) \ + CHK2(ci_, 32) \ + CHK2(ci_, 64) \ + CHK3(ci_, 128, 4, 16, 8) CHK3(ci_, 128, 8, 16, 8) + CHK(8); + CHK(16); + CHK(32); + CHK(64); + CHK(128); +#undef CHK +#undef CHK2 +#undef CHK3 + megdnn_assert(kern != nullptr, + "no usable kernel implementation for local share " + "backward data (batch,co,ci)=(%d,%d,%d)", + param.n, param.co, param.ci); + return kern; +} +} // namespace + +void megdnn::cuda::local_share_bwd_filter:: + _do_local_share_bwd_filter_implicit_gemm( + const float* d_src, const float* d_diff, float* d_grad, + float* workspace, int fh, int fw, int sh, int sw, + const Param& param, cublasHandle_t cublas_handle, + cudaStream_t stream, float* one, float* zero) { + int ho = param.grp_ho * param.sgh, wo = param.grp_wo * param.sgw; + size_t nr_src_total = param.n * param.ci * param.hi * param.wi; + float* ws_src = workspace; + float* ws_diff = workspace + nr_src_total; + // tensor reformat from (n, c, h, w) -> (c, h, w, n) + { + int m = param.n, n = param.ci * param.hi * param.wi; + int lda, ldb; + lda = ldb = param.ci * param.hi * param.wi; + int ldc = param.n; + cublas_check(cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, + one, d_src, lda, zero, d_src, ldb, ws_src, + ldc)); + } + + { + int m = param.n, n = param.co * ho * wo; + int lda, ldb; + lda = ldb = param.co * ho * wo; + int ldc = param.n; + cublas_check(cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, + one, d_diff, lda, zero, d_diff, ldb, ws_diff, + ldc)); + } + + { + int filter_sizes = fh * fw; + void (*kern)(const float* __restrict__, const float* __restrict__, + float* __restrict__, Param, int, int, int, int); + LaunchConfig launch_config; + kern = get_kern(param, filter_sizes, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + 
nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + _check_launch_config(launch_config); + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + kern<<>>( + ws_src, ws_diff, d_grad, param, fh, fw, sh, sw); + after_kernel_launch(); + } +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/forward/algo.cpp b/dnn/src/cuda/local_share/forward/algo.cpp new file mode 100644 index 00000000..67c13eb7 --- /dev/null +++ b/dnn/src/cuda/local_share/forward/algo.cpp @@ -0,0 +1,56 @@ +/** + * \file dnn/src/cuda/local_share/forward/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +LocalShareForwardImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&batch_size_aware_chwn_small_image); + all_algos.push_back(&batch_size_aware_chwn); + all_algos.push_back(&batched_matmul); +} + +LocalShareForwardImpl::AlgoPack LocalShareForwardImpl::sm_algo_pack; + +LocalShareForwardImpl::AlgoBase::SizeArgs::SizeArgs(LocalShareForwardImpl* o, + const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) + : opr{o}, src_layout{src}, filter_layout{filter}, dst_layout{dst} {} + +LocalShareForwardImpl::AlgoBase::ExecArgs::ExecArgs(LocalShareForwardImpl* opr, + _megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) + : SizeArgs(opr, src.layout, filter.layout, dst.layout), + src_tensor{&src}, + filter_tensor{&filter}, + dst_tensor{&dst}, + workspace{workspace} {} + +std::string LocalShareForwardImpl::AlgoBase::SizeArgs::to_string() const { + auto&& param = opr->param(); + MEGDNN_MARK_USED_VAR(param); + return megdnn_mangle(ssprintf( + "src=%s, filter=%s, dst=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", + src_layout.to_string().c_str(), filter_layout.to_string().c_str(), + dst_layout.to_string().c_str(), param.pad_h, param.pad_w, + param.stride_h, param.stride_w, param.dilate_h, param.dilate_w, + static_cast(param.mode), src_layout.dtype.name(), + dst_layout.dtype.name())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/forward/algo.h b/dnn/src/cuda/local_share/forward/algo.h new file mode 100644 index 00000000..b41ec58d --- /dev/null +++ b/dnn/src/cuda/local_share/forward/algo.h @@ -0,0 +1,129 @@ +/** + * \file dnn/src/cuda/local_share/forward/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +#include "src/cuda/handle.h" +#include "src/cuda/local_share/opr_impl.h" + +namespace megdnn { +namespace cuda { + +class LocalShareForwardImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + LocalShareForwardImpl* opr; + TensorLayout src_layout, filter_layout, dst_layout; + + std::string to_string() const; + SizeArgs(LocalShareForwardImpl* opr, const TensorLayout& src, + const TensorLayout& filter, const TensorLayout& dst); + }; + struct ExecArgs : public SizeArgs { + const TensorND *src_tensor, *filter_tensor, *dst_tensor; + Workspace workspace; + + ExecArgs(LocalShareForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in filter, _megdnn_tensor_out dst, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "local share conv fwd algo %s: required workspace %zu " + "bytes, got %zu", + name(), req, workspace.size); + return *this; + } +}; + +class LocalShareForwardImpl::AlgoCHWNBatchSizeAware final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { + return "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE"; + } +}; + +class LocalShareForwardImpl::AlgoCHWNBatchSizeAwareSmallImage final + : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { + return "LOCAL_SHARE_CHWN_BATCH_SIZE_AWARE_SMALL_IMAGE"; + } +}; + +class LocalShareForwardImpl::AlgoBatchedMatMul final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return true; } + + const char* name() const override { return "LOCAL_SHARE_BATCHED_MATMUL"; } +}; + +class LocalShareForwardImpl::AlgoPack { + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + + AlgoCHWNBatchSizeAware batch_size_aware_chwn; + AlgoCHWNBatchSizeAwareSmallImage batch_size_aware_chwn_small_image; + AlgoBatchedMatMul batched_matmul; + + std::vector all_algos; 
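+    //! Selection sketch (illustrative only, not part of this header): a caller
+    //! might scan all_algos in order and keep the first algorithm whose
+    //! is_available_wk() accepts the arguments under a workspace limit, e.g.
+    //!     for (AlgoBase* algo : all_algos)
+    //!         if (algo->is_available_wk(args, workspace_limit))
+    //!             return algo;
+    //! here `args` (a SizeArgs) and `workspace_limit` come from the caller.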
+}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/forward/batch_size_aware_chwn.cpp b/dnn/src/cuda/local_share/forward/batch_size_aware_chwn.cpp new file mode 100644 index 00000000..f4620e72 --- /dev/null +++ b/dnn/src/cuda/local_share/forward/batch_size_aware_chwn.cpp @@ -0,0 +1,104 @@ +/** + * \file dnn/src/cuda/local_share/forward/batch_size_aware_chwn.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./algo.h" +#include "./local_share_forward.cuh" +#include "src/cuda/local_share/opr_impl.h" + +#include +#include "src/common/utils.h" + +using namespace megdnn; +using namespace cuda; + +bool LocalShareForwardImpl::AlgoCHWNBatchSizeAware::is_available( + const SizeArgs& args) const { + using Param = LocalShare::Param; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + auto&& param = args.opr->param(); + auto format = param.format; + auto sparse = param.sparse; + auto mode = param.mode; + bool available = true; + // format must be nchw + available &= (format == Format::NCHW); + // only support dense conv + available &= (sparse == Sparse::DENSE); + // mode must be cross correlation + available &= (mode == Mode::CROSS_CORRELATION); + unpack_local_share_params(args.src_layout, args.filter_layout, + args.dst_layout, param); + available &= (ho % sgh == 0 && wo % sgw == 0); + // not support dilated convolution + available &= (dh == 1 && dw == 1); + available &= (n % 32 == 0); + // kernel size should be 3, 5, 7 + available &= (fh == 1 && fw == 1) || (fh == 3 && fw == 3) || + (fh == 5 && fw == 5) || (fh == 7 || fw == 7); + // stride should be 1 or 2 + available &= (sh == sw && (sh == 1 || sh == 2)); + available &= (ci % 4 == 0) || (fh == 3 && ci % 2 == 0); + auto src_dtype = args.src_layout.dtype, + filter_dtype = args.filter_layout.dtype, + dst_dtype = args.dst_layout.dtype; + // only support float32 + available &= (src_dtype == filter_dtype && src_dtype == dst_dtype && + src_dtype == dtype::Float32()); + // only support sm_60 or later + available &= is_compute_capability_required(6, 0); + + return available; +} + +WorkspaceBundle +LocalShareForwardImpl::AlgoCHWNBatchSizeAware::get_workspace_bundle( + dt_byte* raw_ptr, const SizeArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.filter_layout, + args.dst_layout, param); + size_t ws_size_src = n * ci * hi * wi * args.src_layout.dtype.size(); + size_t ws_size_dst = n * co * ho * wo * args.dst_layout.dtype.size(); + WorkspaceBundle ws{raw_ptr, {ws_size_src, ws_size_dst}}; + return ws; +} + +size_t LocalShareForwardImpl::AlgoCHWNBatchSizeAware::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void LocalShareForwardImpl::AlgoCHWNBatchSizeAware::exec( + const ExecArgs& args) const { + local_share::Param kern_param; + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.filter_layout, + args.dst_layout, param); + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ph = ph, + kern_param.pw = 
pw, kern_param.grp_ho = ho / sgh, + kern_param.grp_wo = wo / sgw, kern_param.sgh = sgh, kern_param.sgw = sgw; + auto&& handle = concrete_handle(args.opr->handle()); + auto&& cublas_hdl = cublas_handle(args.opr->handle()); + auto&& stream = cuda_stream(args.opr->handle()); + + auto one = handle->one_device(); + auto zero = handle->zero_device(); + + local_share::_do_local_share_convolution_large_batch_size( + args.src_tensor->ptr(), + args.filter_tensor->ptr(), + args.dst_tensor->ptr(), + reinterpret_cast(args.workspace.raw_ptr), fh, fw, sh, sw, + kern_param, cublas_hdl, stream, one, zero); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/forward/batch_size_aware_chwn_small_image.cpp b/dnn/src/cuda/local_share/forward/batch_size_aware_chwn_small_image.cpp new file mode 100644 index 00000000..632539ba --- /dev/null +++ b/dnn/src/cuda/local_share/forward/batch_size_aware_chwn_small_image.cpp @@ -0,0 +1,99 @@ +/** + * \file dnn/src/cuda/local_share/forward/batch_size_aware_chwn_small_image.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./algo.h" +#include "./local_share_forward.cuh" +#include "src/cuda/local_share/opr_impl.h" + +#include +#include "src/common/utils.h" + +using namespace megdnn; +using namespace cuda; + +bool LocalShareForwardImpl::AlgoCHWNBatchSizeAwareSmallImage::is_available( + const SizeArgs& args) const { + using Param = LocalShare::Param; + using Format = Param::Format; + using Sparse = Param::Sparse; + using Mode = Param::Mode; + auto&& param = args.opr->param(); + auto format = param.format; + auto sparse = param.sparse; + auto mode = param.mode; + bool available = true; + // format must be nchw + available &= (format == Format::NCHW); + // only support dense conv + available &= (sparse == Sparse::DENSE); + // mode must be cross correlation + available &= (mode == Mode::CROSS_CORRELATION); + unpack_local_share_params(args.src_layout, args.filter_layout, + args.dst_layout, param); + available &= (ho % sgh == 0 && wo % sgw == 0); + // not support dilated convolution + available &= (dh == 1 && dw == 1); + available &= (ci % 4 == 0); + auto src_dtype = args.src_layout.dtype, + filter_dtype = args.filter_layout.dtype, + dst_dtype = args.dst_layout.dtype; + // only support float32 + available &= (src_dtype == filter_dtype && src_dtype == dst_dtype && + src_dtype == dtype::Float32()); + // only support sm_60 or later + available &= is_compute_capability_required(6, 0); + + return available; +} + +WorkspaceBundle +LocalShareForwardImpl::AlgoCHWNBatchSizeAwareSmallImage::get_workspace_bundle( + dt_byte* raw_ptr, const SizeArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.filter_layout, + args.dst_layout, param); + size_t ws_size_src = n * ci * hi * wi * args.src_layout.dtype.size(); + size_t ws_size_dst = n * co * ho * wo * args.dst_layout.dtype.size(); + WorkspaceBundle ws{raw_ptr, {ws_size_src, ws_size_dst}}; + return ws; +} + +size_t +LocalShareForwardImpl::AlgoCHWNBatchSizeAwareSmallImage::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void 
LocalShareForwardImpl::AlgoCHWNBatchSizeAwareSmallImage::exec( + const ExecArgs& args) const { + local_share::Param kern_param; + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.filter_layout, + args.dst_layout, param); + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ph = ph, + kern_param.pw = pw, kern_param.grp_ho = ho / sgh, + kern_param.grp_wo = wo / sgw, kern_param.sgh = sgh, kern_param.sgw = sgw; + auto&& handle = concrete_handle(args.opr->handle()); + auto&& cublas_hdl = cublas_handle(args.opr->handle()); + auto&& stream = cuda_stream(args.opr->handle()); + + auto one = handle->one_device(); + auto zero = handle->zero_device(); + + local_share::_do_local_share_convolution_large_batch_size_small_image( + args.src_tensor->ptr(), + args.filter_tensor->ptr(), + args.dst_tensor->ptr(), + reinterpret_cast(args.workspace.raw_ptr), fh, fw, sh, sw, + kern_param, cublas_hdl, stream, one, zero); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/forward/batched_matmul.cpp b/dnn/src/cuda/local_share/forward/batched_matmul.cpp new file mode 100644 index 00000000..af08ec39 --- /dev/null +++ b/dnn/src/cuda/local_share/forward/batched_matmul.cpp @@ -0,0 +1,133 @@ +/** + * \file dnn/src/cuda/local_share/forward/batched_matmul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./algo.h" +#include "src/cuda/local_share/im2col.cuh" +#include "src/cuda/local_share/opr_impl.h" + +using namespace megdnn; +using namespace cuda; + +bool LocalShareForwardImpl::AlgoBatchedMatMul::is_available( + const SizeArgs& args) const { + bool available = true; + auto&& param = args.opr->param(); + using Param = LocalShare::Param; + using Format = Param::Format; + // NCHW format + available &= param.format == Format::NCHW; + // only support float + auto src_dtype = args.src_layout.dtype, + filter_dtype = args.filter_layout.dtype, + dst_dtype = args.dst_layout.dtype; + available &= (src_dtype == filter_dtype) && (src_dtype == dst_dtype) && + (src_dtype == dtype::Float32()); + // do not support dilate conv + size_t dh = param.dilate_h, dw = param.dilate_w; + available &= (dh == 1 && dw == 1); + return available; +} + +WorkspaceBundle LocalShareForwardImpl::AlgoBatchedMatMul::get_workspace_bundle( + dt_byte* raw_ptr, const SizeArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.filter_layout, + args.dst_layout, param); + using Param = LocalShare::Param; + using Sparse = Param::Sparse; + size_t groups = 1; + if (param.sparse == Sparse::GROUP) { + groups = args.filter_layout.shape[0]; + } + size_t icpg = ci / groups, ocpg = co / groups; + size_t ws_im2col = + n * ci * ho * wo * fh * fw * args.src_layout.dtype.size(); + size_t ws_posttranspose = n * co * ho * wo * args.dst_layout.dtype.size(); + auto&& matmul_opr = args.opr->handle()->create_operator(); + TensorLayout A{ + {groups * sgh * sgw, ho / sgh * wo / sgw * n, icpg * fh * fw}, + dtype::Float32()}; + TensorLayout B{{groups * sgh * sgw, icpg * fh * fw, ocpg}, + dtype::Float32()}; + TensorLayout C{{groups * sgh * sgw, ho / sgh * wo / sgw * n, ocpg}, + dtype::Float32()}; + 
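+    // Shape sketch of the batched GEMM this algorithm lowers to, with one batch
+    // per (group, spatial group) pair:
+    //   A: (ho/sgh * wo/sgw * n) x (icpg * fh * fw)   im2col'ed input patches
+    //   B: (icpg * fh * fw)      x  ocpg              per-group filters
+    //   C: (ho/sgh * wo/sgw * n) x  ocpg              output, relayouted back
+    //                                                 to NCHW in exec()
+    // The layouts are built here only to query the matmul workspace size.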
size_t ws_matmul = matmul_opr->get_workspace_in_bytes(A, B, C); + WorkspaceBundle ws{raw_ptr, {ws_im2col, ws_matmul, ws_posttranspose}}; + return ws; +} + +size_t LocalShareForwardImpl::AlgoBatchedMatMul::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void LocalShareForwardImpl::AlgoBatchedMatMul::exec( + const ExecArgs& args) const { + auto&& param = args.opr->param(); + unpack_local_share_params(args.src_layout, args.filter_layout, + args.dst_layout, param); + using Param = LocalShare::Param; + using Sparse = Param::Sparse; + size_t groups = 1; + if (param.sparse == Sparse::GROUP) { + groups = args.filter_layout.shape[0]; + } + size_t icpg = ci / groups, ocpg = co / groups; + local_share::Param kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ph = ph, + kern_param.pw = pw, kern_param.grp_ho = ho / sgh, + kern_param.grp_wo = wo / sgw, kern_param.sgh = sgh, kern_param.sgw = sgw; + + auto ws = get_workspace_bundle(args.workspace.raw_ptr, args); + auto ws_im2col = ws.get(0); + auto ws_matmul = ws.get(1); + auto ws_posttranspose = ws.get(2); + auto&& stream = cuda_stream(args.opr->handle()); + local_share::_do_local_share_im2col( + args.src_tensor->ptr(), + reinterpret_cast(ws_im2col), fh, fw, sh, sw, groups, + kern_param, stream); + + auto&& matmul_opr = args.opr->handle()->create_operator(); + TensorLayout A{ + {groups * sgh * sgw, ho / sgh * wo / sgw * n, icpg * fh * fw}, + dtype::Float32()}; + TensorLayout B{{groups * sgh * sgw, icpg * fh * fw, ocpg}, + dtype::Float32()}; + TensorLayout C{{groups * sgh * sgw, ho / sgh * wo / sgw * n, ocpg}, + dtype::Float32()}; + TensorND ts_A{ws_im2col, A}; + TensorND ts_B{args.filter_tensor->raw_ptr, B}; + TensorND ts_C{ws_posttranspose, C}; + Workspace ws_wrapper; + ws_wrapper.raw_ptr = reinterpret_cast(ws_matmul); + ws_wrapper.size = ws.get_size(1); + matmul_opr->exec(ts_A, ts_B, ts_C, ws_wrapper); + + { + TensorLayout C1{{n, groups, ocpg, sgh, ho / sgh, sgw, wo / sgw}, + dtype::Float32()}; + C1.stride[0] = ho / sgh * wo / sgw * ocpg; + C1.stride[1] = n * ho * wo * ocpg; + C1.stride[2] = 1; + C1.stride[3] = n * ho / sgh * wo * ocpg; + C1.stride[4] = wo / sgw * ocpg; + C1.stride[5] = n * ho / sgh * wo / sgw * ocpg; + C1.stride[6] = ocpg; + TensorLayout C2 = args.dst_layout; + TensorND ts_C1{ws_posttranspose, C1}; + TensorND ts_C2{args.dst_tensor->raw_ptr, C2}; + auto&& relayout_opr = args.opr->handle()->create_operator(); + relayout_opr->exec(ts_C1, ts_C2); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/forward/local_share_forward.cuh b/dnn/src/cuda/local_share/forward/local_share_forward.cuh new file mode 100644 index 00000000..5beec848 --- /dev/null +++ b/dnn/src/cuda/local_share/forward/local_share_forward.cuh @@ -0,0 +1,33 @@ +/** + * \file dnn/src/cuda/local_share/forward/local_share_forward.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/local_share/helper.cuh" + +namespace megdnn { +namespace cuda { +namespace local_share { + +void _do_local_share_convolution_large_batch_size( + const float* d_src, const float* d_filter, float* d_dst, + float* workspace, int fh, int fw, int sh, int sw, const Param& param, + cublasHandle_t cublas_handle, cudaStream_t stream, float* one, + float* zero); + +void _do_local_share_convolution_large_batch_size_small_image( + const float* d_src, const float* d_filter, float* d_dst, + float* workspace, int fh, int fw, int sh, int sw, const Param& param, + cublasHandle_t cublas_handle, cudaStream_t stream, float* one, + float* zero); + +} // namespace local_share +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware.cu b/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware.cu new file mode 100644 index 00000000..041765c8 --- /dev/null +++ b/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware.cu @@ -0,0 +1,1308 @@ +/** + * \file dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./local_share_forward.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace local_share; + +namespace { +template +struct UnrollConfig { + static int const unroll_co = unroll_co_; + static int const unroll_ci = unroll_ci_; + static int const unroll_wo = unroll_wo_; +}; + +template +struct ThreadConfig { + static int const nr_thread_x = thread_x; + static int const nr_thread_y = thread_y; +}; + +template +struct DataTileCount { + static int const tile_hi = LocalShareConfig::fh; + static int const tile_wi = UnrollConfig::unroll_wo * LocalShareConfig::sw + + LocalShareConfig::fw - LocalShareConfig::sw; + static int const tile_hw = tile_hi * tile_wi; + static int const tile_chw = UnrollConfig::unroll_ci * tile_hi * tile_wi; + static int const reg_gl2sh = (tile_chw + ThreadConfig::nr_thread_y - 1) / + ThreadConfig::nr_thread_y; + static int const smem_h = tile_chw; + static int const smem_w = ThreadConfig::nr_thread_x; + static int const smem_stride = smem_w; + static int const smem_tot = smem_h * smem_stride; +}; + +template +struct FilterTileCount { + static int const tile_co = + ThreadConfig::nr_thread_y * UnrollConfig::unroll_co; + static int const tile_ci = UnrollConfig::unroll_ci; + static int const smem_h = + tile_ci * LocalShareConfig::fh * LocalShareConfig::fw; + static int const smem_w = tile_co; + static int const smem_stride = smem_w + 1; + static int const smem_tot = smem_h * smem_stride; + + MEGDNN_STATIC_ASSERT(smem_w % ThreadConfig::nr_thread_x == 0, + "col of share memory must be divided by nr_thread_x"); + static int const reg_h = (smem_h + ThreadConfig::nr_thread_y - 1) / + ThreadConfig::nr_thread_y; + static int const reg_w = smem_w / ThreadConfig::nr_thread_x; +}; + +template +struct DataGlobal2ShareMemVisitor { + typedef float copy_t; + typedef DataTileCount + DataTileCount; + float* smem; + const float* g_ptr; + int c_stride; + int h_stride; + int w_stride; + int h1, h2; + int w1, w2; + const int tid_x = 
threadIdx.x; + const int tid_y = threadIdx.y; + + copy_t reg[DataTileCount::reg_gl2sh]; + + __device__ DataGlobal2ShareMemVisitor(float* smem, const float* g_ptr, + int c_stride, int h_stride, + int w_stride, int h1, int h2, int w1, + int w2) + : smem{smem}, + g_ptr{g_ptr}, + c_stride{c_stride}, + h_stride{h_stride}, + w_stride{w_stride}, + h1{h1}, + h2{h2}, + w1{w1}, + w2{w2} {}; + + __device__ __forceinline__ void first_copy() { + int chw = tid_y; +#pragma unroll + for (int i = 0; i < DataTileCount::reg_gl2sh; ++i) { + if (chw < DataTileCount::tile_chw) { + int ic = chw / DataTileCount::tile_hw; + int hw = chw - ic * DataTileCount::tile_hw; + int ih = hw / DataTileCount::tile_wi; + int iw = hw - ih * DataTileCount::tile_wi; + copy_t val = 0.f; + if (ih >= h1 && ih < h2 && iw >= w1 && iw < w2) { + val = g_ptr[ic * c_stride + ih * h_stride + iw * w_stride]; + } + *(sh_ptr(chw, tid_x)) = val; + } + chw += ThreadConfig::nr_thread_y; + } + } + + __device__ __forceinline__ void copy() { + int chw = tid_y; +#pragma unroll + for (int i = 0; i < DataTileCount::reg_gl2sh; ++i) { + if (chw < DataTileCount::tile_chw) { + int ic = chw / DataTileCount::tile_hw; + int hw = chw - ic * DataTileCount::tile_hw; + int ih = hw / DataTileCount::tile_wi; + int iw = hw - ih * DataTileCount::tile_wi; + copy_t val = 0.f; + if (ih >= h1 && ih < h2 && iw >= w1 && iw < w2) { + val = g_ptr[ic * c_stride + ih * h_stride + iw * w_stride]; + } + reg[i] = val; + } + chw += ThreadConfig::nr_thread_y; + } + } + + __device__ __forceinline__ void commit() { + int chw = tid_y; +#pragma unroll + for (int i = 0; i < DataTileCount::reg_gl2sh; ++i) { + if (chw < DataTileCount::tile_chw) + *(sh_ptr(chw, tid_x)) = reg[i]; + chw += ThreadConfig::nr_thread_y; + } + }; + + __device__ __forceinline__ float* sh_ptr(int y, int x) { + return &smem[y * DataTileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += UnrollConfig::unroll_ci * c_stride; + }; +}; + +template +struct FilterGlobal2ShareMemVisitor { + typedef float copy_t; + typedef FilterTileCount + FilterTileCount; + float* smem; + const float* g_ptr; + int remain; + int stride; + const int tid_x = threadIdx.x; + const int tid_y = threadIdx.y; + + copy_t reg[FilterTileCount::reg_h][FilterTileCount::reg_w]; + + __device__ FilterGlobal2ShareMemVisitor(float* smem, const float* g_ptr, + int remain, int stride) + : smem{smem}, g_ptr{g_ptr}, remain{remain}, stride{stride} {}; + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < FilterTileCount::reg_h; ++i) { + int h_idx = tid_y + i * ThreadConfig::nr_thread_y; +#pragma unroll + for (int j = 0; j < FilterTileCount::reg_w; ++j) { + int w_idx = tid_x + j * ThreadConfig::nr_thread_x; + if (h_idx < FilterTileCount::smem_h) { + float val = 0.f; + if (w_idx < remain) + val = g_ptr[h_idx * stride + w_idx]; + *(sh_ptr(h_idx, w_idx)) = val; + } + } + } + } + + __device__ __forceinline__ void copy() { + // TODO: co bound check +#pragma unroll + for (int i = 0; i < FilterTileCount::reg_h; ++i) { + int h_idx = tid_y + i * ThreadConfig::nr_thread_y; +#pragma unroll + for (int j = 0; j < FilterTileCount::reg_w; ++j) { + int w_idx = tid_x + j * ThreadConfig::nr_thread_x; + if (h_idx < FilterTileCount::smem_h) { + float val = 0.f; + if (w_idx < remain) + val = g_ptr[h_idx * stride + w_idx]; + reg[i][j] = val; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < FilterTileCount::reg_h; ++i) { + int h_idx = tid_y + i * 
ThreadConfig::nr_thread_y; + +#pragma unroll + for (int j = 0; j < FilterTileCount::reg_w; ++j) { + int w_idx = tid_x + j * ThreadConfig::nr_thread_x; + if (h_idx < FilterTileCount::smem_h) + *(sh_ptr(h_idx, w_idx)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ float* sh_ptr(int y, int x) { + return &smem[y * FilterTileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += UnrollConfig::unroll_ci * LocalShareConfig::fh * + LocalShareConfig::fw * stride; + } +}; + +template +__device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor& src_gl2sh_visitor, + FilterGlobal2ShareMemVisitor& filter_gl2sh_visitor, + float r_src[DataTileCount::tile_wi], + float r_filter[UnrollConfig::unroll_co][LocalShareConfig::fw], + float r_acc[UnrollConfig::unroll_co][UnrollConfig::unroll_wo]) { + typedef DataTileCount + DataTileCount; + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + for (int ci_inner = 0; ci_inner < UnrollConfig::unroll_ci; ++ci_inner) { + int sh_flt_row_base = + ci_inner * LocalShareConfig::fh * LocalShareConfig::fw; + int sh_flt_col_base = tidy * UnrollConfig::unroll_co; + int sh_src_row_base = ci_inner * DataTileCount::tile_hw; +#pragma unroll + for (int kh = 0; kh < LocalShareConfig::fh; ++kh) { +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_co; ++i) { +#pragma unroll + for (int j = 0; j < LocalShareConfig::fw; ++j) { + r_filter[i][j] = *(filter_gl2sh_visitor.sh_ptr( + sh_flt_row_base + kh * LocalShareConfig::fw + j, + sh_flt_col_base + i)); + } + } +#pragma unroll + for (int i = 0; i < DataTileCount::tile_wi; ++i) { + int sh_src_row = kh * DataTileCount::tile_wi + i; + r_src[i] = *(src_gl2sh_visitor.sh_ptr( + sh_src_row_base + sh_src_row, tidx)); + } +#pragma unroll + for (int kw = 0; kw < LocalShareConfig::fw; ++kw) { +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_co; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_wo; ++j) { + r_acc[i][j] += r_src[j * LocalShareConfig::sw + kw] * + r_filter[i][kw]; + } + } + } + } + } +} + +/* + * Src tensor format is (c, h, w, n), filter tensor format is (sgh, sgw, co, ci, + * fh, fw), and dst tensor format (c, h, w, n). Thread block size is (32, BY). + * Each thread compute 1 x UnrollConfig::unroll_wo entries + * of one slice with height ho and width wo of the output tensor. Each block + * compute 32 batches and BY x UnrollConfig::unroll_co output channels. 
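+ * For example, with fh = fw = 3, sh = sw = 1, unroll_co = 4, unroll_ci = 1,
+ * unroll_wo = 4 and a 32x8 thread block (one of the configurations instantiated
+ * by get_kern below), the data tile is tile_hi x tile_wi = 3 x (4*1 + 3 - 1) =
+ * 3 x 6 input pixels per unrolled input channel, kept for 32 batch lanes, and
+ * the filter tile covers 8 x 4 = 32 output channels; together the two tiles
+ * occupy (18 * 32 + 9 * 33) * sizeof(float) = 3492 bytes of shared memory.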
+ */ +template +__global__ void local_share_device_template_f32( + const float* __restrict__ src, const float* __restrict__ filter, + float* __restrict__ dst, Param param) { + typedef DataTileCount + DataTileCount; + typedef FilterTileCount + FilterTileCount; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + const int blks_per_grp_wo = (param.grp_wo + UnrollConfig::unroll_wo - 1) / + UnrollConfig::unroll_wo; + const int b_co = bidy / param.grp_ho; + const int b_grp_ho = bidy - b_co * param.grp_ho; + const int b_n = bidx / blks_per_grp_wo; + const int b_grp_wo = bidx - b_n * blks_per_grp_wo; + + const int b_sgh = bidz / param.sgw; + const int b_sgw = bidz - b_sgh * param.sgw; + + const int b_ho = b_sgh * param.grp_ho + b_grp_ho; + const int b_wo = b_sgw * param.grp_wo + b_grp_wo * UnrollConfig::unroll_wo; + + const int b_hi = b_ho * LocalShareConfig::sh - param.ph; + const int b_wi = b_wo * LocalShareConfig::sw - param.pw; + + const int ho = param.sgh * param.grp_ho; + const int wo = param.sgw * param.grp_wo; + const int t_co = + b_co * FilterTileCount::tile_co + tidy * UnrollConfig::unroll_co; + + const float* __restrict__ g_ptr_src = + src + (b_hi * param.wi + b_wi) * param.n + + b_n * ThreadConfig::nr_thread_x + tidx; + const float* __restrict__ g_ptr_filter = + filter + + (b_sgh * param.sgw + b_sgw) * param.co * param.ci * + LocalShareConfig::fh * + LocalShareConfig::fw // spatial group + + b_co; // output channel + float* __restrict__ g_ptr_dst = dst + t_co * ho * wo * param.n + + (b_ho * wo + b_wo) * param.n + + b_n * ThreadConfig::nr_thread_x + tidx; + + extern __shared__ float smem[]; + + float* sh_src = smem; + float* sh_filter = smem + DataTileCount::smem_tot; + + // TODO check register + DataGlobal2ShareMemVisitor + src_gl2sh_visitor{sh_src, + g_ptr_src, + param.hi * param.wi * param.n, + param.wi * param.n, + param.n, + -b_hi, + param.hi - b_hi, + -b_wi, + param.wi - b_wi}; + FilterGlobal2ShareMemVisitor + filter_gl2sh_visitor{sh_filter, g_ptr_filter, param.co - b_co, + param.co}; + + float r_src[DataTileCount::tile_wi]; + float r_filter[UnrollConfig::unroll_co][LocalShareConfig::fw]; + float r_acc[UnrollConfig::unroll_co][UnrollConfig::unroll_wo]; + +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_co; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_wo; ++j) { + r_acc[i][j] = 0; + } + } + + src_gl2sh_visitor.first_copy(); + filter_gl2sh_visitor.first_copy(); + + __syncthreads(); + + int ci_blks = + (param.ci + UnrollConfig::unroll_ci - 1) / UnrollConfig::unroll_ci; + + for (int ci_outer = 0; ci_outer < ci_blks - 1; ci_outer++) { + src_gl2sh_visitor.move_forward(); + filter_gl2sh_visitor.move_forward(); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + + consume_block( + src_gl2sh_visitor, filter_gl2sh_visitor, r_src, r_filter, + r_acc); + + __syncthreads(); + src_gl2sh_visitor.commit(); + filter_gl2sh_visitor.commit(); + __syncthreads(); + } + + consume_block( + src_gl2sh_visitor, filter_gl2sh_visitor, r_src, r_filter, r_acc); + + const int co_stride = ho * wo * param.n; + const int t_grp_wo_base = b_grp_wo * UnrollConfig::unroll_wo; +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_co; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_wo; ++j) { + int g_co = t_co + i; + int t_grp_wo = t_grp_wo_base + j; + if (g_co < param.co && t_grp_wo < param.grp_wo) { + g_ptr_dst[i * co_stride + j * param.n] = 
r_acc[i][j]; + } + } + } +} + +void (*get_kern(int fh, int fw, int sh, int sw, const Param& param, + LaunchConfig& launch_config))(const float* __restrict__, + const float* __restrict__, + float* __restrict__, Param) { + void (*kern)(const float* __restrict__, const float* __restrict__, + float* __restrict__, Param); + kern = nullptr; + if (fh == 1 && fw == 1 && sh == 1 && sw == 1) { + static constexpr int fh_ = 1; + static constexpr int fw_ = 1; + static constexpr int sh_ = 1; + static constexpr int sw_ = 1; +#define CK_GRP_WO(_grp_wo) \ + if (param.grp_wo >= _grp_wo) { \ + static constexpr int unroll_co = 8; \ + static constexpr int unroll_ci = 4; \ + static constexpr int unroll_wo = _grp_wo; \ + static constexpr int nr_thread_x = 32; \ + static constexpr int nr_thread_y = 8; \ + typedef LocalShareConfig LocalShareConfig_; \ + typedef UnrollConfig UnrollConfig_; \ + typedef ThreadConfig ThreadConfig_; \ + kern = local_share_device_template_f32; \ + launch_config.nr_threads_x = nr_thread_x; \ + launch_config.nr_threads_y = nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + DIVUP(param.n, nr_thread_x) * DIVUP(param.grp_wo, unroll_wo); \ + launch_config.nr_blocks_y = \ + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; \ + launch_config.nr_blocks_z = param.sgh * param.sgw; \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + DataTileCount::smem_tot + \ + sizeof(float) * \ + FilterTileCount::smem_tot; \ + } + CK_GRP_WO(1); + CK_GRP_WO(2); + CK_GRP_WO(3); + CK_GRP_WO(4); +#undef CK_GRP_WO + } else if (fh == 1 && fw == 1 && sh == 2 && sw == 2) { + static constexpr int fh_ = 1; + static constexpr int fw_ = 1; + static constexpr int sh_ = 2; + static constexpr int sw_ = 2; +#define CK_GRP_WO(_grp_wo) \ + if (param.grp_wo >= _grp_wo) { \ + static constexpr int unroll_co = 8; \ + static constexpr int unroll_ci = 4; \ + static constexpr int unroll_wo = _grp_wo; \ + static constexpr int nr_thread_x = 32; \ + static constexpr int nr_thread_y = 8; \ + typedef LocalShareConfig LocalShareConfig_; \ + typedef UnrollConfig UnrollConfig_; \ + typedef ThreadConfig ThreadConfig_; \ + kern = local_share_device_template_f32; \ + launch_config.nr_threads_x = nr_thread_x; \ + launch_config.nr_threads_y = nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + DIVUP(param.n, nr_thread_x) * DIVUP(param.grp_wo, unroll_wo); \ + launch_config.nr_blocks_y = \ + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; \ + launch_config.nr_blocks_z = param.sgh * param.sgw; \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + DataTileCount::smem_tot + \ + sizeof(float) * \ + FilterTileCount::smem_tot; \ + } + CK_GRP_WO(1); + CK_GRP_WO(2); + CK_GRP_WO(3); + CK_GRP_WO(4); + CK_GRP_WO(5); + CK_GRP_WO(6); + CK_GRP_WO(7); + CK_GRP_WO(8); +#undef CK_GRP_WO + } else if (fh == 3 && fw == 3 && sh == 1 && sw == 1) { + static constexpr int fh_ = 3; + static constexpr int fw_ = 3; + static constexpr int sh_ = 1; + static constexpr int sw_ = 1; +#define CK_GRP_WO(_grp_wo) \ + if (param.grp_wo >= _grp_wo) { \ + static constexpr int unroll_co = 4; \ + static constexpr int unroll_ci = 1; \ + static constexpr int unroll_wo = _grp_wo; \ + static constexpr int nr_thread_x = 32; \ + static constexpr int nr_thread_y = 8; \ + typedef LocalShareConfig LocalShareConfig_; \ + typedef UnrollConfig UnrollConfig_; \ + typedef ThreadConfig ThreadConfig_; \ + kern = local_share_device_template_f32; \ + launch_config.nr_threads_x = nr_thread_x; \ + 
launch_config.nr_threads_y = nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + DIVUP(param.n, nr_thread_x) * DIVUP(param.grp_wo, unroll_wo); \ + launch_config.nr_blocks_y = \ + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; \ + launch_config.nr_blocks_z = param.sgh * param.sgw; \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + DataTileCount::smem_tot + \ + sizeof(float) * \ + FilterTileCount::smem_tot; \ + } + CK_GRP_WO(1); + CK_GRP_WO(2); + CK_GRP_WO(3); + CK_GRP_WO(4); + CK_GRP_WO(5); + CK_GRP_WO(6); + CK_GRP_WO(7); + CK_GRP_WO(8); +#undef CK_GRP_WO + } else if (fh == 3 && fw == 3 && sh == 2 && sw == 2) { + static constexpr int fh_ = 3; + static constexpr int fw_ = 3; + static constexpr int sh_ = 2; + static constexpr int sw_ = 2; +#define CK_GRP_WO(_grp_wo) \ + if (param.grp_wo >= _grp_wo) { \ + static constexpr int unroll_co = 8; \ + static constexpr int unroll_ci = 1; \ + static constexpr int unroll_wo = _grp_wo; \ + static constexpr int nr_thread_x = 32; \ + static constexpr int nr_thread_y = 4; \ + typedef LocalShareConfig LocalShareConfig_; \ + typedef UnrollConfig UnrollConfig_; \ + typedef ThreadConfig ThreadConfig_; \ + kern = local_share_device_template_f32; \ + launch_config.nr_threads_x = nr_thread_x; \ + launch_config.nr_threads_y = nr_thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + DIVUP(param.n, nr_thread_x) * DIVUP(param.grp_wo, unroll_wo); \ + launch_config.nr_blocks_y = \ + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; \ + launch_config.nr_blocks_z = param.sgh * param.sgw; \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + DataTileCount::smem_tot + \ + sizeof(float) * \ + FilterTileCount::smem_tot; \ + } + CK_GRP_WO(1); + CK_GRP_WO(2); + CK_GRP_WO(3); + CK_GRP_WO(4); + CK_GRP_WO(5); + CK_GRP_WO(6); + CK_GRP_WO(7); + CK_GRP_WO(8); +#undef CK_GRP_WO + //! 
TODO: tune performance for kern size = (5x5, and 7x7) + } else if (fh == 5 && fw == 5 && sh == 1 && sw == 1) { + static constexpr int fh_ = 5; + static constexpr int fw_ = 5; + static constexpr int sh_ = 1; + static constexpr int sw_ = 1; + if (param.grp_wo >= 8) { + static constexpr int unroll_co = 8; + static constexpr int unroll_ci = 2; + static constexpr int unroll_wo = 8; + static constexpr int nr_thread_x = 32; + static constexpr int nr_thread_y = 8; + typedef LocalShareConfig LocalShareConfig_; + typedef UnrollConfig UnrollConfig_; + typedef ThreadConfig ThreadConfig_; + kern = local_share_device_template_f32< + LocalShareConfig_, UnrollConfig_, ThreadConfig_>; + launch_config.nr_threads_x = nr_thread_x; + launch_config.nr_threads_y = nr_thread_y; + launch_config.nr_threads_z = 1; + launch_config.nr_blocks_x = DIVUP(param.n, nr_thread_x) * + DIVUP(param.grp_wo, unroll_wo); + launch_config.nr_blocks_y = + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; + launch_config.nr_blocks_z = param.sgh * param.sgw; + launch_config.smem_size_in_bytes = + sizeof(float) * + DataTileCount::smem_tot + + sizeof(float) * + FilterTileCount::smem_tot; + + } else if (param.grp_wo >= 4) { + static constexpr int unroll_co = 16; + static constexpr int unroll_ci = 2; + static constexpr int unroll_wo = 4; + static constexpr int nr_thread_x = 32; + static constexpr int nr_thread_y = 8; + typedef LocalShareConfig LocalShareConfig_; + typedef UnrollConfig UnrollConfig_; + typedef ThreadConfig ThreadConfig_; + kern = local_share_device_template_f32< + LocalShareConfig_, UnrollConfig_, ThreadConfig_>; + launch_config.nr_threads_x = nr_thread_x; + launch_config.nr_threads_y = nr_thread_y; + launch_config.nr_threads_z = 1; + launch_config.nr_blocks_x = DIVUP(param.n, nr_thread_x) * + DIVUP(param.grp_wo, unroll_wo); + launch_config.nr_blocks_y = + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; + launch_config.nr_blocks_z = param.sgh * param.sgw; + launch_config.smem_size_in_bytes = + sizeof(float) * + DataTileCount::smem_tot + + sizeof(float) * + FilterTileCount::smem_tot; + + } else { + static constexpr int unroll_co = 16; + static constexpr int unroll_ci = 2; + static constexpr int unroll_wo = 2; + static constexpr int nr_thread_x = 32; + static constexpr int nr_thread_y = 8; + typedef LocalShareConfig LocalShareConfig_; + typedef UnrollConfig UnrollConfig_; + typedef ThreadConfig ThreadConfig_; + kern = local_share_device_template_f32< + LocalShareConfig_, UnrollConfig_, ThreadConfig_>; + launch_config.nr_threads_x = nr_thread_x; + launch_config.nr_threads_y = nr_thread_y; + launch_config.nr_threads_z = 1; + launch_config.nr_blocks_x = DIVUP(param.n, nr_thread_x) * + DIVUP(param.grp_wo, unroll_wo); + launch_config.nr_blocks_y = + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; + launch_config.nr_blocks_z = param.sgh * param.sgw; + launch_config.smem_size_in_bytes = + sizeof(float) * + DataTileCount::smem_tot + + sizeof(float) * + FilterTileCount::smem_tot; + } + } else if (fh == 5 && fw == 5 && sh == 2 && sw == 2) { + static constexpr int fh_ = 5; + static constexpr int fw_ = 5; + static constexpr int sh_ = 2; + static constexpr int sw_ = 2; + if (param.grp_wo >= 4) { + static constexpr int unroll_co = 16; + static constexpr int unroll_ci = 2; + static constexpr int unroll_wo = 4; + static constexpr int nr_thread_x = 32; + static constexpr int nr_thread_y = 8; + typedef LocalShareConfig LocalShareConfig_; + typedef UnrollConfig UnrollConfig_; + typedef ThreadConfig ThreadConfig_; + kern 
= local_share_device_template_f32< + LocalShareConfig_, UnrollConfig_, ThreadConfig_>; + launch_config.nr_threads_x = nr_thread_x; + launch_config.nr_threads_y = nr_thread_y; + launch_config.nr_threads_z = 1; + launch_config.nr_blocks_x = DIVUP(param.n, nr_thread_x) * + DIVUP(param.grp_wo, unroll_wo); + launch_config.nr_blocks_y = + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; + launch_config.nr_blocks_z = param.sgh * param.sgw; + launch_config.smem_size_in_bytes = + sizeof(float) * + DataTileCount::smem_tot + + sizeof(float) * + FilterTileCount::smem_tot; + } else { + static constexpr int unroll_co = 16; + static constexpr int unroll_ci = 2; + static constexpr int unroll_wo = 2; + static constexpr int nr_thread_x = 32; + static constexpr int nr_thread_y = 8; + typedef LocalShareConfig LocalShareConfig_; + typedef UnrollConfig UnrollConfig_; + typedef ThreadConfig ThreadConfig_; + kern = local_share_device_template_f32< + LocalShareConfig_, UnrollConfig_, ThreadConfig_>; + launch_config.nr_threads_x = nr_thread_x; + launch_config.nr_threads_y = nr_thread_y; + launch_config.nr_threads_z = 1; + launch_config.nr_blocks_x = DIVUP(param.n, nr_thread_x) * + DIVUP(param.grp_wo, unroll_wo); + launch_config.nr_blocks_y = + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; + launch_config.nr_blocks_z = param.sgh * param.sgw; + launch_config.smem_size_in_bytes = + sizeof(float) * + DataTileCount::smem_tot + + sizeof(float) * + FilterTileCount::smem_tot; + } + } else if (fh == 7 && fw == 7 && sh == 1 && sw == 1) { + static constexpr int fh_ = 7; + static constexpr int fw_ = 7; + static constexpr int sh_ = 1; + static constexpr int sw_ = 1; + if (param.grp_wo >= 8) { + static constexpr int unroll_co = 8; + static constexpr int unroll_ci = 1; + static constexpr int unroll_wo = 8; + static constexpr int nr_thread_x = 32; + static constexpr int nr_thread_y = 8; + typedef LocalShareConfig LocalShareConfig_; + typedef UnrollConfig UnrollConfig_; + typedef ThreadConfig ThreadConfig_; + kern = local_share_device_template_f32< + LocalShareConfig_, UnrollConfig_, ThreadConfig_>; + launch_config.nr_threads_x = nr_thread_x; + launch_config.nr_threads_y = nr_thread_y; + launch_config.nr_threads_z = 1; + launch_config.nr_blocks_x = DIVUP(param.n, nr_thread_x) * + DIVUP(param.grp_wo, unroll_wo); + launch_config.nr_blocks_y = + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; + launch_config.nr_blocks_z = param.sgh * param.sgw; + launch_config.smem_size_in_bytes = + sizeof(float) * + DataTileCount::smem_tot + + sizeof(float) * + FilterTileCount::smem_tot; + + } else if (param.grp_wo >= 4) { + static constexpr int unroll_co = 16; + static constexpr int unroll_ci = 1; + static constexpr int unroll_wo = 4; + static constexpr int nr_thread_x = 32; + static constexpr int nr_thread_y = 8; + typedef LocalShareConfig LocalShareConfig_; + typedef UnrollConfig UnrollConfig_; + typedef ThreadConfig ThreadConfig_; + kern = local_share_device_template_f32< + LocalShareConfig_, UnrollConfig_, ThreadConfig_>; + launch_config.nr_threads_x = nr_thread_x; + launch_config.nr_threads_y = nr_thread_y; + launch_config.nr_threads_z = 1; + launch_config.nr_blocks_x = DIVUP(param.n, nr_thread_x) * + DIVUP(param.grp_wo, unroll_wo); + launch_config.nr_blocks_y = + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; + launch_config.nr_blocks_z = param.sgh * param.sgw; + launch_config.smem_size_in_bytes = + sizeof(float) * + DataTileCount::smem_tot + + sizeof(float) * + FilterTileCount::smem_tot; + + } else { + 
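+            // grp_wo < 4: fall back to the smallest output-width unroll
+            // (unroll_wo = 2)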
static constexpr int unroll_co = 16; + static constexpr int unroll_ci = 1; + static constexpr int unroll_wo = 2; + static constexpr int nr_thread_x = 32; + static constexpr int nr_thread_y = 8; + typedef LocalShareConfig LocalShareConfig_; + typedef UnrollConfig UnrollConfig_; + typedef ThreadConfig ThreadConfig_; + kern = local_share_device_template_f32< + LocalShareConfig_, UnrollConfig_, ThreadConfig_>; + launch_config.nr_threads_x = nr_thread_x; + launch_config.nr_threads_y = nr_thread_y; + launch_config.nr_threads_z = 1; + launch_config.nr_blocks_x = DIVUP(param.n, nr_thread_x) * + DIVUP(param.grp_wo, unroll_wo); + launch_config.nr_blocks_y = + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; + launch_config.nr_blocks_z = param.sgh * param.sgw; + launch_config.smem_size_in_bytes = + sizeof(float) * + DataTileCount::smem_tot + + sizeof(float) * + FilterTileCount::smem_tot; + } + } else if (fh == 7 && fw == 7 && sh == 2 && sw == 2) { + static constexpr int fh_ = 7; + static constexpr int fw_ = 7; + static constexpr int sh_ = 2; + static constexpr int sw_ = 2; + static constexpr int unroll_co = 16; + static constexpr int unroll_ci = 1; + static constexpr int unroll_wo = 2; + static constexpr int nr_thread_x = 32; + static constexpr int nr_thread_y = 8; + typedef LocalShareConfig LocalShareConfig_; + typedef UnrollConfig UnrollConfig_; + typedef ThreadConfig ThreadConfig_; + kern = local_share_device_template_f32; + launch_config.nr_threads_x = nr_thread_x; + launch_config.nr_threads_y = nr_thread_y; + launch_config.nr_threads_z = 1; + launch_config.nr_blocks_x = + DIVUP(param.n, nr_thread_x) * DIVUP(param.grp_wo, unroll_wo); + launch_config.nr_blocks_y = + DIVUP(param.co, nr_thread_y * unroll_co) * param.grp_ho; + launch_config.nr_blocks_z = param.sgh * param.sgw; + launch_config.smem_size_in_bytes = + sizeof(float) * DataTileCount::smem_tot + + sizeof(float) * + FilterTileCount::smem_tot; + } else { + megdnn_assert(false, + "no usable kernel implementation for local share " + "convolution (fh,fw)=(%d,%d), (sh,sw)=(%d,%d)", + fh, fw, sh, sw); + } + return kern; +} + +} // namespace + +//! 
this is a dummy kernel +#if 0 +namespace batch_size_aware { + +template +struct UnrollConfig { + static int const unroll_ho = unroll_ho_; + static int const unroll_wo = unroll_wo_; + static int const unroll_ci = unroll_ci_; +}; + +template +struct ThreadConfig { + static int const nr_thread_x = thread_x; + static int const nr_thread_y = thread_y; +}; + +template +struct DataTileCount { + static int const tile_hi = UnrollConfig::unroll_ho * LocalShareConfig::sh + + LocalShareConfig::fh - 1; + static int const tile_wi = UnrollConfig::unroll_wo * LocalShareConfig::sw + + LocalShareConfig::fw - 1; + static int const tile_hw = tile_hi * tile_wi; + static int const tile_chw = UnrollConfig::unroll_ci * tile_hi * tile_wi; + static int const reg_gl2sh = (tile_chw + ThreadConfig::nr_thread_y - 1) / + ThreadConfig::nr_thread_y; + static int const smem_h = tile_chw; + static int const smem_w = ThreadConfig::nr_thread_x; + static int const smem_stride = smem_w; + static int const smem_tot = smem_h * smem_stride; +}; + +template +struct FilterTileCount { + static int const tile_co = ThreadConfig::nr_thread_y; + static int const tile_ci = UnrollConfig::unroll_ci; + static int const smem_h = tile_co; + static int const smem_w = + tile_ci * LocalShareConfig::fh * LocalShareConfig::fw; + static int const smem_stride = smem_w; + static int const smem_tot = smem_h * smem_stride; + static int const reg_gl2sh = (smem_w + ThreadConfig::nr_thread_x - 1) / + ThreadConfig::nr_thread_x; +}; + +template +struct DataGlobal2ShareMemVisitor { + typedef float copy_t; + typedef DataTileCount + DataTileCount; + float* smem; + const float* g_ptr; + int c_stride; + int h_stride; + int w_stride; + int h1, h2; + int w1, w2; + const int tid_x = threadIdx.x; + const int tid_y = threadIdx.y; + + copy_t reg[DataTileCount::reg_gl2sh]; + + __device__ DataGlobal2ShareMemVisitor(float* smem, const float* g_ptr, + int c_stride, int h_stride, + int w_stride, int h1, int h2, int w1, + int w2) + : smem{smem}, + g_ptr{g_ptr}, + c_stride{c_stride}, + h_stride{h_stride}, + w_stride{w_stride}, + h1{h1}, + h2{h2}, + w1{w1}, + w2{w2} {}; + + __device__ __forceinline__ void first_copy() { + int chw = tid_y; +#pragma unroll + for (int i = 0; i < DataTileCount::reg_gl2sh; ++i) { + if (chw < DataTileCount::tile_chw) { + int ic = chw / DataTileCount::tile_hw; + int hw = chw - ic * DataTileCount::tile_hw; + int ih = hw / DataTileCount::tile_wi; + int iw = hw - ih * DataTileCount::tile_wi; + copy_t val = 0.f; + if (ih >= h1 && ih < h2 && iw >= w1 && iw < w2) { + val = g_ptr[ic * c_stride + ih * h_stride + iw * w_stride]; + } + *(sh_ptr(chw, tid_x)) = val; + } + chw += ThreadConfig::nr_thread_y; + } + } + + __device__ __forceinline__ void copy() { + int chw = tid_y; +#pragma unroll + for (int i = 0; i < DataTileCount::reg_gl2sh; ++i) { + if (chw < DataTileCount::tile_chw) { + int ic = chw / DataTileCount::tile_hw; + int hw = chw - ic * DataTileCount::tile_hw; + int ih = hw / DataTileCount::tile_wi; + int iw = hw - ih * DataTileCount::tile_wi; + copy_t val = 0.f; + if (ih >= h1 && ih < h2 && iw >= w1 && iw < w2) { + val = g_ptr[ic * c_stride + ih * h_stride + iw * w_stride]; + } + reg[i] = val; + } + chw += ThreadConfig::nr_thread_y; + } + } + + __device__ __forceinline__ void commit() { + int chw = tid_y; +#pragma unroll + for (int i = 0; i < DataTileCount::reg_gl2sh; ++i) { + if (chw < DataTileCount::tile_chw) + *(sh_ptr(chw, tid_x)) = reg[i]; + chw += ThreadConfig::nr_thread_y; + } + }; + + __device__ __forceinline__ float* sh_ptr(int y, int x) { + 
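+        // element (y, x) of the row-major shared-memory tile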
return &smem[y * DataTileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += UnrollConfig::unroll_ci * c_stride; + }; +}; + +template +struct FilterGlobal2ShareMemVisitor { + typedef float copy_t; + typedef FilterTileCount + FilterTileCount; + float* smem; + const float* g_ptr; + const int tid_x = threadIdx.x; + const int tid_y = threadIdx.y; + + copy_t reg[FilterTileCount::reg_gl2sh]; + + __device__ FilterGlobal2ShareMemVisitor(float* smem, const float* g_ptr) + : smem{smem}, g_ptr{g_ptr} {}; + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < FilterTileCount::reg_gl2sh; ++i) { + int idx = i * ThreadConfig::nr_thread_x; + if (idx < FilterTileCount::smem_w) + *(sh_ptr(tid_y, idx + tid_x)) = g_ptr[idx]; + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < FilterTileCount::reg_gl2sh; ++i) { + int idx = i * ThreadConfig::nr_thread_x; + if (idx < FilterTileCount::smem_w) + reg[i] = g_ptr[idx]; + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < FilterTileCount::reg_gl2sh; ++i) { + int idx = tid_x + i * ThreadConfig::nr_thread_x; + if (idx < FilterTileCount::smem_w) + *(sh_ptr(tid_y, idx)) = reg[i]; + } + } + + __device__ __forceinline__ float* sh_ptr(int y, int x) { + return &smem[y * FilterTileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += UnrollConfig::unroll_ci * LocalShareConfig::fh * + LocalShareConfig::fw; + } +}; + +/* + * Src tensor format is (c, h, w, n), filter tensor format is (sgh, sgw, co, ci, + * fh, fw), and dst tensor format (c, h, w, n). Thread block size is (32, BY). + * Each thread compute UnrollConfig::unroll_ho x UnrollConfig::unroll_wo entries + * of one slice with height ho and width wo of the output tensor. Each block + * compute 32 batches and BY output channels. 
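+ * (Disabled reference variant: unlike the enabled kernel above, which unrolls
+ * only along the output width, this one tiles unroll_ho x unroll_wo output
+ * entries per thread.)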
+ */ +template +__global__ void local_share_device_template_f32( + const float* __restrict__ src, const float* __restrict__ filter, + float* __restrict__ dst, Param param) { + typedef DataTileCount + DataTileCount; + typedef FilterTileCount + FilterTileCount; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + const int blks_per_grp_ho = (param.grp_ho + UnrollConfig::unroll_ho - 1) / + UnrollConfig::unroll_ho; + const int blks_per_grp_wo = (param.grp_wo + UnrollConfig::unroll_wo - 1) / + UnrollConfig::unroll_wo; + const int b_co = bidy / blks_per_grp_ho; + const int b_grp_ho = bidy - b_co * blks_per_grp_ho; + const int b_n = bidx / blks_per_grp_wo; + const int b_grp_wo = bidx - b_n * blks_per_grp_wo; + + const int b_sgh = bidz / param.sgw; + const int b_sgw = bidz - b_sgh * param.sgw; + + const int b_ho = b_sgh * param.grp_ho + b_grp_ho * UnrollConfig::unroll_ho; + const int b_wo = b_sgw * param.grp_wo + b_grp_wo * UnrollConfig::unroll_wo; + + const int b_hi = b_ho * LocalShareConfig::sh - param.ph; + const int b_wi = b_wo * LocalShareConfig::sw - param.pw; + + const int ho = param.sgh * param.grp_ho; + const int wo = param.sgw * param.grp_wo; + const int t_co = b_co * ThreadConfig::nr_thread_y + tidy; + + const float* __restrict__ g_ptr_src = + src + (b_hi * param.wi + b_wi) * param.n + + b_n * ThreadConfig::nr_thread_x + tidx; + const float* __restrict__ g_ptr_filter = + filter + + (b_sgh * param.sgw + b_sgw) * param.co * param.ci * + LocalShareConfig::fh * + LocalShareConfig::fw // spatial group + + t_co * param.ci * LocalShareConfig::fh * + LocalShareConfig::fw // output channel + + tidx; + float* __restrict__ g_ptr_dst = dst + t_co * ho * wo * param.n + + (b_ho * wo + b_wo) * param.n + + b_n * ThreadConfig::nr_thread_x + tidx; + + extern __shared__ float smem[]; + + float* sh_src = smem; + float* sh_filter = smem + DataTileCount::smem_tot; + + // TODO check register + DataGlobal2ShareMemVisitor + src_gl2sh_visitor{sh_src, + g_ptr_src, + param.hi * param.wi * param.n, + param.wi * param.n, + param.n, + -b_hi, + param.hi - b_hi, + -b_wi, + param.wi - b_wi}; + FilterGlobal2ShareMemVisitor + filter_gl2sh_visitor{sh_filter, g_ptr_filter}; + + float r_src[UnrollConfig::unroll_ho][DataTileCount::tile_wi]; + float r_filter[LocalShareConfig::fw]; + float r_acc[UnrollConfig::unroll_ho][UnrollConfig::unroll_wo]; + +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ho; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_wo; ++j) { + r_acc[i][j] = 0; + } + } + + src_gl2sh_visitor.first_copy(); + filter_gl2sh_visitor.first_copy(); + + __syncthreads(); + + int ci_blks = + (param.ci + UnrollConfig::unroll_ci - 1) / UnrollConfig::unroll_ci; + +#pragma unroll + for (int ci_outer = 0; ci_outer < ci_blks - 1; ci_outer++) { + src_gl2sh_visitor.move_forward(); + filter_gl2sh_visitor.move_forward(); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + + for (int ci_inner = 0; ci_inner < UnrollConfig::unroll_ci; ++ci_inner) { + int sh_flt_col_base = + ci_inner * LocalShareConfig::fh * LocalShareConfig::fw; + int sh_src_row_base = ci_inner * DataTileCount::tile_hw; +#pragma unroll + for (int kh = 0; kh < LocalShareConfig::fh; ++kh) { +#pragma unroll + for (int i = 0; i < LocalShareConfig::fw; ++i) { + r_filter[i] = *(filter_gl2sh_visitor.sh_ptr( + tidy, + sh_flt_col_base + kh * LocalShareConfig::fw + i)); + } +#pragma unroll + for (int i = 0; i < 
UnrollConfig::unroll_ho; ++i) { +#pragma unroll + for (int j = 0; j < DataTileCount::tile_wi; ++j) { + int sh_src_row = (i * LocalShareConfig::sh + kh) * + DataTileCount::tile_wi + + j; + r_src[i][j] = *(src_gl2sh_visitor.sh_ptr( + sh_src_row_base + sh_src_row, tidx)); + } + } +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ho; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_wo; ++j) { +#pragma unroll + for (int kw = 0; kw < LocalShareConfig::fw; ++kw) { + r_acc[i][j] += + r_src[i][j * LocalShareConfig::sw + kw] * + r_filter[kw]; + } + } + } + } + } + + __syncthreads(); + src_gl2sh_visitor.commit(); + filter_gl2sh_visitor.commit(); + __syncthreads(); + } + + for (int ci_inner = 0; ci_inner < UnrollConfig::unroll_ci; ++ci_inner) { + int sh_flt_col_base = + ci_inner * LocalShareConfig::fh * LocalShareConfig::fw; + int sh_src_row_base = ci_inner * DataTileCount::tile_hw; +#pragma unroll + for (int kh = 0; kh < LocalShareConfig::fh; ++kh) { +#pragma unroll + for (int i = 0; i < LocalShareConfig::fw; ++i) { + r_filter[i] = *(filter_gl2sh_visitor.sh_ptr( + tidy, + sh_flt_col_base + kh * LocalShareConfig::fw + i)); + } +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ho; ++i) { +#pragma unroll + for (int j = 0; j < DataTileCount::tile_wi; ++j) { + int sh_src_row = (i * LocalShareConfig::sh + kh) * + DataTileCount::tile_wi + + j; + r_src[i][j] = *(src_gl2sh_visitor.sh_ptr( + sh_src_row_base + sh_src_row, tidx)); + } + } +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ho; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_wo; ++j) { +#pragma unroll + for (int kw = 0; kw < LocalShareConfig::fw; ++kw) { + r_acc[i][j] += + r_src[i][j * LocalShareConfig::sw + kw] * + r_filter[kw]; + } + } + } + } + } + +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_ho; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_wo; ++j) { + int oh = b_ho + i; + int ow = b_wo + j; + if (t_co < param.co && oh < ho && ow < wo) { + g_ptr_dst[(i * wo + j) * param.n] = r_acc[i][j]; + } + } + } +} + +} // namespace batch_size_aware +#endif + +void megdnn::cuda::local_share::_do_local_share_convolution_large_batch_size( + const float* d_src, const float* d_filter, float* d_dst, + float* workspace, int fh, int fw, int sh, int sw, const Param& param, + cublasHandle_t cublas_handle, cudaStream_t stream, float* one, + float* zero) { + float* ws_src = workspace; + int nr_elem_total = param.n * param.ci * param.hi * param.wi; + float* ws_dst = workspace + nr_elem_total; + // tensor reformat from (n, c, h, w) -> (c, h, w, n) + { + int m = param.n, n = param.ci * param.hi * param.wi; + int lda, ldb; + lda = ldb = param.ci * param.hi * param.wi; + int ldc = param.n; + cublas_check(cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, + one, d_src, lda, zero, d_src, ldb, ws_src, + ldc)); + } + + { + void (*kern)(const float* __restrict__, const float* __restrict__, + float* __restrict__, Param); + LaunchConfig launch_config; + kern = get_kern(fh, fw, sh, sw, param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + _check_launch_config(launch_config); + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + kern<<>>( + ws_src, d_filter, ws_dst, 
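+                /* `param` (see the Param struct in helper.cuh) carries the tensor + * sizes, padding and spatial-group counts; ws_src and ws_dst hold the + * (c, h, w, n)-reformatted tensors produced by the cublasSgeam transposes + * in this function. */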
param); + after_kernel_launch(); + } + + // tensor reformat form (c, h, w, n) -> (n, c, h, w) + { + int ho = param.grp_ho * param.sgh, wo = param.grp_wo * param.sgw; + int m = param.co * ho * wo, n = param.n; + int lda, ldb; + lda = ldb = param.n; + int ldc = param.co * ho * wo; + cublas_check(cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, + one, ws_dst, lda, zero, ws_dst, ldb, d_dst, + ldc)); + } +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware_small_image.cu b/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware_small_image.cu new file mode 100644 index 00000000..7b05f9f5 --- /dev/null +++ b/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware_small_image.cu @@ -0,0 +1,599 @@ +/** + * \file dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware_small_image.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./local_share_forward.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace local_share; + +namespace { +template +struct UnrollConfig { + static int const unroll_ci = unroll_ci_; + static int const unroll_co = unroll_co_; + static int const unroll_n = unroll_n_; +}; + +template +struct ThreadConfig { + static int const nr_thread_x = thread_x; + static int const nr_thread_y = thread_y; + static int const nr_threads = nr_thread_x * nr_thread_y; +}; + +template +struct DataTileCount { + typedef UnrollConfig_ UnrollConfig; + typedef ThreadConfig_ ThreadConfig; + static int const tile_batch = + UnrollConfig::unroll_n * ThreadConfig::nr_thread_x; + + static int const load_x = tile_batch > 32 ? 32 : tile_batch; + static int const load_y = ThreadConfig::nr_threads / load_x; + + static int const smem_h = UnrollConfig::unroll_ci; + static int const smem_w = tile_batch; + static int const smem_stride = smem_w; + static int const smem_tot = smem_h * smem_stride; + + static int const reg_row = (smem_h + load_y - 1) / load_y; + static int const reg_col = (smem_w + load_x - 1) / load_x; + static bool const check_sh_bounds = smem_w % load_x != 0; +}; + +template +struct FilterTileCount { + typedef UnrollConfig_ UnrollConfig; + typedef ThreadConfig_ ThreadConfig; + static int const tile_co = + ThreadConfig::nr_thread_y * UnrollConfig::unroll_co; + static int const smem_h = UnrollConfig::unroll_ci; + static int const smem_w = tile_co; + static int const smem_stride = smem_w + 1; + static int const smem_tot = smem_h * smem_stride; + + static int const load_x = tile_co > 32 ? 
32 : tile_co; + static int const load_y = ThreadConfig::nr_threads / load_x; + + static int const reg_row = (smem_h + load_y - 1) / load_y; + static int const reg_col = (smem_w + load_x - 1) / load_x; + static bool const check_sh_bounds = smem_w % load_x != 0; +}; + +template +struct DataGlobal2ShareMemVisitor { + typedef DataTileCount TileCount; + typedef float copy_t; + float* smem; + const copy_t* g_ptr; + int stride; + int remain; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_row][TileCount::reg_col]; + + __device__ DataGlobal2ShareMemVisitor(copy_t* smem, int stride, int remain) + : smem{smem}, stride{stride}, remain{remain} {} + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_sh_bounds && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr(h_idx, w_idx)) = val; + } else { + *(sh_ptr(h_idx, w_idx)) = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_sh_bounds && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_sh_bounds && w_idx >= TileCount::smem_w) + continue; + *(sh_ptr(h_idx, w_idx)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ float* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += UnrollConfig::unroll_ci * stride; + } +}; + +template +struct FilterGlobal2ShareMemVisitor { + typedef float copy_t; + typedef FilterTileCount TileCount; + float* smem; + const copy_t* g_ptr; + int stride; + int remain; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int tid = tidy * ThreadConfig::nr_thread_x + tidx; + const int gl_load_y = tid / TileCount::load_x; + const int gl_load_x = tid - gl_load_y * TileCount::load_x; + + copy_t reg[TileCount::reg_row][TileCount::reg_col]; + + __device__ FilterGlobal2ShareMemVisitor(copy_t* smem, int stride, + int remain) + : smem{smem}, stride{stride}, remain{remain} {} + + __device__ __forceinline__ void first_copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < 
TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_sh_bounds && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + *(sh_ptr(h_idx, w_idx)) = val; + } else { + *(sh_ptr(h_idx, w_idx)) = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void copy() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_sh_bounds && w_idx >= TileCount::smem_w) + continue; + if (check_bounds) { + copy_t val = 0.f; + if (w_idx < remain) { + val = g_ptr[h_idx * stride + w_idx]; + } + reg[i][j] = val; + } else { + reg[i][j] = g_ptr[h_idx * stride + w_idx]; + } + } + } + } + + __device__ __forceinline__ void commit() { +#pragma unroll + for (int i = 0; i < TileCount::reg_row; ++i) { + int h_idx = gl_load_y + i * TileCount::load_y; +#pragma unrol + for (int j = 0; j < TileCount::reg_col; ++j) { + int w_idx = gl_load_x + j * TileCount::load_x; + if (h_idx >= TileCount::smem_h) + continue; + if (TileCount::check_sh_bounds && w_idx >= TileCount::smem_w) + continue; + *(sh_ptr(h_idx, w_idx)) = reg[i][j]; + } + } + } + + __device__ __forceinline__ float* sh_ptr(int y, int x) { + return &smem[y * TileCount::smem_stride + x]; + } + + __device__ __forceinline__ void move_forward() { + g_ptr += UnrollConfig::unroll_ci * stride; + } +}; + +template +__device__ __forceinline__ void consume_block( + DataGlobal2ShareMemVisitor& + data_gl2sh_visitor, + FilterGlobal2ShareMemVisitor& + filter_gl2sh_visitor, + float r_src[UnrollConfig::unroll_n], + float r_filter[UnrollConfig::unroll_co], + float r_acc[UnrollConfig::unroll_co][UnrollConfig::unroll_n]) { + typedef DataTileCount DataTileCount; + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + +#pragma unroll + for (int ci_inner = 0; ci_inner < UnrollConfig::unroll_ci; ++ci_inner) { +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_n; ++i) { + r_src[i] = *(data_gl2sh_visitor.sh_ptr( + ci_inner, tidx + i * ThreadConfig::nr_thread_x)); + } +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_co; ++j) { + r_filter[j] = *(filter_gl2sh_visitor.sh_ptr( + ci_inner, tidy + j * ThreadConfig::nr_thread_y)); + } + +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_co; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) { + r_acc[i][j] += r_src[j] * r_filter[i]; + } + } + } +} + +template +__global__ void local_share_device_template_f32( + const float* __restrict__ src, const float* __restrict__ filter, + float* __restrict__ dst, Param param, int fh, int fw, int sh, int sw) { + typedef DataTileCount DataTileCount; + typedef FilterTileCount FilterTileCount; + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + const int ho = param.sgh * param.grp_ho; + const int wo = param.sgw * param.grp_wo; + + const int b_ho = bidx / wo; + const int b_wo = bidx - wo * b_ho; + const int sgh_idx = b_ho / param.grp_ho; + const int sgw_idx = b_wo / param.grp_wo; + + const int b_batch = bidy * DataTileCount::tile_batch; + const int b_co = bidz * FilterTileCount::tile_co; + const int t_batch 
= tidx + b_batch; + const int t_co = tidy + b_co; + + extern __shared__ float smem[]; + + float* sh_src = smem; + float* sh_filter = smem + DataTileCount::smem_tot; + + const float* __restrict__ g_ptr_src = src + b_batch; + const float* __restrict__ g_ptr_filter = filter + b_co + // output channel + (sgh_idx * param.sgw + sgw_idx) * + param.co * param.ci * fh * + fw; // spatial group + + float* __restrict__ g_ptr_dst = + dst + t_co * ho * wo * param.n // output channel stride+ + + (b_ho * wo + b_wo) * param.n // spatial stride + + t_batch; + + // TODO check register + DataGlobal2ShareMemVisitor + src_gl2sh_visitor{sh_src, param.hi * param.wi * param.n, + param.n - b_batch}; + + FilterGlobal2ShareMemVisitor + filter_gl2sh_visitor{sh_filter, param.co * fh * fw, + param.co - b_co}; + + float r_src[UnrollConfig::unroll_n]; + float r_filter[UnrollConfig::unroll_co]; + float r_acc[UnrollConfig::unroll_co][UnrollConfig::unroll_n]; + +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_co; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) { + r_acc[i][j] = 0; + } + } + + int h_base = b_ho * sh - param.ph; + int w_base = b_wo * sw - param.pw; + int h_start = h_base >= 0 ? h_base : 0; + int w_start = w_base >= 0 ? w_base : 0; + int h_end = h_base + fh - 1; + int w_end = w_base + fw - 1; + h_end = h_end < param.hi ? h_end : param.hi - 1; + w_end = w_end < param.wi ? w_end : param.wi - 1; + const int ci_blks = + (param.ci + UnrollConfig::unroll_ci - 1) / UnrollConfig::unroll_ci; + + int kh = h_start - h_base; + int kw = w_start - w_base; + src_gl2sh_visitor.g_ptr = + g_ptr_src + (h_start * param.wi + w_start) * param.n; + filter_gl2sh_visitor.g_ptr = g_ptr_filter + (kh * fw + kw) * param.co; + src_gl2sh_visitor.first_copy(); + filter_gl2sh_visitor.first_copy(); + + __syncthreads(); + + for (int h = h_start; h <= h_end; ++h) { + for (int w = w_start; w <= w_end; ++w) { + for (int ci_outer = 0; ci_outer < ci_blks; ci_outer++) { + if (ci_outer == ci_blks - 1) { + if (!(h == h_end && w == w_end)) { + int w_next = w == w_end ? w_start : w + 1; + int h_next = w == w_end ? 
h + 1 : h; + int kh = h_next - h_base; + int kw = w_next - w_base; + src_gl2sh_visitor.g_ptr = + g_ptr_src + + (h_next * param.wi + w_next) * param.n; + filter_gl2sh_visitor.g_ptr = + g_ptr_filter + (kh * fw + kw) * param.co; + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + } else { + src_gl2sh_visitor.move_forward(); + filter_gl2sh_visitor.move_forward(); + src_gl2sh_visitor.copy(); + filter_gl2sh_visitor.copy(); + } + + consume_block( + src_gl2sh_visitor, filter_gl2sh_visitor, r_src, + r_filter, r_acc); + + if (!(ci_outer == ci_blks - 1 && h == h_end && w == w_end)) { + __syncthreads(); + src_gl2sh_visitor.commit(); + filter_gl2sh_visitor.commit(); + __syncthreads(); + } + } + } + } + + const int co_stride = ho * wo * param.n; +#pragma unroll + for (int i = 0; i < UnrollConfig::unroll_co; ++i) { +#pragma unroll + for (int j = 0; j < UnrollConfig::unroll_n; ++j) { + if (check_bounds && + (t_co + i * ThreadConfig::nr_thread_y >= param.co || + t_batch + j * ThreadConfig::nr_thread_x >= param.n)) { + } else { + g_ptr_dst[i * ThreadConfig::nr_thread_y * co_stride + + j * ThreadConfig::nr_thread_x] = r_acc[i][j]; + } + } + } +} + +void (*get_kern(const Param& param, LaunchConfig& launch_config))( + const float* __restrict__, const float* __restrict__, + float* __restrict__, Param, int, int, int, int) { + void (*kern)(const float* __restrict__, const float* __restrict__, + float* __restrict__, Param, int, int, int, int); + kern = nullptr; +#define CHK3(n_, co_, ci_, tx_, ty_) \ + if (param.n >= n_) { \ + if (param.co >= co_) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int unroll_ci = (ci_); \ + static constexpr int unroll_co = (co_ + ty_ - 1) / ty_; \ + static constexpr int unroll_n = (n_ + tx_ - 1) / tx_; \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef UnrollConfig \ + UnrollConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef DataTileCount \ + DataTileCount; \ + typedef FilterTileCount \ + FilterTileCount; \ + kern = local_share_device_template_f32; \ + launch_config.nr_threads_x = thread_x; \ + launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.grp_ho * param.grp_wo * param.sgh * param.sgw; \ + launch_config.nr_blocks_y = \ + DIVUP(param.n, DataTileCount::tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, FilterTileCount::tile_co); \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + (DataTileCount::smem_tot + FilterTileCount::smem_tot); \ + } \ + } \ + } +#define CHK2(n_, co_) \ + CHK3(n_, co_, 4, 8, 16) \ + CHK3(n_, co_, 8, 8, 16) +#define CHK2_(n_, co_) \ + CHK3(n_, co_, 4, 8, 8) \ + CHK3(n_, co_, 8, 8, 8) +#define CHK(n_) \ + CHK2_(n_, 1) \ + CHK2_(n_, 8) CHK2_(n_, 16) CHK2_(n_, 32) CHK2_(n_, 64) CHK2(n_, 128) + CHK(1) + CHK(8); + CHK(16); + CHK(32); + CHK(64); +#undef CHK +#undef CHK2 +#undef CHK2_ +#undef CHK3 +#define CHK3(n_, co_, ci_, tx_, ty_) \ + if (param.n % n_ == 0) { \ + if (param.co % co_ == 0) { \ + if (param.ci % ci_ == 0) { \ + static constexpr int unroll_ci = (ci_); \ + static constexpr int unroll_co = (co_) / (ty_); \ + static constexpr int unroll_n = (n_) / (tx_); \ + static constexpr int thread_x = tx_; \ + static constexpr int thread_y = ty_; \ + typedef UnrollConfig \ + UnrollConfig; \ + typedef ThreadConfig ThreadConfig; \ + typedef DataTileCount \ + DataTileCount; \ + typedef FilterTileCount \ + FilterTileCount; \ + kern = local_share_device_template_f32; \ + launch_config.nr_threads_x = thread_x; \ 
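+            /* grid extents and shared-memory size below are derived from param and the Data/FilterTileCount tiles chosen above */ \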
+ launch_config.nr_threads_y = thread_y; \ + launch_config.nr_threads_z = 1; \ + launch_config.nr_blocks_x = \ + param.grp_ho * param.grp_wo * param.sgh * param.sgw; \ + launch_config.nr_blocks_y = \ + DIVUP(param.n, DataTileCount::tile_batch); \ + launch_config.nr_blocks_z = \ + DIVUP(param.co, FilterTileCount::tile_co); \ + launch_config.smem_size_in_bytes = \ + sizeof(float) * \ + (DataTileCount::smem_tot + FilterTileCount::smem_tot); \ + } \ + } \ + } +#define CHK2(n_, co_) CHK3(n_, co_, 4, 8, 8) CHK3(n_, co_, 8, 8, 8) +#define CHK(n_) \ + CHK2(n_, 8) \ + CHK2(n_, 16) \ + CHK2(n_, 32) CHK2(n_, 64) CHK3(n_, 128, 4, 8, 16) CHK3(n_, 128, 8, 8, 16) + CHK(8); + CHK(16); + CHK(32); + CHK(64); +#undef CHK +#undef CHK2 +#undef CHK3 + megdnn_assert(kern != nullptr, + "no usable kernel implementation for local share " + "convolution (batch,co,ci)=(%d,%d,%d)", + param.n, param.co, param.ci); + return kern; +} + +} // namespace + +void megdnn::cuda::local_share:: + _do_local_share_convolution_large_batch_size_small_image( + const float* d_src, const float* d_filter, float* d_dst, + float* workspace, int fh, int fw, int sh, int sw, + const Param& param, cublasHandle_t cublas_handle, + cudaStream_t stream, float* one, float* zero) { + float* ws_src = workspace; + int nr_src_total = param.n * param.ci * param.hi * param.wi; + float* ws_dst = ws_src + nr_src_total; + // tensor reformat from (n, c, h, w) -> (c, h, w, n) + { + int m = param.n, n = param.ci * param.hi * param.wi; + int lda, ldb; + lda = ldb = param.ci * param.hi * param.wi; + int ldc = param.n; + cublas_check(cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, + one, d_src, lda, zero, d_src, ldb, ws_src, + ldc)); + } + + { + void (*kern)(const float* __restrict__, const float* __restrict__, + float* __restrict__, Param, int, int, int, int); + LaunchConfig launch_config; + kern = get_kern(param, launch_config); + + uint32_t nr_threads_x = launch_config.nr_threads_x, + nr_threads_y = launch_config.nr_threads_y, + nr_blocks_x = launch_config.nr_blocks_x, + nr_blocks_y = launch_config.nr_blocks_y, + nr_blocks_z = launch_config.nr_blocks_z, + smem_size_in_bytes = launch_config.smem_size_in_bytes; + _check_launch_config(launch_config); + + dim3 block_size{nr_threads_x, nr_threads_y, 1}; + dim3 grid_size{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + + kern<<>>( + ws_src, d_filter, ws_dst, param, fh, fw, sh, sw); + after_kernel_launch(); + } + + // tensor reformat form (c, h, w, n) -> (n, c, h, w) + { + int ho = param.grp_ho * param.sgh, wo = param.grp_wo * param.sgw; + int m = param.co * ho * wo, n = param.n; + int lda, ldb; + lda = ldb = param.n; + int ldc = param.co * ho * wo; + cublas_check(cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, + one, ws_dst, lda, zero, ws_dst, ldb, d_dst, + ldc)); + } +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/helper.cpp b/dnn/src/cuda/local_share/helper.cpp new file mode 100644 index 00000000..e41dbe09 --- /dev/null +++ b/dnn/src/cuda/local_share/helper.cpp @@ -0,0 +1,53 @@ +/** + * \file dnn/src/cuda/local_share/helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./helper.cuh" +#include "src/cuda/query_blocksize.cuh" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { +namespace local_share { + +void _check_launch_config(const local_share::LaunchConfig& launch_config) { + auto&& device_prop = current_device_prop(); + int x_thread_limit = device_prop.maxThreadsDim[0]; + int y_thread_limit = device_prop.maxThreadsDim[1]; + int z_thread_limit = device_prop.maxThreadsDim[2]; + int x_grid_limit = device_prop.maxGridSize[0]; + int y_grid_limit = device_prop.maxGridSize[1]; + int z_grid_limit = device_prop.maxGridSize[2]; + int sh_mem_size_limit = device_prop.sharedMemPerBlock; + MEGDNN_MARK_USED_VAR(x_thread_limit); + MEGDNN_MARK_USED_VAR(y_thread_limit); + MEGDNN_MARK_USED_VAR(z_thread_limit); + MEGDNN_MARK_USED_VAR(x_grid_limit); + MEGDNN_MARK_USED_VAR(y_grid_limit); + MEGDNN_MARK_USED_VAR(z_grid_limit); + MEGDNN_MARK_USED_VAR(sh_mem_size_limit); + megdnn_assert(launch_config.nr_threads_x <= x_thread_limit); + megdnn_assert(launch_config.nr_threads_y <= y_thread_limit); + megdnn_assert(launch_config.nr_threads_z <= z_thread_limit); + megdnn_assert(launch_config.nr_blocks_x <= x_grid_limit); + megdnn_assert(launch_config.nr_blocks_y <= y_grid_limit); + megdnn_assert(launch_config.nr_blocks_z <= z_grid_limit); + megdnn_assert(launch_config.smem_size_in_bytes <= sh_mem_size_limit); +} + +uint32_t _get_kern_block_size(const void* kern) { + uint32_t ret = query_blocksize_for_kernel(kern); + return ret; +} + +} // namespace local_share +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/helper.cuh b/dnn/src/cuda/local_share/helper.cuh new file mode 100644 index 00000000..27468eff --- /dev/null +++ b/dnn/src/cuda/local_share/helper.cuh @@ -0,0 +1,89 @@ +/** + * \file dnn/src/cuda/local_share/helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace local_share { + +struct Param { + int n, co, ci, hi, wi, ph, pw, grp_ho, grp_wo, sgh, sgw; +}; + +struct LaunchConfig { + int nr_threads_x; + int nr_threads_y; + int nr_threads_z; + int nr_blocks_x; + int nr_blocks_y; + int nr_blocks_z; + int smem_size_in_bytes; + LaunchConfig() + : nr_threads_x{1}, + nr_threads_y{1}, + nr_threads_z{1}, + nr_blocks_x{1}, + nr_blocks_y{1}, + nr_blocks_z{1}, + smem_size_in_bytes{1} {} +}; + +template +struct LocalShareConfig { + static int const fh = fh_; + static int const fw = fw_; + static int const sh = sh_; + static int const sw = sw_; +}; + +void _check_launch_config(const LaunchConfig& launch_config); + +uint32_t _get_kern_block_size(const void* kern); + +} // namespace local_share +} // namespace cuda +} // namespace megdnn + +#define unpack_local_share_params(_src, _filter, _dst, _param) \ + size_t n = _src[0], ci = _src[1], hi = _src[2], wi = _src[3]; \ + size_t weight_spatial_pos; \ + if (_param.sparse == LocalShare::Param::Sparse::DENSE) { \ + weight_spatial_pos = 3; \ + } else { \ + megdnn_assert(_param.sparse == LocalShare::Param::Sparse::GROUP); \ + weight_spatial_pos = 4; \ + } \ + size_t fh = _filter[weight_spatial_pos], \ + fw = _filter[weight_spatial_pos + 1]; \ + size_t co = _dst[1], ho = _dst[2], wo = _dst[3]; \ + size_t ph = _param.pad_h, pw = _param.pad_w; \ + size_t sh = _param.stride_h, sw = _param.stride_w; \ + size_t dh = _param.dilate_h, dw = _param.dilate_w; \ + size_t sgh = _param.spatial_groups_h, sgw = _param.spatial_groups_w; \ + MEGDNN_MARK_USED_VAR(n); \ + MEGDNN_MARK_USED_VAR(ci); \ + MEGDNN_MARK_USED_VAR(hi); \ + MEGDNN_MARK_USED_VAR(wi); \ + MEGDNN_MARK_USED_VAR(co); \ + MEGDNN_MARK_USED_VAR(fh); \ + MEGDNN_MARK_USED_VAR(fw); \ + MEGDNN_MARK_USED_VAR(ho); \ + MEGDNN_MARK_USED_VAR(wo); \ + MEGDNN_MARK_USED_VAR(ph); \ + MEGDNN_MARK_USED_VAR(pw); \ + MEGDNN_MARK_USED_VAR(sh); \ + MEGDNN_MARK_USED_VAR(sw); \ + MEGDNN_MARK_USED_VAR(dh); \ + MEGDNN_MARK_USED_VAR(dw); \ + MEGDNN_MARK_USED_VAR(sgh); \ + MEGDNN_MARK_USED_VAR(sgw); + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/im2col.cu b/dnn/src/cuda/local_share/im2col.cu new file mode 100644 index 00000000..0523571f --- /dev/null +++ b/dnn/src/cuda/local_share/im2col.cu @@ -0,0 +1,172 @@ +/** + * \file dnn/src/cuda/local_share/im2col.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./im2col.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace local_share; + +namespace { +template +__global__ void local_share_im2col(const T* __restrict__ img, + T* __restrict__ col, int fh, int fw, int sh, + int sw, int nr_groups, Param param) { + const int in_ch_idx = threadIdx.x + blockIdx.y * blockDim.x; + const int batch = threadIdx.y + blockIdx.z * blockDim.y; + if (in_ch_idx >= param.ci || batch >= param.n) + return; + const int hw = blockIdx.x; + const int wo = param.grp_wo * param.sgw; + const int oh_idx = hw / wo; + const int ow_idx = hw - oh_idx * wo; + const int sgh_idx = oh_idx / param.grp_ho; + const int sgw_idx = ow_idx / param.grp_wo; + const int grp_oh_idx = oh_idx - sgh_idx * param.grp_ho; + const int grp_ow_idx = ow_idx - sgw_idx * param.grp_wo; + const int grp_sizes = param.grp_ho * param.grp_wo; + const int icpg = param.ci / nr_groups; + const int ch_grp_idx = in_ch_idx / icpg; + const int grp_ch_idx = in_ch_idx - icpg * ch_grp_idx; + + const T* __restrict__ img_ptr = img + + batch * param.ci * param.hi * param.wi + + in_ch_idx * param.hi * param.wi; + const int ld = icpg * fh * fw; + T* __restrict__ col_ptr = + col + + ch_grp_idx * (param.sgh * param.sgw) * param.n * grp_sizes * + ld // channel group stride + + (sgh_idx * param.sgw + sgw_idx) * param.n * grp_sizes * + ld // batch stride + + grp_ch_idx * fh * fw // input channel stride + + (batch * grp_sizes + (grp_oh_idx * param.grp_wo + grp_ow_idx)) * + ld; // row stride + + for (int kh = 0; kh < fh; kh++) { + for (int kw = 0; kw < fw; kw++) { + int ih_idx = oh_idx * sh - param.ph + kh; + int iw_idx = ow_idx * sw - param.pw + kw; + float val = 0.f; + if (ih_idx < param.hi && ih_idx >= 0 && iw_idx < param.wi && + iw_idx >= 0) { + val = img_ptr[ih_idx * param.wi + iw_idx]; + } + *(col_ptr++) = val; + } + } +} + +template +__global__ void local_share_col2im(const T* __restrict__ col, + T* __restrict__ img, int fh, int fw, int sh, + int sw, int nr_groups, Param param) { + const int batch = threadIdx.x + blockIdx.y * blockDim.x; + const int in_ch_idx = threadIdx.y + blockIdx.z * blockDim.y; + if (in_ch_idx >= param.ci || batch >= param.n) + return; + const int hw = blockIdx.x; + const int ih_idx = hw / param.wi; + const int iw_idx = hw - ih_idx * param.wi; + const int ho = param.grp_ho * param.sgh; + const int wo = param.grp_wo * param.sgw; + const int icpg = param.ci / nr_groups; + const int grp_sizes = param.grp_ho * param.grp_wo; + const int filter_sizes = fh * fw; + const int ch_filter_sizes = icpg * filter_sizes; + const int nr_elems_per_grp = param.n * grp_sizes * ch_filter_sizes; + const int ch_grp_idx = in_ch_idx / icpg; + const int grp_ch_idx = in_ch_idx - icpg * ch_grp_idx; + const T* __restrict__ col_ptr = + col + + ch_grp_idx * param.sgh * param.sgw * ch_filter_sizes * grp_sizes * + param.n // channel group stride + + batch // batch stride + + + grp_ch_idx * filter_sizes * grp_sizes * param.n; // channel stride + + T res(0); + for (int kh = 0; kh < fh; ++kh) { + uint32_t anchorh = ih_idx + param.ph - kh; + if (anchorh < ho * sh && anchorh % sh == 0) { + int oh_idx = anchorh / sh; + int sgh_idx = oh_idx / param.grp_ho; + int grp_oh_idx = oh_idx - sgh_idx * param.grp_ho; + for (int kw = 0; kw < fw; ++kw) { + uint32_t anchorw = iw_idx + param.pw - kw; + if (anchorw < wo * sw && anchorw % sw == 0) { + int ow_idx = anchorw / sw; + int sgw_idx = ow_idx / param.grp_wo; + int grp_ow_idx = ow_idx - sgw_idx * param.grp_wo; + const T* __restrict__ sptr = + col_ptr + + (sgh_idx * 
param.sgw + sgw_idx) * + nr_elems_per_grp // spatial group stride + + (grp_oh_idx * param.grp_wo + grp_ow_idx) * + param.n // spatial stride + + (kh * fw + kw) * grp_sizes * param.n; + res += sptr[0]; + } + } + } + } + img[batch * param.ci * param.hi * param.wi + + in_ch_idx * param.hi * param.wi + ih_idx * param.wi + iw_idx] = res; +} + +} // namespace + +void megdnn::cuda::local_share::_do_local_share_im2col( + const float* d_im, float* d_col, int fh, int fw, int sh, int sw, + int nr_groups, const Param& param, cudaStream_t stream) { + void (*kern)(const float* __restrict__, float* __restrict__, int, int, int, + int, int, Param); + kern = local_share_im2col; + + constexpr int threads_x = 256; + uint32_t nr_threads = + _get_kern_block_size(reinterpret_cast(kern)); + uint32_t nr_threads_x = std::min(threads_x, param.ci); + uint32_t nr_threads_y = + std::min(static_cast(nr_threads / nr_threads_x), param.n); + uint32_t nr_blocks_x = param.sgw * param.sgh * param.grp_ho * param.grp_wo, + nr_blocks_y = DIVUP(param.ci, nr_threads_x), + nr_blocks_z = DIVUP(param.n, nr_threads_y); + dim3 threads{nr_threads_x, nr_threads_y, 1}; + dim3 blocks{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + kern<<>>(d_im, d_col, fh, fw, sh, sw, nr_groups, + param); + after_kernel_launch(); +} + +void megdnn::cuda::local_share::_do_local_share_col2im( + const float* d_col, float* d_im, int fh, int fw, int sh, int sw, + int nr_groups, const Param& param, cudaStream_t stream) { + void (*kern)(const float* __restrict__, float* __restrict__, int, int, int, + int, int, Param); + kern = local_share_col2im; + + constexpr int threads_x = 256; + uint32_t nr_threads = + _get_kern_block_size(reinterpret_cast(kern)); + uint32_t nr_threads_x = std::min(threads_x, param.n); + uint32_t nr_threads_y = + std::min(static_cast(nr_threads / nr_threads_x), param.ci); + uint32_t nr_blocks_x = param.hi * param.wi, + nr_blocks_y = DIVUP(param.n, nr_threads_x), + nr_blocks_z = DIVUP(param.ci, nr_threads_y); + dim3 threads{nr_threads_x, nr_threads_y, 1}; + dim3 blocks{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + kern<<>>(d_col, d_im, fh, fw, sh, sw, nr_groups, + param); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/im2col.cuh b/dnn/src/cuda/local_share/im2col.cuh new file mode 100644 index 00000000..702a5a4f --- /dev/null +++ b/dnn/src/cuda/local_share/im2col.cuh @@ -0,0 +1,29 @@ +/** + * \file dnn/src/cuda/local_share/im2col.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "src/cuda/utils.cuh" +#include "./helper.cuh" + +namespace megdnn { +namespace cuda { +namespace local_share { + +void _do_local_share_im2col(const float* d_im, float* d_col, int fh, int fw, + int sh, int sw, int nr_groups, const Param& param, + cudaStream_t stream); + +void _do_local_share_col2im(const float* d_col, float* d_im, int fh, int fw, + int sh, int sw, int nr_groups, const Param& param, + cudaStream_t stream); +} // namespace local_share +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/opr_impl.cpp b/dnn/src/cuda/local_share/opr_impl.cpp new file mode 100644 index 00000000..054ffa48 --- /dev/null +++ b/dnn/src/cuda/local_share/opr_impl.cpp @@ -0,0 +1,177 @@ +/** + * \file dnn/src/cuda/local_share/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/local_share/opr_impl.h" +#include "./forward/algo.h" +#include "./backward_data/algo.h" +#include "./backward_filter/algo.h" +#include "src/common/algo_chooser.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +/* ============== LocalShareForwardImpl ============== */ +LocalShareForwardImpl::Algorithm* +LocalShareForwardImpl::get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, src, filter, dst); + if (sm_algo_pack.batch_size_aware_chwn_small_image + .is_available_reproducible(args, reproducible, + workspace_limit_in_bytes)) { + return &sm_algo_pack.batch_size_aware_chwn_small_image; + } + if (sm_algo_pack.batch_size_aware_chwn.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.batch_size_aware_chwn; + } + if (sm_algo_pack.batched_matmul.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.batched_matmul; + } + megdnn_throw(megdnn_mangle( + ssprintf("no %s local share conv algorithm with args(%s) and " + "workspace limit (%zu bytes)", + reproducible ? 
"reproducible" : "usable", + args.to_string().c_str(), workspace_limit_in_bytes))); +} + +std::vector +LocalShareForwardImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) { + AlgoBase::SizeArgs args{this, src, filter, dst}; + return megdnn::get_all_algorithms(args); +} + +size_t LocalShareForwardImpl::get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) { + AlgoBase::SizeArgs args(this, src, filter, dst); + return get_algorithm(this, src, filter, dst)->get_workspace_in_bytes(args); +} + +void LocalShareForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, src, filter, dst, workspace); + auto algo = get_algorithm(this, src.layout, filter.layout, dst.layout); + algo->check_workspace(args, workspace).exec(args); +} + +const char* LocalShareForwardImpl::get_algorithm_set_name() const { + return "CUDA_LOCAL_SHARE_CONV"; +} + +/* ============== LocalShareBackwardDataImpl ============== */ +LocalShareBackwardDataImpl::Algorithm* +LocalShareBackwardDataImpl::get_algorithm_heuristic( + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, filter, diff, grad); + if (sm_algo_pack.implicit_gemm.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.implicit_gemm; + } + if (sm_algo_pack.batched_matmul.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.batched_matmul; + } + megdnn_throw(megdnn_mangle( + ssprintf("no %s local share bwd data algorithm with args(%s) and " + "workspace limit (%zu bytes)", + reproducible ? 
"reproducible" : "usable", + args.to_string().c_str(), workspace_limit_in_bytes))); +} + +std::vector +LocalShareBackwardDataImpl::get_all_algorithms(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) { + AlgoBase::SizeArgs args{this, filter, diff, grad}; + return megdnn::get_all_algorithms(args); +} + +size_t LocalShareBackwardDataImpl::get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) { + AlgoBase::SizeArgs args(this, filter, diff, grad); + return get_algorithm(this, filter, diff, grad)->get_workspace_in_bytes(args); +} + +void LocalShareBackwardDataImpl::exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); + auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); + algo->check_workspace(args, workspace).exec(args); +} + +const char* LocalShareBackwardDataImpl::get_algorithm_set_name() const { + return "CUDA_LOCAL_SHARE_CONV_BWD_DATA"; +} + +/* ============== LocalShareBackwardFilterImpl ============== */ +LocalShareBackwardFilterImpl::Algorithm* +LocalShareBackwardFilterImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, src, diff, grad); + if (sm_algo_pack.implicit_gemm.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.implicit_gemm; + } + if (sm_algo_pack.batched_matmul.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.batched_matmul; + } + megdnn_throw(megdnn_mangle( + ssprintf("no %s local share bwd filter algorithm with args(%s) and " + "workspace limit (%zu bytes)", + reproducible ? "reproducible" : "usable", + args.to_string().c_str(), workspace_limit_in_bytes))); +} + +std::vector +LocalShareBackwardFilterImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) { + AlgoBase::SizeArgs args{this, src, diff, grad}; + return megdnn::get_all_algorithms(args); +} + +size_t LocalShareBackwardFilterImpl::get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) { + AlgoBase::SizeArgs args(this, src, diff, grad); + return get_algorithm(this, src, diff, grad)->get_workspace_in_bytes(args); +} + +void LocalShareBackwardFilterImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, src, diff, grad, workspace); + auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout); + algo->check_workspace(args, workspace).exec(args); +} + +const char* LocalShareBackwardFilterImpl::get_algorithm_set_name() const { + return "CUDA_LOCAL_SHARE_CONV_BWD_FILTER"; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/opr_impl.h b/dnn/src/cuda/local_share/opr_impl.h new file mode 100644 index 00000000..76aba387 --- /dev/null +++ b/dnn/src/cuda/local_share/opr_impl.h @@ -0,0 +1,112 @@ +/** + * \file dnn/src/cuda/local_share/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +class LocalShareForwardImpl : public LocalShareForward { +public: + using LocalShareForward::LocalShareForward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_out dst, _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) override; + std::vector get_all_algorithms( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoCHWNBatchSizeAware; + class AlgoCHWNBatchSizeAwareSmallImage; + class AlgoBatchedMatMul; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +class LocalShareBackwardDataImpl : public LocalShareBackwardData { +public: + using LocalShareBackwardData::LocalShareBackwardData; + void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) override; + std::vector get_all_algorithms( + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoImplicitGemm; + class AlgoBatchedMatMul; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +class LocalShareBackwardFilterImpl : public LocalShareBackwardFilter { +public: + using LocalShareBackwardFilter::LocalShareBackwardFilter; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) override; + std::vector get_all_algorithms( + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoImplicitGemm; + class AlgoBatchedMatMul; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/lrn/opr_impl.cpp b/dnn/src/cuda/lrn/opr_impl.cpp new file mode 100644 index 00000000..9310d7ff --- /dev/null +++ b/dnn/src/cuda/lrn/opr_impl.cpp @@ -0,0 +1,79 @@ +/** + * \file dnn/src/cuda/lrn/opr_impl.cpp + 
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/lrn/opr_impl.h" + +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +void LRNForwardImpl::setup_descs(const TensorLayout &src, + const TensorLayout &dst) +{ + src_desc.set(src); + dst_desc.set(dst); + lrn_desc.set(this->param()); +} + +void LRNForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) +{ + check_exec(src.layout, dst.layout, workspace.size); + auto handle = cudnn_handle(this->handle()); + setup_descs(src.layout, dst.layout); + float alpha = 1.0f, beta = 0.0f; + cudnn_check(cudnnLRNCrossChannelForward(handle, + lrn_desc.desc, + CUDNN_LRN_CROSS_CHANNEL_DIM1, + &alpha, src_desc.desc, src.raw_ptr, + &beta, dst_desc.desc, dst.raw_ptr)); +} + +void LRNBackwardImpl::setup_descs(const TensorLayout &src, + const TensorLayout &dst, + const TensorLayout &diff, + const TensorLayout &grad) +{ + src_desc.set(src); + dst_desc.set(dst); + diff_desc.set(diff); + grad_desc.set(grad); + lrn_desc.set(this->param()); +} + +void LRNBackwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in dst, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) +{ + check_exec(src.layout, dst.layout, diff.layout, grad.layout, + workspace.size); + auto handle = cudnn_handle(this->handle()); + setup_descs(src.layout, dst.layout, diff.layout, grad.layout); + float alpha = 1.0f, beta = 0.0f; + cudnn_check(cudnnLRNCrossChannelBackward(handle, + lrn_desc.desc, + CUDNN_LRN_CROSS_CHANNEL_DIM1, + &alpha, + dst_desc.desc, dst.raw_ptr, + diff_desc.desc, diff.raw_ptr, + src_desc.desc, src.raw_ptr, + &beta, + grad_desc.desc, grad.raw_ptr)); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/lrn/opr_impl.h b/dnn/src/cuda/lrn/opr_impl.h new file mode 100644 index 00000000..251b6535 --- /dev/null +++ b/dnn/src/cuda/lrn/opr_impl.h @@ -0,0 +1,61 @@ +/** + * \file dnn/src/cuda/lrn/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include "megdnn/oprs.h" + +#include "src/cuda/cudnn_wrapper.h" + +namespace megdnn { +namespace cuda { + +class LRNForwardImpl final: public LRNForward { + public: + using LRNForward::LRNForward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &) override { + return 0; + } + private: + TensorDesc src_desc, dst_desc; + LRNDesc lrn_desc; + void setup_descs(const TensorLayout &src, const TensorLayout &dst); +}; + +class LRNBackwardImpl final: public LRNBackward { + public: + using LRNBackward::LRNBackward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in dst, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &, + const TensorLayout &, + const TensorLayout &) override { + return 0; + } + private: + TensorDesc src_desc, dst_desc, diff_desc, grad_desc; + LRNDesc lrn_desc; + void setup_descs(const TensorLayout &src, + const TensorLayout &dst, + const TensorLayout &diff, + const TensorLayout &grad); +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/mask_conv/mask_conv.cu b/dnn/src/cuda/mask_conv/mask_conv.cu new file mode 100644 index 00000000..8514cf8e --- /dev/null +++ b/dnn/src/cuda/mask_conv/mask_conv.cu @@ -0,0 +1,102 @@ +/** + * \file dnn/src/cuda/mask_conv/mask_conv.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include +#include "./mask_conv.cuh" +#include "megdnn/dtype.h" +#include "src/cuda/utils.cuh" + +namespace { +template +__global__ void set_zero_by_mask_kernel(float* dst, const ctype* mask, size_t N, + size_t mask_size) { + int dst_offset = blockIdx.x * blockDim.x + threadIdx.x; + int mask_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (dst_offset >= N || mask_idx >= mask_size) { + return; + } + if (mask[mask_idx] == 0) { + dst[dst_offset * mask_size + mask_idx] = 0; + } +} + +template +__global__ void mask_propagate_kernel(const ctype* src, ctype* dst, size_t IH, + size_t IW, size_t OH, size_t OW, + size_t FH, size_t FW, size_t SH, + size_t SW, size_t PH, size_t PW, + size_t DH, size_t DW) { + int dst_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (dst_idx >= OH * OW) { + return; + } + int oh = dst_idx / OW; + int ow = dst_idx - (OW * oh); + dst[dst_idx] = 0; + for (int fh = 0; fh < FH; ++fh) { + for (int fw = 0; fw < FW; ++fw) { + int ih = oh * SH + fh * DH - PH; + int iw = ow * SW + fw * DW - PW; + if (ih < 0 || ih >= IH || iw < 0 || iw >= IW || + src[ih * IW + iw] == 0) { + continue; + } + dst[dst_idx] = 1; + return; + } + } +} + +} // namespace + +namespace megdnn { +namespace cuda { +namespace mask_conv { + +template +void set_zero_by_mask_proxy(float* dst, const ctype* mask, size_t N, size_t OC, + size_t OH, size_t OW, cudaStream_t stream) { + dim3 threads(NR_THREADS_X, NR_THREADS_Y); + dim3 blocks(DIVUP(N * OC, threads.x), DIVUP(OH * OW, threads.y)); + set_zero_by_mask_kernel + <<>>(dst, mask, N * OC, OH * OW); +} + +template +void mask_propagate_exec_proxy(const ctype* src, ctype* dst, size_t IH, + size_t IW, size_t OH, size_t OW, size_t FH, + size_t FW, size_t SH, size_t SW, size_t PH, + size_t PW, size_t DH, size_t DW, + cudaStream_t stream) { + mask_propagate_kernel + <<>>( + src, dst, IH, IW, OH, OW, FH, FW, SH, SW, PH, PW, DH, DW); +} + +#define INST(ctype) \ + template void mask_propagate_exec_proxy( \ + const ctype* src, ctype* dst, size_t IH, size_t IW, size_t OH, \ + size_t OW, size_t FH, size_t FW, size_t SH, size_t SW, size_t PH, \ + size_t PW, size_t DH, size_t DW, cudaStream_t stream); \ + \ + template void set_zero_by_mask_proxy( \ + float* dst, const ctype* mask, size_t N, size_t OC, size_t OH, \ + size_t OW, cudaStream_t stream); + +#define cb(DType) INST(DTypeTrait::ctype) +MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) +#undef cb + +#undef INST + +} // namespace mask_conv +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/mask_conv/mask_conv.cuh b/dnn/src/cuda/mask_conv/mask_conv.cuh new file mode 100644 index 00000000..2113d242 --- /dev/null +++ b/dnn/src/cuda/mask_conv/mask_conv.cuh @@ -0,0 +1,30 @@ +/** + * \file dnn/src/cuda/mask_conv/mask_conv.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +namespace megdnn { +namespace cuda { +namespace mask_conv { + +template +void set_zero_by_mask_proxy(float* dst, const ctype* mask, size_t N, size_t OC, + size_t OH, size_t OW, cudaStream_t stream); + +template +void mask_propagate_exec_proxy(const ctype* src, ctype* dst, size_t IH, + size_t IW, size_t OH, size_t OW, size_t FH, + size_t FW, size_t SH, size_t SW, size_t PH, + size_t PW, size_t DH, size_t DW, + cudaStream_t stream); + +} // namespace mask_conv + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/mask_conv/opr_impl.cpp b/dnn/src/cuda/mask_conv/opr_impl.cpp new file mode 100644 index 00000000..6c34f0d0 --- /dev/null +++ b/dnn/src/cuda/mask_conv/opr_impl.cpp @@ -0,0 +1,68 @@ +/** + * \file dnn/src/cuda/mask_conv/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/mask_conv/opr_impl.h" +#include "./mask_conv.cuh" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +MaskConvForwardImpl::MaskConvForwardImpl(Handle* handle) + : MaskConvForward(handle) { + m_conv_opr = static_cast(handle) + ->create_operator(); +} + +void MaskConvForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_in mask, _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + megdnn_assert(dst.layout.dtype.enumv() == DTypeTrait::enumv, + "Mask conv only support Float32 dtype."); + m_conv_opr->exec(src, filter, dst, workspace); + auto stream = cuda_stream(handle()); +#define cb(DType) \ + if (mask.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + mask_conv::set_zero_by_mask_proxy( \ + dst.ptr(), mask.ptr(), dst.layout[0], \ + dst.layout[1], dst.layout[2], dst.layout[3], stream); \ + return; \ + } + + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) +#undef cb + megdnn_assert_internal(0); +} + +void MaskPropagateImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace) { + auto stream = cuda_stream(handle()); + +#define cb(DType) \ + if (src.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + mask_conv::mask_propagate_exec_proxy( \ + src.ptr(), dst.ptr(), src.layout[0], \ + src.layout[1], dst.layout[0], dst.layout[1], param().kernel_h, \ + param().kernel_w, param().stride_h, param().stride_w, \ + param().pad_h, param().pad_w, param().dilate_h, \ + param().dilate_w, stream); \ + return; \ + } + + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb); +#undef cb + megdnn_assert_internal(0); +} + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/mask_conv/opr_impl.h b/dnn/src/cuda/mask_conv/opr_impl.h new file mode 100644 index 00000000..a9b5e53b --- /dev/null +++ b/dnn/src/cuda/mask_conv/opr_impl.h @@ -0,0 +1,53 @@ +/** + * \file dnn/src/cuda/mask_conv/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "megdnn/oprs.h" +#include "src/cuda/handle.h" + +namespace megdnn { +namespace cuda { + +class MaskConvForwardImpl : public MaskConvForward { +public: + MaskConvForwardImpl(Handle* handle); + + void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_in mask, _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& mask, + const TensorLayout& dst) override { + MEGDNN_MARK_USED_VAR(mask); + m_conv_opr->param() = param(); + return m_conv_opr->get_workspace_in_bytes(src, filter, dst); + } + +private: + std::unique_ptr m_conv_opr; +}; + +class MaskPropagateImpl : public MaskPropagate { +public: + MaskPropagateImpl(Handle* handle) : MaskPropagate(handle) {} + + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace worksapce) override final; + size_t get_workspace_in_bytes(const TensorLayout&, + const TensorLayout&) override final { + return 0; + } +}; + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/matrix_inverse/helper.cu b/dnn/src/cuda/matrix_inverse/helper.cu new file mode 100644 index 00000000..9fff1b6c --- /dev/null +++ b/dnn/src/cuda/matrix_inverse/helper.cu @@ -0,0 +1,48 @@ +/** + * \file dnn/src/cuda/matrix_inverse/helper.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./helper.cuh" +#include "src/cuda/error_info.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace matrix_inverse; + +namespace { + +__global__ void kern_check_error(const int* src_info, uint32_t n, + megcore::AsyncErrorInfo* dst_info, + void* tracker) { + uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n && src_info[i]) { + set_async_error_info(dst_info, tracker, + "The U is exactly singular and the inversion " + "failed on %d-th input matrix (U(%d, %d) = 0)", i, + src_info[i], src_info[i]); + } +} + +} // anonymous namespace + +void matrix_inverse::check_error(const int* src_info, uint32_t n, + megcore::AsyncErrorInfo* dst_info, + void* tracker, cudaStream_t stream) { + if (!dst_info) { + return; + } + uint32_t threads = NR_THREADS; + uint32_t blocks = DIVUP(n, threads); + kern_check_error<<>>(src_info, n, dst_info, + tracker); + after_kernel_launch(); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_inverse/helper.cuh b/dnn/src/cuda/matrix_inverse/helper.cuh new file mode 100644 index 00000000..8027da99 --- /dev/null +++ b/dnn/src/cuda/matrix_inverse/helper.cuh @@ -0,0 +1,28 @@ +/** + * \file dnn/src/cuda/matrix_inverse/helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "megcore_cdefs.h" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace matrix_inverse { + +void check_error(const int* src_info, uint32_t n, + megcore::AsyncErrorInfo* dst_info, void* tracker, + cudaStream_t stream); + +} // namespace matrix_inverse +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_inverse/opr_impl.cpp b/dnn/src/cuda/matrix_inverse/opr_impl.cpp new file mode 100644 index 00000000..93ceb55d --- /dev/null +++ b/dnn/src/cuda/matrix_inverse/opr_impl.cpp @@ -0,0 +1,52 @@ +/** + * \file dnn/src/cuda/matrix_inverse/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./helper.cuh" +#include "./opr_impl.h" +#include "src/cuda/batched_matrix_mul/helper.cuh" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +size_t MatrixInverseImpl::get_workspace_in_bytes(size_t batch, size_t, size_t) { + return batch * (sizeof(int) + sizeof(void*) + sizeof(void*)); +} + +void MatrixInverseImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + megdnn_assert(src.layout.dtype == dtype::Float32(), + "Matrix Inverse only support Float32 dtype, got: %s", + src.layout.dtype.name()); + size_t batch, n; + check_exec(src.layout, dst.layout, workspace, &batch, &n); + auto handle = concrete_handle(this->handle()); + megdnn_assert(n < 32, "currently only n < 32 supported on cuda"); + const float** psrc_batch = workspace.ptr(); + float** pdst_batch = const_cast(psrc_batch + batch); + int* info = reinterpret_cast(pdst_batch + batch); + auto stream = handle->stream(); + batched_matrix_mul::arange( + reinterpret_cast(psrc_batch), + reinterpret_cast(src.raw_ptr), n * n * sizeof(float), + batch, stream); + batched_matrix_mul::arange( + reinterpret_cast(pdst_batch), + reinterpret_cast(dst.raw_ptr), n * n * sizeof(float), + batch, stream); + cublas_check(cublasSmatinvBatched(handle->cublas_handle(), n, psrc_batch, n, + pdst_batch, n, info, batch)); + matrix_inverse::check_error(info, batch, + handle->megcore_context().error_info, + m_error_tracker, stream); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_inverse/opr_impl.h b/dnn/src/cuda/matrix_inverse/opr_impl.h new file mode 100644 index 00000000..5c355a82 --- /dev/null +++ b/dnn/src/cuda/matrix_inverse/opr_impl.h @@ -0,0 +1,36 @@ +/** + * \file dnn/src/cuda/matrix_inverse/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
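+ *
+ * Workspace layout used by MatrixInverseImpl::exec in opr_impl.cpp above:
+ * three consecutive per-batch arrays, [const float* src_ptrs][float* dst_ptrs]
+ * [int info], which matches get_workspace_in_bytes() returning
+ * batch * (sizeof(int) + 2 * sizeof(void*)). batched_matrix_mul::arange is
+ * assumed to fill a pointer array on the device as base + i * stride; a
+ * host-side equivalent, for illustration only:
+ *
+ * \code
+ * #include <cstddef>
+ * #include <cstdint>
+ * void fill_batch_pointers(uintptr_t* out, uintptr_t base,
+ *                          size_t stride_in_bytes, size_t batch) {
+ *     for (size_t i = 0; i < batch; ++i)
+ *         out[i] = base + i * stride_in_bytes;  // i-th matrix starts here
+ * }
+ * \endcode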
+ */ +#pragma once +#include "megdnn/oprs/linalg.h" + +namespace megdnn { +namespace cuda { + +class MatrixInverseImpl : public MatrixInverse { +public: + using MatrixInverse::MatrixInverse; + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } + +protected: + void* m_error_tracker = nullptr; + size_t get_workspace_in_bytes(size_t batch, size_t n, + size_t dtype_size) override; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/algos.cpp b/dnn/src/cuda/matrix_mul/algos.cpp new file mode 100644 index 00000000..9ef84c06 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/algos.cpp @@ -0,0 +1,66 @@ +/** + * \file dnn/src/cuda/matrix_mul/algos.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algos.h" +#include "src/cuda/utils.h" + +#include +#if CUDA_VERSION >= 10010 +#include +#endif + +using namespace megdnn; +using namespace cuda; + +MatrixMulForwardImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&cublas); +#if CUDA_VERSION >= 10000 + all_algos.push_back(&wmma_uint4x4x32); +#endif +#if CUDA_VERSION >= 10010 + all_algos.push_back(&cublas_lt); +#endif + all_algos.push_back(&naive); +} + +MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; + +MatrixMulForwardImpl::AlgoBase::SizeArgs::SizeArgs(MatrixMulForwardImpl* o, + const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C) + : opr{o}, layout_a{A}, layout_b{B}, layout_c{C} {} + +MatrixMulForwardImpl::AlgoBase::ExecArgs::ExecArgs(MatrixMulForwardImpl* opr, + _megdnn_tensor_in A, + _megdnn_tensor_in B, + _megdnn_tensor_out C, + _megdnn_workspace workspace) + : SizeArgs(opr, A.layout, B.layout, C.layout), + tensor_a{A}, + tensor_b{B}, + tensor_c{C}, + workspace{workspace} {} + +std::string MatrixMulForwardImpl::AlgoBase::SizeArgs::to_string() const { + auto&& param = opr->param(); + size_t m = layout_a.shape[0], n = layout_b.shape[1], + k = layout_a.shape[param.transposeA ? 0 : 1]; + MEGDNN_MARK_USED_VAR(m); + MEGDNN_MARK_USED_VAR(n); + MEGDNN_MARK_USED_VAR(k); + return megdnn_mangle(ssprintf( + "A={%zux%zu},B={%zux%zu},C={%zux%zu},Transpose A=%d,Transpose " + "B=%d,ldA=%zu,ldB=%zu,ldC=%zu", + m, k, k, n, m, n, param.transposeA, param.transposeB, + layout_a.stride[0], layout_b.stride[0], layout_c.stride[0])); +} +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/algos.h b/dnn/src/cuda/matrix_mul/algos.h new file mode 100644 index 00000000..8ea190e9 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/algos.h @@ -0,0 +1,164 @@ +/** + * \file dnn/src/cuda/matrix_mul/algos.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
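+ *
+ * AlgoPack::all_algos (cuBLAS first, then the WMMA and cuBLASLt algos when the
+ * CUDA version allows, and the naive kernel last) is the candidate list used
+ * by the selection helpers. A minimal sketch of "first algorithm usable within
+ * the workspace limit", mirroring is_available_wk() declared below; the actual
+ * heuristic lives in opr_impl.cpp:
+ *
+ * \code
+ * inline MatrixMulForwardImpl::AlgoBase* pick_first_usable(
+ *         const std::vector<MatrixMulForwardImpl::AlgoBase*>& algos,
+ *         const MatrixMulForwardImpl::AlgoBase::SizeArgs& args,
+ *         size_t workspace_limit) {
+ *     for (auto algo : algos)
+ *         if (algo->is_available(args) &&
+ *             algo->get_workspace_in_bytes(args) <= workspace_limit)
+ *             return algo;
+ *     return nullptr;  // no usable algorithm under this limit
+ * }
+ * \endcode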
+ */ + +#pragma once +#include "megdnn/oprs.h" +#include "src/common/utils.h" +#include "src/cuda/matrix_mul/opr_impl.h" + +#include +#if CUDA_VERSION >= 10010 +#include +#endif + +namespace megdnn { +namespace cuda { + +/*! + * \brief base class for matrix mul algos + * + */ +class MatrixMulForwardImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + MatrixMulForwardImpl* opr; + TensorLayout layout_a, layout_b, layout_c; + + std::string to_string() const; + SizeArgs(MatrixMulForwardImpl* opr, const TensorLayout& A, const TensorLayout& B, + const TensorLayout& C); + + bool can_be_treated_as_int8x8x32() const { + return layout_a.dtype.enumv() == layout_b.dtype.enumv() && + (layout_a.dtype.enumv() == DTypeEnum::Int8 || + layout_a.dtype.enumv() == DTypeEnum::QuantizedS8) && + (layout_c.dtype.enumv() == DTypeEnum::Int32 || + layout_c.dtype.enumv() == DTypeEnum::QuantizedS32) && + opr->param().format == param::MatrixMul::Format::DEFAULT; + } + }; + struct ExecArgs : public SizeArgs { + TensorND tensor_a, tensor_b, tensor_c; + Workspace workspace; + + ExecArgs(MatrixMulForwardImpl* opr, _megdnn_tensor_in A, + _megdnn_tensor_in B, _megdnn_tensor_out C, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert( + req <= workspace.size, + "matrix mul fwd algo %s: required workspace %zu bytes, got %zu", + name(), req, workspace.size); + return *this; + } + + +}; + +class MatrixMulForwardImpl::AlgoCuBlas final : public AlgoBase { +public: + AlgoCuBlas() = default; + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override { + return 0_z; + } + const char* name() const override { + return "CUBLAS"; + } + void exec(const ExecArgs& args) const override; + bool is_reproducible() const override { + return true; + } +}; + +#if CUDA_VERSION >= 10000 +class MatrixMulForwardImpl::AlgoUInt4x4x32WMMA final : public AlgoBase { +public: + AlgoUInt4x4x32WMMA() = default; + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + const char* name() const override { + return "UINT4x4x32_WMMA"; + } + void exec(const ExecArgs& args) const override; + bool is_reproducible() const override { + return true; + } +}; +#endif +#if CUDA_VERSION >= 10010 +class MatrixMulForwardImpl::AlgoCuBlasLt final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + const char* name() const override { + return "CUBLAS_LT"; + } + void exec(const ExecArgs& args) const override; + bool is_reproducible() const override { + return true; + } +}; +#endif + +class MatrixMulForwardImpl::AlgoNaive final : public AlgoBase { +public: + AlgoNaive() = default; + bool is_available(const SizeArgs& args) const 
override; + size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override { + return 0_z; + } + const char* name() const override { return "NAIVE"; } + void exec(const ExecArgs& args) const override; + bool is_reproducible() const override { return true; } +}; + +class MatrixMulForwardImpl::AlgoPack { + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + AlgoCuBlas cublas; + AlgoNaive naive; +#if CUDA_VERSION >= 10000 + AlgoUInt4x4x32WMMA wmma_uint4x4x32; +#endif +#if CUDA_VERSION >= 10010 + AlgoCuBlasLt cublas_lt; +#endif + + std::vector all_algos; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/cublas.cpp b/dnn/src/cuda/matrix_mul/cublas.cpp new file mode 100644 index 00000000..17a9cb65 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/cublas.cpp @@ -0,0 +1,144 @@ +/** + * \file dnn/src/cuda/matrix_mul/cublas.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algos.h" + +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +#include + +using namespace megdnn; +using namespace cuda; + +#if CUDA_VERSION >= 8000 +#define SE_CUDA_DATA_HALF CUDA_R_16F +#else +#define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF +#endif + +bool MatrixMulForwardImpl::AlgoCuBlas::is_available( + const SizeArgs& args) const { + if (args.opr->param().format != param::MatrixMul::Format::DEFAULT) + return false; + if (args.layout_a.dtype == dtype::Float32() || + args.layout_a.dtype == dtype::Float16()) { + return true; + } else if (args.layout_a.dtype.enumv() == DTypeEnum::Int8 || + args.layout_a.dtype.enumv() == DTypeEnum::QuantizedS8) { + /** + * \note When passing in the strides which can not be divided by 4, the + * cublas rontine cublasGemmEx will raise a Error + * CUBLAS_STATUS_INVALID_VALUE. The error occured because the leading + * dimension of matrix A or B is illegal. + */ + return args.layout_a.stride[0] % 4 == 0 && + args.layout_b.stride[0] % 4 == 0 && + current_device_prop().major > 5; + } + return false; +} + +void MatrixMulForwardImpl::AlgoCuBlas::exec(const ExecArgs& args) const { + auto&& handle = concrete_handle(args.opr->handle()); + auto&& cublas_handle = handle->cublas_handle(); + auto&& param = args.opr->param(); + size_t m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1], + k = args.tensor_a.layout.shape[param.transposeA ? 0 : 1]; + + auto sgemm = [&]() { + auto zero = handle->zero_device(); + auto one = handle->one_device(); + cublas_check(cublasSgemm( + cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, + param.transposeA ? CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one, + args.tensor_b.ptr(), args.tensor_b.layout.stride[0], + args.tensor_a.ptr(), args.tensor_a.layout.stride[0], + zero, args.tensor_c.ptr(), + args.tensor_c.layout.stride[0])); + }; + + auto sgemm_ex = [&]() { + auto zero = handle->zero_device(); + auto one = handle->one_device(); +#if CUDART_VERSION >= 9000 + cublas_check(cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH)); +#endif + auto sgemm_ex_err = cublasSgemmEx( + cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, + param.transposeA ? 
CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one, + args.tensor_b.raw_ptr, SE_CUDA_DATA_HALF, + args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr, + SE_CUDA_DATA_HALF, args.tensor_a.layout.stride[0], zero, + args.tensor_c.raw_ptr, SE_CUDA_DATA_HALF, + args.tensor_c.layout.stride[0]); + cublas_check(sgemm_ex_err); +#if CUDART_VERSION >= 9000 + cublas_check(cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH)); +#endif + }; + + auto hgemm = [&]() { +#if CUDART_VERSION >= 9000 + cublas_check(cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH)); +#endif + auto one_half = handle->one_device_h(); + auto zero_half = handle->zero_device_h(); + auto hgemm_ex_err = cublasHgemm( + cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, + param.transposeA ? CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one_half, + static_cast(args.tensor_b.raw_ptr), + args.tensor_b.layout.stride[0], + static_cast(args.tensor_a.raw_ptr), + args.tensor_a.layout.stride[0], zero_half, + static_cast<__half*>(args.tensor_c.raw_ptr), + args.tensor_c.layout.stride[0]); + cublas_check(hgemm_ex_err); +#if CUDART_VERSION >= 9000 + cublas_check(cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH)); +#endif + }; + + auto igemm = [&]() { + auto zero = handle->zero_device_i32(); + auto one = handle->one_device_i32(); + cublas_check(cublasGemmEx( + cublas_handle, param.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, + param.transposeA ? CUBLAS_OP_T : CUBLAS_OP_N, n, m, k, one, + args.tensor_b.raw_ptr, CUDA_R_8I, + args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr, + CUDA_R_8I, args.tensor_a.layout.stride[0], zero, + args.tensor_c.raw_ptr, CUDA_R_32I, + args.tensor_c.layout.stride[0], CUDA_R_32I, CUBLAS_GEMM_DFALT)); + }; + + // Note that cublas takes column-major matrices as inputs, + // but megdnn takes row-major ones. + // So we calculate C^t = B^t * A^t by cublas. Here the transpose symbol + // implies row-major to column-major conversion. + if (args.tensor_a.layout.dtype == dtype::Float32()) { + sgemm(); + } else if (args.tensor_a.layout.dtype == dtype::Float16()) { + // use tensor core; note that CUBLAS_TENSOR_OP_MATH also causes + // cublasSgemm to round to fp16, so we can not always enable it + if (handle->device_prop().major >= 6 && + param.compute_mode == Param::ComputeMode::DEFAULT) + hgemm(); + else + sgemm_ex(); + } else if (args.can_be_treated_as_int8x8x32()) { + igemm(); + } else { + megdnn_throw("Unsupported data_type of matrix mul on cuda."); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp b/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp new file mode 100644 index 00000000..793a2bf3 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp @@ -0,0 +1,311 @@ +/** + * \file dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
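+ *
+ * Like the cuBLAS algo above, this wrapper relies on the row-major trick: a
+ * row-major m x n matrix with leading dimension ld has exactly the same memory
+ * layout as a column-major n x m matrix with leading dimension ld, so calling
+ * the column-major cublasLt routines with the operands swapped and dimensions
+ * (n, m, k) computes C^T = B^T * A^T, i.e. the row-major C = A * B that megdnn
+ * expects.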
+ */ +#include "src/cuda/matrix_mul/cublasLt_wrapper.h" +#include "src/common/utils.h" +#include "src/cuda/utils.h" +#if CUDA_VERSION >= 10010 +namespace megdnn { +namespace cuda { +static cudaDataType_t to_cuda_dtype(DType tp) { + switch (tp.enumv()) { + case DTypeEnum::Float16: + return CUDA_R_16F; + case DTypeEnum::Float32: + return CUDA_R_32F; + case DTypeEnum::Int8: + case DTypeEnum::QuantizedS8: + return CUDA_R_8I; + case DTypeEnum::Int32: + case DTypeEnum::QuantizedS32: + return CUDA_R_32I; + default: + megdnn_throw(megdnn_mangle( + "dtype must be float16/float32/int8/qs8/int32")); + } +} +static const char* cuda_type_to_str(cudaDataType_t tp) { + switch (tp) { + case CUDA_R_16F: + return "CUDA_R_16F"; + case CUDA_R_32F: + return "CUDA_R_32F"; + case CUDA_R_8I: + return "CUDA_R_8I"; + case CUDA_R_32I: + return "CUDA_R_32I"; + default: + megdnn_throw( + megdnn_mangle("dtype must be float16/float32/int8/int32")); + } +} +static size_t cuda_dtype_size(cudaDataType_t dt) { + switch (dt) { + case CUDA_R_8I: + return 1_z; + case CUDA_R_16F: + return 2_z; + case CUDA_R_32F: + case CUDA_R_32I: + return 4_z; + default: + megdnn_throw( + megdnn_mangle("dtype must be float16/float32/int8/int32")); + } +} +CUBLASLTMatmulDesc::~CUBLASLTMatmulDesc() { + if (matmul_desc) + cublas_check(cublasLtMatmulDescDestroy(matmul_desc)); + if (layout_a) + cublas_check(cublasLtMatrixLayoutDestroy(layout_a)); + if (layout_b) + cublas_check(cublasLtMatrixLayoutDestroy(layout_b)); + if (layout_c) + cublas_check(cublasLtMatrixLayoutDestroy(layout_c)); + if (layout_trans_a) + cublas_check(cublasLtMatrixLayoutDestroy(layout_trans_a)); + if (layout_trans_b) + cublas_check(cublasLtMatrixLayoutDestroy(layout_trans_b)); + if (layout_trans_c) + cublas_check(cublasLtMatrixLayoutDestroy(layout_trans_c)); +} +void CUBLASLTMatmulDesc::set(const SizeArgs& args, bool batched) { + cublasOperation_t trans_a, trans_b; + auto m = args.layout_c.shape[batched ? 1 : 0], + n = args.layout_c.shape[batched ? 2 : 1]; + auto k = batched ? args.layout_a.shape[args.transposeA ? 1 : 2] + : args.layout_a.shape[args.transposeA ? 0 : 1]; + int batch = (batched ? args.layout_a.shape[0] : 1); + uint32_t pm = CUBLAS_POINTER_MODE_DEVICE; + dt_b = to_cuda_dtype(args.layout_b.dtype); + dt_a = to_cuda_dtype(args.layout_a.dtype); + dt_compute = dt_c = to_cuda_dtype(args.layout_c.dtype); + megdnn_assert(dt_a == dt_b, "matrix A and B should have same precision"); + cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute)); + cublas_check(cublasLtMatmulDescSetAttribute( + matmul_desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pm, sizeof(pm))); + + cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; + cublasLtOrder_t order_COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C; + /** + * \NOTE that cublas takes column-major matrices as inputs, + * but megdnn takes row-major ones. + * So we calculate C^t = B^t * A^t by cublas. Here the transpose symbol + * implies row-major to column-major conversion + */ + if (dt_compute == CUDA_R_32I) { + /** + * \NOTE: To use IMMA kernels, use computeType = CUDA_R_32I and + * CUBLASLT_ORDER_COL32 for matrices A,C,D and + * CUBLASLT_ORDER_COL4_4R2_8C for matrix B. 
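+ *
+ * Because of that operand swap, megdnn's B takes the role of cublasLt's
+ * matrix A (COL32) and megdnn's A takes the role of matrix B (COL4_4R2_8C).
+ * The transformed leading dimensions below follow what the cublasLt
+ * documentation prescribes for these orders (stated here as an assumption):
+ * ld = 32 * rows for COL32, ld = 32 * round_up(rows, 8) for COL4_4R2_8C, and
+ * a per-batch stride of round_up(cols, 32) / 32 * ld elements.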
+ */ + int ldbtransform, ldatransform, ldctransform; + size_t stride_b_trans, stride_a_trans, stride_c_trans; + ldbtransform = 32 * n; + ldatransform = 32 * round_up(m, 8); + ldctransform = 32 * n; + stride_b_trans = round_up(k, 32) / 32 * ldbtransform; + stride_a_trans = round_up(k, 32) / 32 * ldatransform; + stride_c_trans = round_up(m, 32) / 32 * ldctransform; + trans_b = CUBLAS_OP_T; + cublas_check(cublasLtMatmulDescSetAttribute(matmul_desc, + CUBLASLT_MATMUL_DESC_TRANSB, + &trans_b, sizeof(trans_b))); + // origin layout + cublas_check(cublasLtMatrixLayoutCreate( + &layout_b, dt_b, n, k, args.layout_b.stride[batched ? 1 : 0])); + cublas_check(cublasLtMatrixLayoutCreate( + &layout_a, dt_a, k, m, args.layout_a.stride[batched ? 1 : 0])); + cublas_check(cublasLtMatrixLayoutCreate( + &layout_c, dt_c, n, m, args.layout_c.stride[batched ? 1 : 0])); + // transformed layout + cublas_check(cublasLtMatrixLayoutCreate(&layout_trans_b, dt_b, n, k, + ldbtransform)); + cublas_check(cublasLtMatrixLayoutCreate(&layout_trans_a, dt_a, m, k, + ldatransform)); + cublas_check(cublasLtMatrixLayoutCreate(&layout_trans_c, dt_c, n, m, + ldctransform)); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_trans_b, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, + sizeof(order_COL32))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_trans_a, CUBLASLT_MATRIX_LAYOUT_ORDER, + &order_COL4_4R2_8C, sizeof(order_COL4_4R2_8C))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_trans_c, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, + sizeof(order_COL32))); + if (batched) { + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_trans_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, + sizeof(batch))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_trans_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, + sizeof(batch))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_trans_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, + sizeof(batch))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_trans_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &stride_b_trans, sizeof(stride_b_trans))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_trans_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &stride_a_trans, sizeof(stride_a_trans))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_trans_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &stride_c_trans, sizeof(stride_c_trans))); + } + workspace_b = batch * cuda_dtype_size(dt_b) * stride_b_trans; + workspace_a = batch * cuda_dtype_size(dt_a) * stride_a_trans; + workspace_c = batch * cuda_dtype_size(dt_c) * stride_c_trans; + } else { + trans_b = args.transposeB ? CUBLAS_OP_T : CUBLAS_OP_N; + trans_a = args.transposeA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublas_check(cublasLtMatmulDescSetAttribute(matmul_desc, + CUBLASLT_MATMUL_DESC_TRANSA, + &trans_b, sizeof(trans_b))); + cublas_check(cublasLtMatmulDescSetAttribute(matmul_desc, + CUBLASLT_MATMUL_DESC_TRANSB, + &trans_a, sizeof(trans_a))); + cublas_check(cublasLtMatrixLayoutCreate( + &layout_b, dt_b, trans_b == CUBLAS_OP_N ? n : k, + trans_b == CUBLAS_OP_N ? k : n, + args.layout_b.stride[batched ? 1 : 0])); + cublas_check(cublasLtMatrixLayoutCreate( + &layout_a, dt_a, trans_a == CUBLAS_OP_N ? k : m, + trans_a == CUBLAS_OP_N ? m : k, + args.layout_a.stride[batched ? 1 : 0])); + cublas_check(cublasLtMatrixLayoutCreate( + &layout_c, dt_c, n, m, args.layout_c.stride[batched ? 
1 : 0])); + } + size_t stride_b = args.layout_b.stride[0]; + size_t stride_a = args.layout_a.stride[0]; + size_t stride_c = args.layout_c.stride[0]; + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, + sizeof(batch))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, + sizeof(batch))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, + sizeof(batch))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, + sizeof(stride_b))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, + sizeof(stride_a))); + cublas_check(cublasLtMatrixLayoutSetAttribute( + layout_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, + sizeof(stride_c))); +} +bool CUBLASLTMatmulDesc::is_available(const SizeArgs& args, size_t ws_limit) { + bool support; + cublasLtMatmulAlgo_t algo; + switch (dt_compute) { + case CUDA_R_16F: + support = (dt_a == CUDA_R_16F); + break; + case CUDA_R_32I: { + support = (dt_a == CUDA_R_8I) && + (!args.transposeA && !args.transposeB); + break; + } + case CUDA_R_32F: + support = (dt_a == CUDA_R_16F || dt_a == CUDA_R_32F); + break; + case CUDA_R_64F: /* not support? */ + default: + support = false; + break; + } + support = support && dt_a == dt_b; + support = support && get_algorithm_heuristic(args, ws_limit, algo); + return support; +} +WorkspaceBundle CUBLASLTMatmulDesc::get_workspace_bundle( + const SizeArgs& args, const cublasLtMatmulAlgo_t& algo) { + size_t algo_workspace_size; + auto&& handle = args.handle; + auto&& cublasLt_handle = handle->cublasLt_handle(); + cublasStatus_t status; + cublasLtMatmulHeuristicResult_t result{}; + status = cublasLtMatmulAlgoCheck( + cublasLt_handle, matmul_desc, + dt_compute == CUDA_R_32I ? layout_trans_b : layout_b, + dt_compute == CUDA_R_32I ? layout_trans_a : layout_a, + dt_compute == CUDA_R_32I ? layout_trans_c : layout_c, + dt_compute == CUDA_R_32I ? layout_trans_c : layout_c, &algo, + &result); + // return empty WorkspaceBundle if cublasLtMatmulAlgoCheck() failed + if (status != CUBLAS_STATUS_SUCCESS) + return {nullptr, {}}; + algo_workspace_size = result.workspaceSize; + return {nullptr, + (dt_compute == CUDA_R_32I) + ? SmallVector{algo_workspace_size, workspace_b, + workspace_a, workspace_c} + : SmallVector{algo_workspace_size}}; +} +bool CUBLASLTMatmulDesc::get_algorithm_heuristic(const SizeArgs& args, + size_t ws_limit, + cublasLtMatmulAlgo_t& algo) { + bool result; + int return_algo_count; + size_t algo_ws_limit; + cublasStatus_t status; + cublasLtMatmulPreference_t algo_pref; + cublasLtMatmulHeuristicResult_t algo_result{}; + auto&& handle = concrete_handle(args.handle); + auto&& cublasLt_handle = handle->cublasLt_handle(); + + size_t temp = workspace_b + workspace_a + workspace_c; + algo_ws_limit = (ws_limit > temp) ? (ws_limit - temp) : 0; + + /** + * \Note: algo_ws_limit must be zero if cublasLtGetVersion() <= 10100 + */ + // algo_ws_limit = 0; + if (dt_compute == CUDA_R_32I) { + //[FIXME]: cublasLt(Version 10020) produce wrong result when k in + //[64*n+1 , 64*n+32] for small matrix + + //[TODO]: check if this bug is fixed in latter cublasLt. + size_t k_pos = (is_batched ? 1 : 0) + (args.transposeA ? 
0 : 1); + size_t k = args.layout_a.shape[k_pos]; + bool flt = (k < 65 || ((k - 1) / 32) % 2 == 1); + if (!flt) + return false; + } + result = false; + cublas_check(cublasLtMatmulPreferenceCreate(&algo_pref)); + cublas_check(cublasLtMatmulPreferenceSetAttribute( + algo_pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &algo_ws_limit, + sizeof(algo_ws_limit))); + status = cublasLtMatmulAlgoGetHeuristic( + cublasLt_handle, matmul_desc, + dt_compute == CUDA_R_32I ? layout_trans_b : layout_b, + dt_compute == CUDA_R_32I ? layout_trans_a : layout_a, + dt_compute == CUDA_R_32I ? layout_trans_c : layout_c, + dt_compute == CUDA_R_32I ? layout_trans_c : layout_c, algo_pref, 1, + &algo_result, &return_algo_count); + if (status == CUBLAS_STATUS_SUCCESS && return_algo_count > 0 && + // perform cublasLtAlgoCheck() to make sure the algo is correct + get_workspace_bundle(args, algo_result.algo).nr_workspace() > 0) { + result = true; + algo = algo_result.algo; + } + cublas_check(cublasLtMatmulPreferenceDestroy(algo_pref)); + return result; +} +} // namespace cuda +} // namespace megdnn +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/cublasLt_wrapper.h b/dnn/src/cuda/matrix_mul/cublasLt_wrapper.h new file mode 100644 index 00000000..7f061a92 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/cublasLt_wrapper.h @@ -0,0 +1,80 @@ +/** + * \file dnn/src/cuda/matrix_mul/cublasLt_wrapper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include +#include "./algos.h" +#include "megdnn/basic_types.h" +#include "megdnn/oprs/nn.h" +#include "src/common/utils.h" +#include "src/cuda/utils.h" +#if CUDA_VERSION >= 10010 +#include +namespace megdnn { +namespace cuda { +struct CUBLASLTMatmulDesc { + struct SizeArgs { + using MMSizeArgs = MatrixMulForwardImpl::AlgoBase::SizeArgs; + HandleImpl* handle; + bool transposeA, transposeB; + TensorLayout layout_a, layout_b, layout_c; + std::string to_string() const; + SizeArgs(HandleImpl* handle, bool transposeA, bool transposeB, + const TensorLayout& A, const TensorLayout& B, + const TensorLayout& C) + : handle(handle), + transposeA(transposeA), + transposeB(transposeB), + layout_a(A), + layout_b(B), + layout_c(C){}; + explicit SizeArgs(const MMSizeArgs& args) + : layout_a(args.layout_a), + layout_b(args.layout_b), + layout_c(args.layout_c) { + handle = concrete_handle(args.opr->handle()); + auto&& param = args.opr->param(); + transposeA = param.transposeA; + transposeB = param.transposeB; + }; + }; + bool is_batched; + cublasLtMatmulDesc_t matmul_desc; + cudaDataType_t dt_a, dt_b, dt_c, dt_compute; + cublasLtMatrixLayout_t layout_a, layout_b, layout_c; + cublasLtMatrixLayout_t layout_trans_a, layout_trans_b, layout_trans_c; + size_t workspace_a, workspace_b, workspace_c; + CUBLASLTMatmulDesc(const SizeArgs& args, bool batched = false) + : matmul_desc(nullptr), + layout_a(nullptr), + layout_b(nullptr), + layout_c(nullptr), + layout_trans_a(nullptr), + layout_trans_b(nullptr), + layout_trans_c(nullptr), + workspace_a(0), + workspace_b(0), + workspace_c(0) { + is_batched = batched; + set(args, batched); + } + ~CUBLASLTMatmulDesc(); + void set(const SizeArgs& args, bool batched = false); + void reset(); + bool 
get_algorithm_heuristic(const SizeArgs& args, size_t ws_limit, + cublasLtMatmulAlgo_t& algo); + WorkspaceBundle get_workspace_bundle(const SizeArgs& args, + const cublasLtMatmulAlgo_t& algo); + bool is_available(const SizeArgs& args, size_t ws_limit); +}; +} // namespace cuda +} // namespace megdnn +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/cublas_lt.cpp b/dnn/src/cuda/matrix_mul/cublas_lt.cpp new file mode 100644 index 00000000..42f6bca2 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/cublas_lt.cpp @@ -0,0 +1,145 @@ +/** + * \file dnn/src/cuda/matrix_mul/cublas_lt.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./algos.h" +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" +#include "src/cuda/matrix_mul/cublasLt_wrapper.h" +#if CUDA_VERSION >= 10010 +using namespace megdnn; +using namespace cuda; + +bool MatrixMulForwardImpl::AlgoCuBlasLt::is_available( + const SizeArgs &args) const { + if (args.opr->param().format != param::MatrixMul::Format::DEFAULT) + return false; + if (args.layout_a.dtype.enumv() == DTypeEnum::Quantized4Asymm) + return false; + CUBLASLTMatmulDesc::SizeArgs ltArgs(args); + return CUBLASLTMatmulDesc(ltArgs).is_available(ltArgs, INT_MAX); +} +size_t MatrixMulForwardImpl::AlgoCuBlasLt::get_workspace_in_bytes( + const SizeArgs& args) const { + CUBLASLTMatmulDesc::SizeArgs ltArgs(args); + cublasLtMatmulAlgo_t algo; + CUBLASLTMatmulDesc desc(ltArgs); + desc.get_algorithm_heuristic(ltArgs, INT_MAX, algo); + return desc.get_workspace_bundle(ltArgs, algo).total_size_in_bytes(); +} +void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { + CUBLASLTMatmulDesc::SizeArgs ltArgs(args); + cublasLtMatmulAlgo_t algo; + CUBLASLTMatmulDesc desc(ltArgs); + auto&& handle = ltArgs.handle; + auto&& stream = handle->stream(); + auto&& cublasLt_handle = handle->cublasLt_handle(); + desc.get_algorithm_heuristic(ltArgs, INT_MAX, algo); + auto&& ws_bundle = desc.get_workspace_bundle(ltArgs, algo); + ws_bundle.set(args.workspace.raw_ptr); + + auto sgemm = [&]() { + auto zero = handle->zero_device(); + auto one = handle->one_device(); + megdnn_assert(ws_bundle.nr_workspace() == 1, + "workspace bundle size should be 1(ws_algo)"); + cublas_check(cublasLtMatmul(cublasLt_handle, + desc.matmul_desc, + one, + static_cast(args.tensor_b.ptr()), desc.layout_b, + static_cast(args.tensor_a.ptr()), desc.layout_a, + zero, + static_cast(args.tensor_c.ptr()), desc.layout_c, + static_cast(args.tensor_c.ptr()), desc.layout_c, + &algo, + ws_bundle.get(0), ws_bundle.get_size(0), + stream + )); + }; + auto hgemm = [&]() { + auto zero_half = handle->zero_device_h(); + auto one_half = handle->one_device_h(); + megdnn_assert(ws_bundle.nr_workspace() == 1, + "workspace bundle size should be 1(ws_algo)"); + cublas_check(cublasLtMatmul(cublasLt_handle, + desc.matmul_desc, + one_half, + static_cast(args.tensor_b.raw_ptr), desc.layout_b, + static_cast(args.tensor_a.raw_ptr), desc.layout_a, + zero_half, + static_cast(args.tensor_c.raw_ptr), desc.layout_c, + static_cast<__half *>(args.tensor_c.raw_ptr), desc.layout_c, + &algo, + ws_bundle.get(0), ws_bundle.get_size(0), + stream + )); + }; + auto igemm = [&]() { + auto zero = 
handle->zero_device(); + auto one = handle->one_device(); + megdnn_assert(ws_bundle.nr_workspace() == 4, + "workspace bundle size should be 4(ws_algo, ws_a, ws_b, ws_c)"); + void *ws_b = ws_bundle.get(1); + void *ws_a = ws_bundle.get(2); + void *ws_c = ws_bundle.get(3); + int32_t pm=CUBLAS_POINTER_MODE_DEVICE; + cublasOperation_t trans_a=CUBLAS_OP_T, trans_c=CUBLAS_OP_N; + cublasLtMatrixTransformDesc_t transform_desc = nullptr; + cublas_check(cublasLtMatrixTransformDescCreate(&transform_desc, CUDA_R_32F)); + cublas_check(cublasLtMatrixTransformDescSetAttribute(transform_desc, + CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &pm, sizeof(pm))); + cublas_check(cublasLtMatrixTransform(cublasLt_handle, transform_desc, + one, args.tensor_b.raw_ptr, desc.layout_b, + zero, nullptr, nullptr, + ws_b, desc.layout_trans_b, + stream)); + cublas_check(cublasLtMatrixTransformDescSetAttribute(transform_desc, + CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &trans_a, sizeof(trans_a))); + cublas_check(cublasLtMatrixTransform(cublasLt_handle, transform_desc, + one, args.tensor_a.raw_ptr, desc.layout_a, + zero, nullptr, nullptr, + ws_a, desc.layout_trans_a, + stream)); + cublas_check(cublasLtMatmul(cublasLt_handle, desc.matmul_desc, + one, + ws_b, desc.layout_trans_b, + ws_a, desc.layout_trans_a, + zero, + ws_c, desc.layout_trans_c, + ws_c, desc.layout_trans_c, + &algo, + ws_bundle.get(0), + ws_bundle.get_size(0), + stream)); + cublas_check(cublasLtMatrixTransformDescSetAttribute(transform_desc, + CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &trans_c, sizeof(trans_c))); + cublas_check(cublasLtMatrixTransform(cublasLt_handle, transform_desc, + one, ws_c, desc.layout_trans_c, + zero, nullptr, nullptr, + args.tensor_c.raw_ptr, desc.layout_c, + stream)); + cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc)); + }; + switch(desc.dt_compute) { + case CUDA_R_16F: + hgemm(); + break; + case CUDA_R_32F: + sgemm(); + break; + case CUDA_R_32I: + igemm(); + break; + default: + megdnn_throw(megdnn_mangle("compute type must be float16/float32/int32")); + } +} +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/naive.cpp b/dnn/src/cuda/matrix_mul/naive.cpp new file mode 100644 index 00000000..1ae438f4 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/naive.cpp @@ -0,0 +1,40 @@ +/** + * \file dnn/src/cuda/matrix_mul/naive.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/matrix_mul/naive.cuh" +#include +#include "src/cuda/matrix_mul/algos.h" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +bool MatrixMulForwardImpl::AlgoNaive::is_available(const SizeArgs& args) const { + return args.can_be_treated_as_int8x8x32(); +} +void MatrixMulForwardImpl::AlgoNaive::exec(const ExecArgs& args) const { + auto&& param = args.opr->param(); + auto m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1], + k = args.tensor_a.layout.shape[param.transposeA ? 
0 : 1]; + auto LDA = args.tensor_a.layout.stride[0], + LDB = args.tensor_b.layout.stride[0], + LDC = args.tensor_c.layout.stride[0]; + + int8_t* A = args.tensor_a.compatible_ptr(); + int8_t* B = args.tensor_b.compatible_ptr(); + int32_t* C = args.tensor_c.compatible_ptr(); + + auto&& handle = concrete_handle(args.opr->handle()); + exec_gemm_int8_naive(A, B, C, m, n, k, LDA, LDB, LDC, param.transposeA, + param.transposeB, cuda_stream(handle)); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/naive.cu b/dnn/src/cuda/matrix_mul/naive.cu new file mode 100644 index 00000000..05716cb1 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/naive.cu @@ -0,0 +1,50 @@ +/** + * \file dnn/src/cuda/matrix_mul/naive.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include +#include "src/cuda/matrix_mul/naive.cuh" +#include "src/cuda/utils.cuh" + +namespace { +__global__ void do_exec(const int8_t* A, const int8_t* B, int32_t* C, size_t M, + size_t N, size_t K, size_t LDA, size_t LDB, size_t LDC, + bool transA, bool transB) { + size_t m = blockIdx.x; + for (; m < M; m += gridDim.x) { + size_t n = threadIdx.x; + for (; n < N; n += blockDim.x) { + int32_t res = 0; + for (size_t k = 0; k < K; ++k) { + int8_t av = transA ? A[k * LDA + m] : A[m * LDA + k], + bv = transB ? B[n * LDB + k] : B[k * LDB + n]; + res += av * bv; + } + C[m * LDC + n] = res; + } + } +} +} // namespace + +namespace megdnn { +namespace cuda { + +void exec_gemm_int8_naive(const int8_t* A, const int8_t* B, int32_t* C, + size_t M, size_t N, size_t K, size_t LDA, size_t LDB, + size_t LDC, bool transA, bool transB, + cudaStream_t stream) { + do_exec<<<128, 128, 0, stream>>>(A, B, C, M, N, K, LDA, LDB, LDC, transA, + transB); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/naive.cuh b/dnn/src/cuda/matrix_mul/naive.cuh new file mode 100644 index 00000000..350ab181 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/naive.cuh @@ -0,0 +1,25 @@ +/** + * \file dnn/src/cuda/matrix_mul/naive.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +void exec_gemm_int8_naive(const int8_t* A, const int8_t* B, int32_t* C, + size_t m, size_t n, size_t k, size_t ldA, size_t ldB, + size_t ldC, bool transA, bool transB, + cudaStream_t stream); +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/opr_impl.cpp b/dnn/src/cuda/matrix_mul/opr_impl.cpp new file mode 100644 index 00000000..bc52b4ae --- /dev/null +++ b/dnn/src/cuda/matrix_mul/opr_impl.cpp @@ -0,0 +1,83 @@ +/** + * \file dnn/src/cuda/matrix_mul/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/matrix_mul/opr_impl.h" +#include "./algos.h" +#include "src/common/algo_chooser.h" + +#include +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" +#include "src/cuda/matrix_mul/cublasLt_wrapper.h" + +namespace megdnn { +namespace cuda { + +std::vector +MatrixMulForwardImpl::get_all_algorithms(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C) { + AlgoBase::SizeArgs args{this, A, B, C}; + return megdnn::get_all_algorithms(args); +} + +MatrixMulForwardImpl::Algorithm* MatrixMulForwardImpl::get_algorithm_heuristic( + const TensorLayout& A, const TensorLayout& B, const TensorLayout& C, + size_t workspace_limit_in_bytes, bool reproducible) { + AlgoBase::SizeArgs args{this, A, B, C}; + if (sm_algo_pack.cublas.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.cublas; + } +#if CUDA_VERSION >= 10010 + if (sm_algo_pack.cublas_lt.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.cublas_lt; + } +#endif + +#if CUDA_VERSION >= 10000 + if (sm_algo_pack.wmma_uint4x4x32.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.wmma_uint4x4x32; + } +#endif + + if (reproducible) { + return megdnn::get_reproducible_algo( + sm_algo_pack.all_algos, args, workspace_limit_in_bytes, + "matrix mul forward"); + } else { + return megdnn::get_usable_algo( + sm_algo_pack.all_algos, args, workspace_limit_in_bytes, + "matrix mul forward"); + } +} + +size_t MatrixMulForwardImpl::get_workspace_in_bytes(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C) { + AlgoBase::SizeArgs args{this, A, B, C}; + return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); +} + +void MatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, + _megdnn_tensor_out C, + _megdnn_workspace workspace) { + check_exec(A.layout, B.layout, C.layout, workspace.size); + AlgoBase::ExecArgs args(this, A, B, C, workspace); + auto&& algo = get_algorithm(this, A.layout, B.layout, C.layout); + algo->check_workspace(args, workspace).exec(args); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/opr_impl.h b/dnn/src/cuda/matrix_mul/opr_impl.h new file mode 100644 index 00000000..b7ea9361 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/opr_impl.h @@ -0,0 +1,63 @@ +/** + * \file dnn/src/cuda/matrix_mul/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
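+ *
+ * Typical use through the generic megdnn handle API, as a minimal sketch;
+ * `handle`, the TensorND values A/B/C and the device workspace buffer
+ * `workspace_ptr` (dt_byte*) are assumed to be prepared by the caller:
+ *
+ * \code
+ * auto opr = handle->create_operator<MatrixMulForward>();
+ * opr->param().transposeA = false;
+ * opr->param().transposeB = false;
+ * size_t ws_size = opr->get_workspace_in_bytes(A.layout, B.layout, C.layout);
+ * // workspace_ptr must point to at least ws_size bytes of device memory
+ * opr->exec(A, B, C, {workspace_ptr, ws_size});
+ * \endcode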
+ */ +#pragma once +#include "megdnn/oprs.h" +#include + +namespace megdnn { +namespace cuda { + +class MatrixMulForwardImpl : public MatrixMulForward { +public: + using MatrixMulForward::MatrixMulForward; + void exec(_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, + const TensorLayout&) override; + + bool is_thread_safe() const override { return true; } + + std::vector get_all_algorithms(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& A, + const TensorLayout& B, + const TensorLayout& C, + size_t workspace_limit_in_bytes, + bool reproducible) override; + + const char* get_algorithm_set_name() const override { + return "CUDA MATMUL"; + } + + class AlgoBase; + class AlgoCuBlas; +#if CUDA_VERSION >= 10000 + class AlgoUInt4x4x32WMMA; +#endif +#if CUDA_VERSION >= 10010 + class AlgoCuBlasLt; +#endif + class AlgoNaive; + class AlgoPack; + + static const AlgoPack& algo_pack() { + return sm_algo_pack; + } + +private: + static AlgoPack sm_algo_pack; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma.cpp b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma.cpp new file mode 100644 index 00000000..73ecbc99 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma.cpp @@ -0,0 +1,63 @@ +/** + * \file dnn/src/cuda/matrix_mul/uint4x4x32_wmma.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
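+ *
+ * Quantized4Asymm data is packed two 4-bit elements per byte, which is why
+ * is_available() below requires even row strides and why the byte leading
+ * dimensions handed to the preprocessing kernels are ldA / 2 and ldB / 2
+ * (see wmma_matrix_mul.cpp). Illustrative unpacking of one packed byte (the
+ * low-nibble-first ordering is an assumption here):
+ *
+ * \code
+ * uint8_t byte = packed[i];
+ * uint8_t v0 = byte & 0xF;         // element 2 * i
+ * uint8_t v1 = (byte >> 4) & 0xF;  // element 2 * i + 1
+ * \endcode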
+ */ + +#include "./algos.h" + +#include "src/cuda/utils.h" +#include "src/cuda/handle.h" +#include "src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.h" + +using namespace megdnn; +using namespace cuda; +using namespace matrix_mul; + +#if CUDA_VERSION >= 10000 +bool MatrixMulForwardImpl::AlgoUInt4x4x32WMMA::is_available( + const SizeArgs& args) const { + if (args.opr->param().format != param::MatrixMul::Format::DEFAULT) + return false; + auto&& device_prop = current_device_prop(); + if (device_prop.major < 7 || + (device_prop.major == 7 && device_prop.minor < 5)) { + return false; + } + auto&& param = args.opr->param(); + if (!param.transposeA && param.transposeB) { + bool available = + args.layout_a.dtype.enumv() == DTypeEnum::Quantized4Asymm && + args.layout_c.dtype.enumv() == DTypeEnum::QuantizedS32; + size_t m = args.layout_c.shape[0], n = args.layout_c.shape[1]; + available &= (m % 8 == 0) && (n % 8 == 0); + available &= (args.layout_a.stride[0] % 2 == 0) && + (args.layout_b.stride[0] % 2 == 0); + return available; + } + return false; +} + +size_t MatrixMulForwardImpl::AlgoUInt4x4x32WMMA::get_workspace_in_bytes( + const SizeArgs& args) const { + size_t m = args.layout_c.shape[0], n = args.layout_c.shape[1]; + return (m + n) * sizeof(int32_t); +} + +void MatrixMulForwardImpl::AlgoUInt4x4x32WMMA::exec(const ExecArgs& args) const { + auto&& handle = concrete_handle(args.opr->handle()); + auto&& param = args.opr->param(); + if (!param.transposeA && param.transposeB) { + exec_wmma_matrix_mul_quint4_nt(args.tensor_a, args.tensor_b, + args.tensor_c, args.workspace, + handle->stream()); + } +} +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cu b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cu new file mode 100644 index 00000000..c27d77a4 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cu @@ -0,0 +1,205 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./preprocess_quantize_sum.cuh" + +#include +#include + +#include "src/cuda/cub/util_ptx.cuh" +#include "src/cuda/utils.cuh" + +namespace { + +template +__global__ void reduce_column_with_scale_u4(const uint8_t* src, int32_t scale, + int rows, int cols_int32, + int ld_in_bytes, + int nr_thread_per_row_log2, + int sm_width_in_bytes, + int32_t* dst) { + constexpr int warp_size = 32; + extern __shared__ uint8_t sub_block_raw[]; + + uint32_t nr_row_per_block = 1 << (block_size_log2 - nr_thread_per_row_log2), + nr_threads_per_row = 1 << nr_thread_per_row_log2, + row_num = threadIdx.x >> nr_thread_per_row_log2, + tid = threadIdx.x - (row_num << nr_thread_per_row_log2), + row_idx = blockIdx.x * nr_row_per_block + row_num; + if (row_idx >= rows) + return; + + volatile int32_t* row = + (int32_t*)(sub_block_raw + row_num * sm_width_in_bytes); + const int32_t* sptr = (const int32_t*)(src + row_idx * ld_in_bytes); + sptr += tid; + int32_t local = 0; + for (int i = tid; i < cols_int32; i += nr_threads_per_row) { + int32_t val = (*sptr); +#pragma unroll + for (int j = 0; j < 8; j++) { + local += (val & 0xF); + val = (val >> 4); + } + sptr += nr_threads_per_row; + } + row[tid] = local; + +#pragma unroll + for (int i = max_nr_threads_per_row / 2; i; i >>= 1) { + bool cond = nr_threads_per_row >= (i * 2) && tid < i; + if (i >= warp_size) { + __syncthreads(); + } else { + cub::WARP_SYNC(0xffffffff); + } + if (cond) { + row[tid] += row[tid + i]; + } + } + if (!tid) { + int32_t* dptr = dst + row_idx; + *dptr = row[0] * scale; + } +} + +template +__global__ void span_qsum(const int32_t* qSumA, const uint32_t M, + const int32_t* qSumB, const uint32_t N, int32_t* dst, + const uint32_t strd, const int32_t scaler_bias) { + constexpr size_t mm = (BY + TY - 1) / TY; + constexpr size_t nn = (BX + TX - 1) / TX; + +#pragma unroll + for (int i = 0; i < mm; ++i) { +#pragma unroll + for (int j = 0; j < nn; ++j) { + int gtidx = threadIdx.x + TX * j + blockIdx.x * BX; + int gtidy = threadIdx.y + TY * i + blockIdx.y * BY; + if (gtidx < N && gtidy < M) { + dst[gtidy * strd + gtidx] += + qSumA[gtidy] + qSumB[gtidx] + scaler_bias; + } + } + } +} + +template +void _do_dispatch_reduce_column_with_scale_u4(const uint8_t* src, int32_t scale, + int rows, int cols_int32, + int ld_in_bytes, int32_t* dst, + cudaStream_t stream) { + constexpr int warp_size = 32; + int block_size = 1 << block_size_log2; + int nr_thread_per_row = 1, nr_thread_per_row_log2 = 0; + + while 
(nr_thread_per_row < max_nr_threads_per_row && + nr_thread_per_row * 2 < cols_int32) { + ++nr_thread_per_row_log2; + nr_thread_per_row *= 2; + } + // now: nr_thread_per_row <= B < nr_thread_per_row * 2 + + if (cols_int32 <= max_nr_threads_per_row * 4) { + // find nr_thread_per_row with minimal wasted threads + int min_cost = std::numeric_limits::max(), min_cost_th = 0; + for (int i = warp_size; i <= nr_thread_per_row; i *= 2) { + int cost = (i - cols_int32 % i) % i; + if (cost < min_cost) { + min_cost = cost; + min_cost_th = i; + } + } + if (min_cost_th) { + nr_thread_per_row = min_cost_th; + while ((1 << nr_thread_per_row_log2) != nr_thread_per_row) + --nr_thread_per_row_log2; + } + } + + int nr_row_per_block = block_size / nr_thread_per_row, + nr_blk = DIVUP(rows, nr_row_per_block), + sm_width_word32 = nr_thread_per_row; + + // gcd(sm_width_word32, BANKS) should be 1 to avoid bank confliction + // iff sm_width_word32 is odd + sm_width_word32 += !(sm_width_word32 % 2); + int sm_width_in_bytes = sm_width_word32 * 4, + sm_size = nr_row_per_block * sm_width_in_bytes; + + void (*kptr)(const uint8_t* src, int32_t scale, int rows, int cols_int32, + int ld_in_bytes, int nr_thread_per_row_log2, + int sm_width_in_bytes, int32_t* dst); + if (nr_thread_per_row <= max_nr_threads_per_row / 4) { + kptr = reduce_column_with_scale_u4; + } else if (nr_thread_per_row <= max_nr_threads_per_row / 2) { + kptr = reduce_column_with_scale_u4; + } else { + kptr = reduce_column_with_scale_u4; + } + kptr<<>>( + src, scale, rows, cols_int32, ld_in_bytes, nr_thread_per_row_log2, + sm_width_in_bytes, dst); + after_kernel_launch(); +} + +} // namespace + +void megdnn::cuda::exec_reduce_sum_with_scale_uint4( + const uint8_t* A, int32_t scale, uint32_t M, uint32_t K, + uint32_t ldA_in_byte, int32_t* dst, cudaStream_t stream) { + _do_dispatch_reduce_column_with_scale_u4<7, 64>(A, scale, M, K / 8, + ldA_in_byte, dst, stream); +} + +void megdnn::cuda::exec_span_qsum(const int32_t* qSumA, const uint32_t M, + const int32_t* qSumB, const uint32_t N, + int32_t* dst, const uint32_t strd, + const int32_t scaler_bias, + cudaStream_t stream) { + constexpr size_t TX = 32, TY = 32; + constexpr size_t BX = 32, BY = 32; + dim3 nthreads{TX, TY}; + dim3 nblocks{static_cast(DIVUP(N, BX)), + static_cast(DIVUP(M, BY))}; + span_qsum<<>>(qSumA, M, qSumB, N, dst, strd, + scaler_bias); + after_kernel_launch(); +} + +// vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cuh b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cuh new file mode 100644 index 00000000..f24724bc --- /dev/null +++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cuh @@ -0,0 +1,53 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +void exec_reduce_sum_with_scale_uint4(const uint8_t* A, int32_t scale, + uint32_t M, uint32_t K, + uint32_t ldA_in_byte, int32_t* dst, + cudaStream_t stream); + +void exec_span_qsum(const int32_t* qSumA, const uint32_t M, + const int32_t* qSumB, const uint32_t N, int32_t* dst, + const uint32_t strd, const int32_t scaler_bias, + cudaStream_t stream); +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.cpp b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.cpp new file mode 100644 index 00000000..50216b56 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.cpp @@ -0,0 +1,45 @@ +/** + * \file dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
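+ *
+ * exec_wmma_matrix_mul_quint4_nt() below runs the quint4 x quint4 -> s32
+ * matmul on the raw (un-shifted) 4-bit values and repairs the asymmetric zero
+ * points afterwards, using
+ *
+ *   sum_k (a_mk - zA) * (b_nk - zB)
+ *     = sum_k a_mk * b_nk - zB * sum_k a_mk - zA * sum_k b_nk + K * zA * zB
+ *
+ * The two scaled row sums (-zB * rowsum(A) and -zA * rowsum(B)) are written to
+ * the workspace by exec_reduce_sum_with_scale_uint4(), and exec_span_qsum()
+ * adds them together with the constant K * zA * zB onto every output element.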
+ */ + +#include "./wmma_matrix_mul.h" +#include "./preprocess_quantize_sum.cuh" +#include "./wmma_matrix_mul_u4.cuh" +#include "src/cuda/utils.h" + +#include + +using namespace megdnn; +using namespace cuda; + +#if CUDA_VERSION >= 10000 +void megdnn::cuda::matrix_mul::exec_wmma_matrix_mul_quint4_nt( + _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, + _megdnn_workspace workspace, cudaStream_t stream) { + int32_t M = C.layout.shape[0], N = C.layout.shape[1], K = A.layout.shape[1]; + int32_t ldA = A.layout.stride[0], ldB = B.layout.stride[0], + ldC = C.layout.stride[0]; + int32_t zA = A.layout.dtype.param().zero_point, + zB = B.layout.dtype.param().zero_point; + exec_reduce_sum_with_scale_uint4(static_cast(A.raw_ptr), -zB, M, + K, ldA / 2, workspace.ptr(), + stream); + exec_reduce_sum_with_scale_uint4(static_cast(B.raw_ptr), -zA, N, + K, ldB / 2, workspace.ptr() + M, + stream); + exec_wmma_gemm_u4( + static_cast(A.raw_ptr), static_cast(B.raw_ptr), + C.compatible_ptr(), M, N, K, ldA, ldB, ldC, stream); + exec_span_qsum(workspace.ptr(), M, workspace.ptr() + M, N, + C.compatible_ptr(), ldC, K * zA * zB, stream); +} +#endif // CUDA_VERSION + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.h b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.h new file mode 100644 index 00000000..2e218742 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.h @@ -0,0 +1,26 @@ +/** + * \file dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { +namespace matrix_mul { +void exec_wmma_matrix_mul_quint4_nt(_megdnn_tensor_in A, _megdnn_tensor_in B, + _megdnn_tensor_out C, + _megdnn_workspace workspace, + cudaStream_t stream); +} // namespace matrix_mul +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu new file mode 100644 index 00000000..0fb02c1e --- /dev/null +++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu @@ -0,0 +1,365 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
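/*
 * A plain CPU reference (illustrative only, not the CUDA path) of the
 * zero-point handling performed by exec_wmma_matrix_mul_quint4_nt above.
 * For quantized inputs with zero points zA, zB:
 *   sum_k (a[m][k]-zA)*(b[n][k]-zB)
 *     = sum_k a[m][k]*b[n][k]          (raw u4 WMMA GEMM)
 *       - zB * sum_k a[m][k]           (row sums of A, scaled by -zB)
 *       - zA * sum_k b[n][k]           (row sums of B, scaled by -zA)
 *       + K * zA * zB                  (the scaler_bias of exec_span_qsum)
 * Unlike the kernels, the inputs here are unpacked to one value per byte.
 */
#include <cstdint>
#include <vector>

std::vector<int32_t> ref_quint4_gemm_nt(const std::vector<uint8_t>& a,  // M*K
                                        const std::vector<uint8_t>& b,  // N*K
                                        int M, int N, int K,
                                        int32_t zA, int32_t zB) {
    std::vector<int32_t> c(M * N);
    for (int m = 0; m < M; ++m) {
        int32_t row_a = 0;
        for (int k = 0; k < K; ++k) row_a += a[m * K + k];
        for (int n = 0; n < N; ++n) {
            int32_t dot = 0, row_b = 0;
            for (int k = 0; k < K; ++k) {
                dot += int32_t(a[m * K + k]) * b[n * K + k];
                row_b += b[n * K + k];
            }
            c[m * N + n] = dot - zB * row_a - zA * row_b + K * zA * zB;
        }
    }
    return c;
}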
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/utils.cuh" + +#include +#if CUDA_VERSION >= 10000 + +#if __CUDA_ARCH__ >= 730 +#include +using namespace nvcuda; +using namespace wmma::experimental::precision; +#endif + +namespace wmma_matrix_mul_u4 { + +constexpr size_t WMMA_M = 8; +constexpr size_t WMMA_N = 8; +constexpr size_t WMMA_K = 32; +constexpr size_t WARP_SIZE = 32; + +template +struct BlockConfig { + static const size_t WARP_X = WARP_X_; + static const size_t WARP_Y = WARP_Y_; + static const size_t ROW_PER_WARP = ROW_PER_WARP_; + static const size_t COL_PER_WARP = COL_PER_WARP_; + static const size_t BK = 256; + static const size_t BM = (WARP_Y * WMMA_M * ROW_PER_WARP); + static const size_t BN = (WARP_X * WMMA_N * COL_PER_WARP); + static const size_t WARPS_PER_BLOCK = WARP_X * WARP_Y; +}; + +template +struct GlobalToShareMemStreamConfig { + static const size_t BlockSize = BlockSize_; + static const size_t CACHE_SIZE = + (BlockSize + BlockConfig_::WARPS_PER_BLOCK - 1) / + BlockConfig_::WARPS_PER_BLOCK; + static const size_t SMEM_ROW = BlockSize; + static const size_t SMEM_COL = BlockConfig_::BK; + static const size_t SMEM_SKEW = + WMMA_K * ((BlockConfig_::BK / WMMA_K) % 2 == 0); + static const size_t SMEM_STRIDE = SMEM_COL + SMEM_SKEW; +}; + +#if __CUDA_ARCH__ >= 730 +template +struct GlobalToShareMemStream { + MEGDNN_STATIC_ASSERT(GlobalToShareMemStreamConfig_::BlockSize == + GlobalToShareMemStreamConfig_::CACHE_SIZE * BlockConfig_::WARPS_PER_BLOCK, + "Block size mismatch"); + + uint8_t* smem; + const uint8_t* g_ptr; + int ld; + int row_remain; + int k_base; + int K; + + const int warp_x = threadIdx.x / WARP_SIZE; + const int warp_y = threadIdx.y; + const int idx_in_warp = threadIdx.x % WARP_SIZE; + const int warp_id = warp_y * BlockConfig_::WARP_X + warp_x; + + typedef int32_t copy_t; + copy_t reg_cache[GlobalToShareMemStreamConfig_::CACHE_SIZE]; + + __device__ GlobalToShareMemStream(uint8_t* smem, const uint8_t* g_ptr, + int ld, int row_remain, int K) + : smem{smem}, g_ptr{g_ptr}, ld{ld}, row_remain{row_remain}, K{K} { + k_base = 0; + } + + __device__ __forceinline__ void copy() { + int col = k_base + idx_in_warp * 8; +#pragma unroll + for (int i = 0; i < GlobalToShareMemStreamConfig_::CACHE_SIZE; 
i++) { + int row = i * BlockConfig_::WARPS_PER_BLOCK + warp_id; + bool cond = row < row_remain && col < K; + if (cond) { + copy_t val = *(copy_t*)(&g_ptr[(row * ld + col) / 2]); + reg_cache[i] = val; + } else { + reg_cache[i] = 0; + } + } + } + + __device__ __forceinline__ void commit() { + int col = idx_in_warp * 8; +#pragma unroll + for (int i = 0; i < GlobalToShareMemStreamConfig_::CACHE_SIZE; i++) { + int row = i * BlockConfig_::WARPS_PER_BLOCK + warp_id; + *(copy_t*)(get_smem_ptr(row, col)) = reg_cache[i]; + } + } + + __device__ __forceinline__ uint8_t* get_smem_ptr(int y, int x) { + return &smem[(y * GlobalToShareMemStreamConfig_::SMEM_STRIDE + x) / 2]; + } + + __device__ __forceinline__ void inc_stage() { + k_base += GlobalToShareMemStreamConfig_::SMEM_COL; + } +}; + +template +__device__ inline void load_share_mem( + wmma::fragment + a_frag[BlockConfig_::ROW_PER_WARP], + wmma::fragment + b_frag[BlockConfig_::COL_PER_WARP], + GlobalToShareMemStream< + BlockConfig_, + GlobalToShareMemStreamConfig>& + gbl2smem_a, + GlobalToShareMemStream< + BlockConfig_, + GlobalToShareMemStreamConfig>& + gbl2smem_b, + int warp_k) { + typedef GlobalToShareMemStreamConfig + Config_A; + typedef GlobalToShareMemStreamConfig + Config_B; + const int warp_x = threadIdx.x / WARP_SIZE; + const int warp_y = threadIdx.y; + uint8_t* __restrict__ s_ptr_a = + gbl2smem_a.get_smem_ptr(warp_y * WMMA_M, warp_k * WMMA_K); + uint8_t* __restrict__ s_ptr_b = + gbl2smem_b.get_smem_ptr(warp_x * WMMA_N, warp_k * WMMA_K); + + const int stride_a = BlockConfig_::WARP_Y * WMMA_M; + const int stride_b = BlockConfig_::WARP_X * WMMA_N; +#pragma unroll + for (int i = 0; i < BlockConfig_::ROW_PER_WARP; ++i) { + wmma::load_matrix_sync( + a_frag[i], s_ptr_a + i * stride_a * Config_A::SMEM_STRIDE / 2, + Config_A::SMEM_STRIDE); + } +#pragma unroll + for (int j = 0; j < BlockConfig_::COL_PER_WARP; ++j) { + wmma::load_matrix_sync( + b_frag[j], s_ptr_b + j * stride_b * Config_B::SMEM_STRIDE / 2, + Config_B::SMEM_STRIDE); + } +} + +template +__device__ inline void +calc(wmma::fragment + a_frag[ROW_PER_WARP], + wmma::fragment + b_frag[COL_PER_WARP], + wmma::fragment + acc_frag[ROW_PER_WARP][COL_PER_WARP]) { +#pragma unroll + for (int i = 0; i < ROW_PER_WARP; ++i) { +#pragma unroll + for (int j = 0; j < COL_PER_WARP; ++j) { + wmma::mma_sync(acc_frag[i][j], a_frag[i], b_frag[j], + acc_frag[i][j]); + } + } +} + +template +__device__ void inline consume_tile( + GlobalToShareMemStream< + BlockConfig_, + GlobalToShareMemStreamConfig>& + gbl2smem_a, + GlobalToShareMemStream< + BlockConfig_, + GlobalToShareMemStreamConfig>& + gbl2smem_b, + wmma::fragment + a_frag[2][BlockConfig_::ROW_PER_WARP], + wmma::fragment + b_frag[2][BlockConfig_::COL_PER_WARP], + wmma::fragment + acc_frag[BlockConfig_::ROW_PER_WARP] + [BlockConfig_::COL_PER_WARP]) { + if (!last_block) { + gbl2smem_a.inc_stage(); + gbl2smem_b.inc_stage(); + gbl2smem_a.copy(); + gbl2smem_b.copy(); + } + int warp_k = 0; +#pragma unroll + for (warp_k = 0; warp_k < BlockConfig_::BK / WMMA_K - 1; ++warp_k) { + load_share_mem(a_frag[(warp_k + 1) % 2], + b_frag[(warp_k + 1) % 2], gbl2smem_a, + gbl2smem_b, warp_k + 1); + calc( + a_frag[warp_k % 2], b_frag[warp_k % 2], acc_frag); + } + calc( + a_frag[warp_k % 2], b_frag[warp_k % 2], acc_frag); + if (!last_block) { + __syncthreads(); + gbl2smem_a.commit(); + gbl2smem_b.commit(); + __syncthreads(); + load_share_mem(a_frag[0], b_frag[0], gbl2smem_a, + gbl2smem_b, 0); + } +} + +template +__global__ void u4_gemm_template_device_nt(const uint8_t* A, const 
uint8_t* B, + int32_t* C, int M, int N, int K, + int lda, int ldb, int ldc) { + typedef GlobalToShareMemStreamConfig + Config_A; + typedef GlobalToShareMemStreamConfig + Config_B; + __shared__ uint8_t smem_a[BlockConfig_::BM][Config_A::SMEM_STRIDE / 2]; + __shared__ uint8_t smem_b[BlockConfig_::BN][Config_B::SMEM_STRIDE / 2]; + + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const uint8_t* g_ptr_a = A + bidy * BlockConfig_::BM * lda / 2; + const uint8_t* g_ptr_b = B + bidx * BlockConfig_::BN * ldb / 2; + const int warp_x = threadIdx.x / WARP_SIZE; + const int warp_y = threadIdx.y; + + const int warp_row_start = bidy * BlockConfig_::BM + warp_y * WMMA_M; + const int warp_col_start = bidx * BlockConfig_::BN + warp_x * WMMA_N; + int32_t* g_ptr_c = C + warp_row_start * ldc + warp_col_start; + + GlobalToShareMemStream gbl2smem_a( + &smem_a[0][0], g_ptr_a, lda, M - bidy, K); + GlobalToShareMemStream gbl2smem_b( + &smem_b[0][0], g_ptr_b, ldb, N - bidx, K); + + wmma::fragment + acc_frag[BlockConfig_::ROW_PER_WARP][BlockConfig_::COL_PER_WARP]; + wmma::fragment + a_frag[2][BlockConfig_::ROW_PER_WARP]; + wmma::fragment + b_frag[2][BlockConfig_::COL_PER_WARP]; + +#pragma unroll + for (int i = 0; i < BlockConfig_::ROW_PER_WARP; ++i) { +#pragma unroll + for (int j = 0; j < BlockConfig_::COL_PER_WARP; ++j) { + wmma::fill_fragment(acc_frag[i][j], 0); + } + } + + gbl2smem_a.copy(); + gbl2smem_b.copy(); + gbl2smem_a.commit(); + gbl2smem_b.commit(); + + __syncthreads(); + + load_share_mem(a_frag[0], b_frag[0], gbl2smem_a, gbl2smem_b, 0); + + const int BLK_K = (K + BlockConfig_::BK - 1) / BlockConfig_::BK; +#pragma unroll 1 + for (int blk_k = 0; blk_k < BLK_K - 1; ++blk_k) { + consume_tile(gbl2smem_a, gbl2smem_b, a_frag, + b_frag, acc_frag); + } + consume_tile(gbl2smem_a, gbl2smem_b, a_frag, b_frag, + acc_frag); + +#pragma unroll + for (int i = 0; i < BlockConfig_::ROW_PER_WARP; ++i) { +#pragma unroll + for (int j = 0; j < BlockConfig_::COL_PER_WARP; ++j) { + if (warp_row_start + i * BlockConfig_::WARP_Y * WMMA_M <= + M - WMMA_M && + warp_col_start + j * BlockConfig_::WARP_X * WMMA_N <= + N - WMMA_N) { + wmma::store_matrix_sync( + &g_ptr_c[(i * BlockConfig_::WARP_Y * WMMA_M) * ldc + + (j * BlockConfig_::WARP_X * WMMA_N)], + acc_frag[i][j], ldc, wmma::mem_row_major); + } + } + } +} +#else +template +__global__ void u4_gemm_template_device_nt(const uint8_t* /*A*/, + const uint8_t* /*B*/, int32_t* /*C*/, + int /*M*/, int /*N*/, int /*K*/, + int /*lda*/, int /*ldb*/, + int /*ldc*/) {} +#endif + +void _do_dispatch_wmma_matrix_mul_u4(const uint8_t* A, const uint8_t* B, + int32_t* C, int M, int N, int K, int lda, + int ldb, int ldc, cudaStream_t stream) { + constexpr size_t warp_x = 4; + constexpr size_t warp_y = 4; + constexpr size_t row_per_warp = 4; + constexpr size_t col_per_warp = 4; + typedef BlockConfig + BlockConfig_; + dim3 block{warp_x * WARP_SIZE, warp_y}; + dim3 grid{static_cast(DIVUP(N, BlockConfig_::BN)), + static_cast(DIVUP(M, BlockConfig_::BM))}; + u4_gemm_template_device_nt + <<>>(A, B, C, M, N, K, lda, ldb, ldc); + after_kernel_launch(); +} +} // namespace wmma_matrix_mul_u4 + +namespace megdnn { +namespace cuda { +void exec_wmma_gemm_u4(const uint8_t* A, const uint8_t* B, int32_t* C, int M, + int N, int K, int lda, int ldb, int ldc, + cudaStream_t stream) { + wmma_matrix_mul_u4::_do_dispatch_wmma_matrix_mul_u4(A, B, C, M, N, K, lda, + ldb, ldc, stream); +} +} // namespace cuda +} // namespace megdnn + +#endif + +// vim: syntax=cpp.doxygen diff --git 
a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cuh b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cuh new file mode 100644 index 00000000..14328838 --- /dev/null +++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cuh @@ -0,0 +1,46 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** + * \file dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +void exec_wmma_gemm_u4(const uint8_t* A, const uint8_t* B, int32_t* C, int M, + int N, int K, int ldA, int ldB, int ldC, cudaStream_t stream); +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/max_tensor_diff/opr_impl.cpp b/dnn/src/cuda/max_tensor_diff/opr_impl.cpp new file mode 100644 index 00000000..ccf7e94a --- /dev/null +++ b/dnn/src/cuda/max_tensor_diff/opr_impl.cpp @@ -0,0 +1,23 @@ +/** + * \file dnn/src/cuda/max_tensor_diff/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
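/*
 * Back-of-the-envelope check (an assumption-laden sketch, not library code)
 * of the tile sizes implied by the BlockConfig chosen in
 * _do_dispatch_wmma_matrix_mul_u4 earlier: with WARP_X = WARP_Y =
 * ROW_PER_WARP = COL_PER_WARP = 4 and 8x8x32 WMMA tiles, each thread block
 * covers a 128x128 output tile with 512 threads and walks K in steps of 256.
 * M and N below are hypothetical problem sizes.
 */
#include <cstdio>

int main() {
    const int WMMA_M = 8, WMMA_N = 8, WARP_SIZE = 32;
    const int WARP_X = 4, WARP_Y = 4, ROW_PER_WARP = 4, COL_PER_WARP = 4;
    const int BM = WARP_Y * WMMA_M * ROW_PER_WARP;    // 128 rows of C per block
    const int BN = WARP_X * WMMA_N * COL_PER_WARP;    // 128 cols of C per block
    const int threads = WARP_X * WARP_SIZE * WARP_Y;  // 512 threads per block
    const int M = 1024, N = 512;
    std::printf("BM=%d BN=%d threads=%d grid=(%d,%d)\n", BM, BN, threads,
                (N + BN - 1) / BN, (M + BM - 1) / BM);
}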
+ */ + +#include "src/cuda/max_tensor_diff/opr_impl.h" +#include "src/common/utils.h" + +using namespace megdnn; +using namespace cuda; + +float MaxTensorDiffImpl::exec(_megdnn_tensor_in, _megdnn_tensor_in, + _megdnn_workspace) { + megdnn_throw("MaxTensorDiff not support in cuda"); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/max_tensor_diff/opr_impl.h b/dnn/src/cuda/max_tensor_diff/opr_impl.h new file mode 100644 index 00000000..e0e915cc --- /dev/null +++ b/dnn/src/cuda/max_tensor_diff/opr_impl.h @@ -0,0 +1,35 @@ +/** + * \file dnn/src/cuda/max_tensor_diff/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class MaxTensorDiffImpl final : public MaxTensorDiff { +public: + using MaxTensorDiff::MaxTensorDiff; + + bool is_thread_safe() const override { return true; } + + size_t get_workspace_in_bytes(const TensorLayout&, + const TensorLayout&) override { + return 0; + }; + + float exec(_megdnn_tensor_in src1, _megdnn_tensor_in src2, + _megdnn_workspace workspace) override; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/megcore/cuda_computing_context.cpp b/dnn/src/cuda/megcore/cuda_computing_context.cpp new file mode 100644 index 00000000..d12976bf --- /dev/null +++ b/dnn/src/cuda/megcore/cuda_computing_context.cpp @@ -0,0 +1,76 @@ +/** + * \file dnn/src/cuda/megcore/cuda_computing_context.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megcore.h" + +#include "src/common/utils.h" +#include "src/cuda/utils.h" + + +#include "./cuda_computing_context.hpp" + +using namespace megcore; +using namespace megcore::cuda; + +CUDAComputingContext::CUDAComputingContext(megcoreDeviceHandle_t dev_handle, + unsigned int flags, const CudaContext& ctx): + ComputingContext(dev_handle, flags), + own_stream_{ctx.stream == nullptr}, + context_{ctx} +{ + megcorePlatform_t platform; + megcoreGetPlatform(dev_handle, &platform); + megdnn_assert(platform == megcorePlatformCUDA); + if (own_stream_) { + cuda_check(cudaStreamCreateWithFlags(&context_.stream, + cudaStreamNonBlocking)); + } +} + +CUDAComputingContext::~CUDAComputingContext() +{ + if (own_stream_) { + cuda_check(cudaStreamDestroy(context_.stream)); + } +} + +void CUDAComputingContext::memcpy(void *dst, const void *src, + size_t size_in_bytes, megcoreMemcpyKind_t kind) +{ + cudaMemcpyKind cuda_kind; + switch (kind) { + case megcoreMemcpyDeviceToHost: + cuda_kind = cudaMemcpyDeviceToHost; + break; + case megcoreMemcpyHostToDevice: + cuda_kind = cudaMemcpyHostToDevice; + break; + case megcoreMemcpyDeviceToDevice: + cuda_kind = cudaMemcpyDeviceToDevice; + break; + default: + megdnn_throw("bad cuda memcpy kind"); + } + cuda_check(cudaMemcpyAsync(dst, src, size_in_bytes, cuda_kind, + context_.stream)); +} + +void CUDAComputingContext::memset(void *dst, int value, size_t size_in_bytes) +{ + cuda_check(cudaMemsetAsync(dst, value, size_in_bytes, context_.stream)); +} + +void CUDAComputingContext::synchronize() +{ + cuda_check(cudaStreamSynchronize(context_.stream)); +} + + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/megcore/cuda_computing_context.hpp b/dnn/src/cuda/megcore/cuda_computing_context.hpp new file mode 100644 index 00000000..b821612e --- /dev/null +++ b/dnn/src/cuda/megcore/cuda_computing_context.hpp @@ -0,0 +1,47 @@ +/** + * \file dnn/src/cuda/megcore/cuda_computing_context.hpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "src/common/megcore/common/computing_context.hpp" +#include "megcore_cuda.h" +#include + +namespace megcore { +namespace cuda { + +class CUDAComputingContext final: public ComputingContext { + public: + CUDAComputingContext(megcoreDeviceHandle_t dev_handle, + unsigned int flags, const CudaContext &ctx = {}); + ~CUDAComputingContext(); + + void memcpy(void *dst, const void *src, size_t size_in_bytes, + megcoreMemcpyKind_t kind) override; + void memset(void *dst, int value, size_t size_in_bytes) override; + void synchronize() override; + + const CudaContext& context() const { + return context_; + } + + cudaStream_t stream() const { + return context().stream; + } + + private: + bool own_stream_; + CudaContext context_; +}; + +} // namespace cuda +} // namespace megcore + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/megcore/cuda_device_context.cpp b/dnn/src/cuda/megcore/cuda_device_context.cpp new file mode 100644 index 00000000..c82b3282 --- /dev/null +++ b/dnn/src/cuda/megcore/cuda_device_context.cpp @@ -0,0 +1,67 @@ +/** + * \file dnn/src/cuda/megcore/cuda_device_context.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megcore.h" +#include "src/common/utils.h" +#include "src/cuda/utils.h" + +#include "./cuda_device_context.hpp" + +#define STR_HELPER(x) #x +#define STR(x) STR_HELPER(x) + +#pragma message "compile with cuda " STR(CUDART_VERSION) " " + +using namespace megcore; +using namespace cuda; + +CUDADeviceContext::CUDADeviceContext(int device_id, unsigned int flags): + DeviceContext(megcorePlatformCUDA, device_id, flags) +{ + int version; + cuda_check(cudaRuntimeGetVersion(&version)); + megdnn_assert(version == CUDART_VERSION, + "megcore compiled with cuda %d, get %d at runtime", + CUDART_VERSION, version); + int id = device_id; + if (id < 0) { + cuda_check(cudaGetDevice(&id)); + } + cuda_check(cudaGetDeviceProperties(&prop_, id)); +} + +CUDADeviceContext::~CUDADeviceContext() noexcept = default; + +size_t CUDADeviceContext::mem_alignment_in_bytes() const noexcept { + return std::max(prop_.textureAlignment, prop_.texturePitchAlignment); +} + +void CUDADeviceContext::activate() +{ + int id = device_id(); + if (id >= 0) { + cuda_check(cudaSetDevice(id)); + } +} + +void *CUDADeviceContext::malloc(size_t size_in_bytes) +{ + void *ptr; + cuda_check(cudaMalloc(&ptr, size_in_bytes)); + return ptr; +} + +void CUDADeviceContext::free(void *ptr) +{ + cuda_check(cudaFree(ptr)); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/megcore/cuda_device_context.hpp b/dnn/src/cuda/megcore/cuda_device_context.hpp new file mode 100644 index 00000000..6fe9e2d7 --- /dev/null +++ b/dnn/src/cuda/megcore/cuda_device_context.hpp @@ -0,0 +1,36 @@ +/** + * \file dnn/src/cuda/megcore/cuda_device_context.hpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include "src/common/megcore/common/device_context.hpp" +#include + +namespace megcore { +namespace cuda { + +class CUDADeviceContext: public DeviceContext { + public: + CUDADeviceContext(int device_id, unsigned int flags); + ~CUDADeviceContext() noexcept; + + size_t mem_alignment_in_bytes() const noexcept override; + + void activate() override; + void *malloc(size_t size_in_bytes) override; + void free(void *ptr) override; + private: + cudaDeviceProp prop_; +}; + +} // namespace cuda +} // namespace megcore + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/megcore/public_api/computing.cpp b/dnn/src/cuda/megcore/public_api/computing.cpp new file mode 100644 index 00000000..e3f90227 --- /dev/null +++ b/dnn/src/cuda/megcore/public_api/computing.cpp @@ -0,0 +1,49 @@ +/** + * \file dnn/src/cuda/megcore/public_api/computing.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megcore_cuda.h" + +#include "src/common/utils.h" +#include "src/common/megcore/public_api/computing.hpp" +#include "../cuda_computing_context.hpp" + +using namespace megcore; + +megcoreStatus_t megcore::createComputingHandleWithCUDAContext( + megcoreComputingHandle_t *compHandle, + megcoreDeviceHandle_t devHandle, + unsigned int flags, + const CudaContext& ctx) +{ + auto content = megdnn::make_unique( + devHandle, flags, ctx); + auto &H = *compHandle; + H = new megcoreComputingContext; + H->content = std::move(content); + return megcoreSuccess; +} + +megcoreStatus_t megcore::getCUDAContext(megcoreComputingHandle_t handle, + CudaContext* ctx) +{ + auto &&H = handle; + megdnn_assert(H); + megcoreDeviceHandle_t dev_handle = H->content->dev_handle(); + megcorePlatform_t platform; + megcoreGetPlatform(dev_handle, &platform); + megdnn_assert(platform == megcorePlatformCUDA); + auto context = static_cast( + H->content.get()); + *ctx = context->context(); + return megcoreSuccess; +} + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/mesh_indexing/mesh_indexing.cu b/dnn/src/cuda/mesh_indexing/mesh_indexing.cu new file mode 100644 index 00000000..16caa079 --- /dev/null +++ b/dnn/src/cuda/mesh_indexing/mesh_indexing.cu @@ -0,0 +1,83 @@ +/** + * \file dnn/src/cuda/mesh_indexing/mesh_indexing.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
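/*
 * Hypothetical usage sketch of the megcore CUDA public API declared above:
 * wrapping a caller-owned cudaStream_t in a computing handle so work runs on
 * that stream (when ctx.stream is left null, CUDAComputingContext creates and
 * owns a non-blocking stream of its own).  Error handling is omitted, and the
 * exact namespace/field set of CudaContext beyond the stream member used here
 * is an assumption, not confirmed by this diff.
 */
#include "megcore_cuda.h"

void make_handle_on_stream(megcoreDeviceHandle_t dev, cudaStream_t stream,
                           megcoreComputingHandle_t* out) {
    megcore::CudaContext ctx;
    ctx.stream = stream;  // caller-owned; the context will not destroy it
    megcore::createComputingHandleWithCUDAContext(out, dev, /*flags=*/0, ctx);
}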
+ */ + +#include "megdnn/basic_types.h" +#include "megdnn/dtype.h" +#include "src/common/indexing_multi_axis_vec_kdef.h" +#include "src/cuda/indexing_multi_axis_vec/kern.cuh" +#include "src/cuda/mesh_indexing/mesh_indexing.cuh" +#include "src/cuda/utils.cuh" + +#define KERN_APPLY_OPR_INDEXING ::megdnn::indexing_multi_axis_vec_kdef::OprFwd + +#define KERN_APPLY_OPR_INCR \ + ::megdnn::cuda::indexing_multi_axis_vec::OprAtomicIncr + +#define KERN_APPLY_OPR_SET ::megdnn::indexing_multi_axis_vec_kdef::OprSet + +namespace { + +using namespace megdnn; +using namespace cuda; +using namespace mesh_indexing; + +template +__global__ void mesh_indexing_general_kernel(T* src, T* dst, + const KernIndexer indexer) { + uint32_t dst_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (dst_idx < indexer.size) { + int src_idx = indexer.convert_indxer(dst_idx); + Opr::apply(src[src_idx], dst[dst_idx]); + } +} +} // namespace + +namespace megdnn { +namespace cuda { +namespace mesh_indexing { + +template +void mesh_indexing_proxy(T* src, T* dst, KernIndexer* indexer, + cudaStream_t stream) { + mesh_indexing_general_kernel + <<size, NR_THREADS), NR_THREADS, 0, stream>>>( + src, dst, *indexer); +} + +#define INST(_ctype) \ + template void mesh_indexing_proxy<_ctype, KERN_APPLY_OPR_INDEXING>( \ + _ctype * src, _ctype * dst, KernIndexer * indexer, \ + cudaStream_t stream); \ + \ + template void mesh_indexing_proxy<_ctype, KERN_APPLY_OPR_SET>( \ + _ctype * src, _ctype * dst, KernIndexer * indexer, \ + cudaStream_t stream); + +#define INST_ATOMIC_ADD(_ctype) \ + template void mesh_indexing_proxy<_ctype, KERN_APPLY_OPR_INCR>( \ + _ctype * src, _ctype * dst, KernIndexer * indexer, \ + cudaStream_t stream); + +#define cb(_dtype) INST(DTypeTrait<_dtype>::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + +#define cb(_dtype) INST_ATOMIC_ADD(DTypeTrait<_dtype>::ctype) + +cb(dtype::Float32); +cb(dtype::Int32) +#undef cb + +#undef INST +} // namespace mesh_indexing +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/mesh_indexing/mesh_indexing.cuh b/dnn/src/cuda/mesh_indexing/mesh_indexing.cuh new file mode 100644 index 00000000..24610045 --- /dev/null +++ b/dnn/src/cuda/mesh_indexing/mesh_indexing.cuh @@ -0,0 +1,98 @@ +/** + * \file dnn/src/cuda/mesh_indexing/mesh_indexing.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once + +#include +#include "megdnn/basic_types.h" +#include "src/cuda/error_info.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace mesh_indexing { + +// template +struct KernIndexer { + int ndim; + int* ptrs[TensorShape::MAX_NDIM]; + int origin_stride[TensorShape::MAX_NDIM]; + int indexed_strde[TensorShape::MAX_NDIM]; + int desc_stride[TensorShape::MAX_NDIM]; + uint32_t indexed_shape[TensorShape::MAX_NDIM]; + uint32_t origin_shape[TensorShape::MAX_NDIM]; + + void* error_tracker; + megcore::AsyncErrorInfo* error_info; + bool batch_mode; + uint32_t batch_stride; + uint32_t size; + + KernIndexer(const TensorLayout& origin_layout, + const TensorLayout& indexed_layout, int** _ptrs, + const TensorLayout* desc_layouts, + void* _err_tracker = nullptr, + megcore::AsyncErrorInfo* _err_info = nullptr, + bool _batch_mode = false) + : error_tracker(_err_tracker), + error_info(_err_info), + batch_mode(_batch_mode), + size(indexed_layout.total_nr_elems()) { + ndim = origin_layout.ndim; + for (int i = 0; i < ndim; ++i) { + origin_stride[i] = origin_layout.stride[i]; + indexed_strde[i] = indexed_layout.stride[i]; + origin_shape[i] = origin_layout[i]; + indexed_shape[i] = indexed_layout[i]; + ptrs[i] = _ptrs[i]; + desc_stride[i] = desc_layouts[i].stride[0]; + } + } + + int __device__ __forceinline__ convert_indxer(uint32_t& index) const { + int data_offset = 0; + int value_offset = 0; + uint32_t n = 0; + if (batch_mode) { + n = index; + for (int i = ndim - 1; i >= 1; --i) { + n /= indexed_shape[i]; + } + n %= indexed_shape[0]; + } + for (int i = ndim - 1; i >= 0; --i) { + int pos = index % indexed_shape[i]; + value_offset += pos * indexed_strde[i]; + if (ptrs[i]) { + pos += n * desc_stride[i]; + pos = ptrs[i][pos]; + pos += (pos < 0 ? origin_shape[i] : 0); + } + if (static_cast(pos) >= origin_shape[i]) { + set_async_error_info(error_info, error_tracker, + "invalid mesh indexing: " + "indexer=%d idx=%d shape=%d", + i, pos, origin_shape[i]); + } + data_offset += pos * origin_stride[i]; + index /= indexed_shape[i]; + } + + index = value_offset; + return data_offset; + } +}; + +template +void mesh_indexing_proxy(T* origin, T* indexed, KernIndexer* indexer, + cudaStream_t stream); +} // namespace mesh_indexing +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/mesh_indexing/opr_impl.cpp b/dnn/src/cuda/mesh_indexing/opr_impl.cpp new file mode 100644 index 00000000..4aefe2db --- /dev/null +++ b/dnn/src/cuda/mesh_indexing/opr_impl.cpp @@ -0,0 +1,168 @@ +/** + * \file dnn/src/cuda/mesh_indexing/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
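/*
 * Simplified host-side analogue (illustration only) of
 * KernIndexer::convert_indxer above, for the non-batched case: a flat index
 * into the value tensor is decomposed axis by axis; axes that carry an index
 * vector remap their coordinate through that vector (with Python-style
 * negative wrapping) before the source offset is accumulated.  Error
 * reporting and the batch stride are left out.
 */
#include <cstdint>
#include <vector>

struct Axis {
    int origin_stride, indexed_stride, origin_shape, indexed_shape;
    const int* idx;  // nullptr if this axis is not mesh-indexed
};

// returns the offset into the source tensor; *value_offset receives the
// offset into the value (indexed) tensor
int convert_index(uint32_t index, const std::vector<Axis>& axes,
                  int* value_offset) {
    int data_offset = 0, val_offset = 0;
    for (int i = int(axes.size()) - 1; i >= 0; --i) {
        int pos = int(index % axes[i].indexed_shape);
        val_offset += pos * axes[i].indexed_stride;
        if (axes[i].idx) {
            pos = axes[i].idx[pos];
            if (pos < 0) pos += axes[i].origin_shape;  // wrap negative indices
        }
        data_offset += pos * axes[i].origin_stride;
        index /= axes[i].indexed_shape;
    }
    *value_offset = val_offset;
    return data_offset;
}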
+ */ +#include "opr_impl.h" +#include "mesh_indexing.cuh" +#include "src/common/indexing_multi_axis_vec_kdef.h" +#include "src/cuda/indexing_multi_axis_vec/kern.cuh" +#include "src/cuda/utils.h" + +namespace { +using namespace megdnn; +using namespace cuda; +using namespace mesh_indexing; +KernIndexer get_indexer(const TensorND& origin, const TensorND& indexed, + const MeshBase::IndexDesc& desc, void* error_tracker, + megcore::AsyncErrorInfo* error_info, bool batched) { + int* tmp_ptrs[TensorShape::MAX_NDIM] = {nullptr}; + TensorLayout desc_layouts[TensorShape::MAX_NDIM]; + for (size_t i = 0; i < desc.size(); ++i) { + auto axis = desc[i].axis; + megdnn_assert(axis < TensorShape::MAX_NDIM); + tmp_ptrs[axis] = desc[i].vec.ptr(); + desc_layouts[axis] = desc[i].vec.layout; + } + return {origin.layout, indexed.layout, tmp_ptrs, desc_layouts, + error_tracker, error_info, batched}; +} + +template +void do_exec(const TensorND& data, const TensorND& value, + const MeshBase::IndexDesc& desc, Handle* handle, + void* error_tracker) { + auto error_info = async_error_info(handle); + auto indexer = + get_indexer(data, value, desc, error_tracker, error_info, batched); + + auto stream = cuda_stream(handle); + mesh_indexing::mesh_indexing_proxy( + data.ptr(), value.ptr(), &indexer, stream); +} + +} // namespace + +namespace megdnn { +namespace cuda { + +/* =========================== MeshIndexing ============================ */ + +void MeshIndexingImpl::exec(_megdnn_tensor_in src, const IndexDesc& desc, + _megdnn_tensor_out dst, _megdnn_workspace) { + check_exec(src.layout, dst.layout, desc); +#define cb(DType) \ + if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + do_exec( \ + src, dst, desc, handle(), m_error_tracker); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + megdnn_assert_internal(0); +} + +/* ========================= BatchedMeshIndexing ========================== */ + +void BatchedMeshIndexingImpl::exec(_megdnn_tensor_in src, const IndexDesc& desc, + _megdnn_tensor_out dst, _megdnn_workspace) { + check_exec(src.layout, dst.layout, desc); + +#define cb(DType) \ + if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + do_exec( \ + src, dst, desc, handle(), m_error_tracker); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + megdnn_assert_internal(0); +} + +/* ============================ Mesh ============================= */ + +void IncrMeshIndexingImpl::exec(_megdnn_tensor_inout data, + _megdnn_tensor_in value, const IndexDesc& desc, + _megdnn_workspace) { + check_exec(data.layout, value.layout, desc); + +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + do_exec( \ + data, value, desc, handle(), m_error_tracker); \ + return; \ + } + + cb(dtype::Float32); + cb(dtype::Int32); +#undef cb + megdnn_assert_internal(0); +} + +void SetMeshIndexingImpl::exec(_megdnn_tensor_inout data, + _megdnn_tensor_in value, const IndexDesc& desc, + _megdnn_workspace) { + check_exec(data.layout, value.layout, desc); + +#define cb(DType) \ + if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + do_exec( \ + data, value, desc, handle(), m_error_tracker); \ + return; \ + } + + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + megdnn_assert_internal(0); +} + +/* ========================== BatchedMesh ============================= */ +void 
BatchedIncrMeshIndexingImpl::exec(_megdnn_tensor_inout data, + _megdnn_tensor_in value, + const IndexDesc& desc, + _megdnn_workspace) { + check_exec(data.layout, value.layout, desc); + +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + do_exec( \ + data, value, desc, handle(), m_error_tracker); \ + return; \ + } + cb(dtype::Float32); + cb(dtype::Int32); +#undef cb + megdnn_assert_internal(0); +} + +void BatchedSetMeshIndexingImpl::exec(_megdnn_tensor_inout data, + _megdnn_tensor_in value, + const IndexDesc& desc, + _megdnn_workspace) { + check_exec(data.layout, value.layout, desc); + +#define cb(DType) \ + if (data.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + do_exec( \ + data, value, desc, handle(), m_error_tracker); \ + return; \ + } + + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb + megdnn_assert_internal(0); +} + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/mesh_indexing/opr_impl.h b/dnn/src/cuda/mesh_indexing/opr_impl.h new file mode 100644 index 00000000..8030429b --- /dev/null +++ b/dnn/src/cuda/mesh_indexing/opr_impl.h @@ -0,0 +1,104 @@ +/** + * \file dnn/src/cuda/mesh_indexing/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +namespace megdnn { +namespace cuda { + +class MeshIndexingImpl : public MeshIndexing { + void* m_error_tracker = nullptr; + +public: + using MeshIndexing::MeshIndexing; + + void exec(_megdnn_tensor_in src, const IndexDesc& desc, + _megdnn_tensor_out dst, _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } +}; + +class IncrMeshIndexingImpl : public IncrMeshIndexing { + void* m_error_tracker = nullptr; + +public: + using IncrMeshIndexing::IncrMeshIndexing; + + void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc& desc, _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } +}; + +class SetMeshIndexingImpl : public SetMeshIndexing { + void* m_error_tracker = nullptr; + +public: + using SetMeshIndexing::SetMeshIndexing; + + void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc& desc, _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } +}; + +class BatchedMeshIndexingImpl : public BatchedMeshIndexing { + void* m_error_tracker = nullptr; + +public: + using BatchedMeshIndexing::BatchedMeshIndexing; + + void exec(_megdnn_tensor_in src, const IndexDesc& desc, + _megdnn_tensor_out dst, _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } +}; + +class BatchedIncrMeshIndexingImpl : public BatchedIncrMeshIndexing { + void* m_error_tracker = nullptr; + +public: + using BatchedIncrMeshIndexing::BatchedIncrMeshIndexing; + + void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc& desc, _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + 
m_error_tracker = tracker; + } +}; + +class BatchedSetMeshIndexingImpl : public BatchedSetMeshIndexing { + void* m_error_tracker = nullptr; + +public: + using BatchedSetMeshIndexing::BatchedSetMeshIndexing; + + void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value, + const IndexDesc& desc, _megdnn_workspace workspace) override; + + void set_error_tracker(void* tracker) override { + m_error_tracker = tracker; + } +}; + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/param_pack/opr_impl.cpp b/dnn/src/cuda/param_pack/opr_impl.cpp new file mode 100644 index 00000000..ab167735 --- /dev/null +++ b/dnn/src/cuda/param_pack/opr_impl.cpp @@ -0,0 +1,115 @@ +/** + * \file dnn/src/cuda/param_pack/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/param_pack/opr_impl.h" +#include "src/cuda/param_pack/param_pack.cuh" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +size_t ParamPackConcatImpl::get_workspace_in_bytes(const TensorShapeArray& srcs, + const TensorShape&, + const TensorShape&) { + return sizeof(size_t) * srcs.size(); +} + +template +void ParamPackConcatImpl::exec_internal(_megdnn_tensor_in srcs, + _megdnn_tensor_in table, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + size_t inp_size = srcs.layout.shape[0], + out_size = dst.layout.total_nr_elems(); + auto stream = cuda_stream(this->handle()); + + auto src_cpu = static_cast(srcs.raw_ptr); + megdnn_assert_internal(src_cpu); + auto src_gpu = reinterpret_cast(workspace.raw_ptr); + + auto table_outer_gpu = table.ptr(), + table_inner_gpu = table_outer_gpu + out_size; + + cuda_check(cudaMemcpyAsync(src_gpu, src_cpu, sizeof(const T*) * inp_size, + cudaMemcpyHostToDevice, stream)); + + param_pack::concat_proxy(src_gpu, dst.ptr(), out_size, + table_outer_gpu, table_inner_gpu, stream); +} + +void ParamPackConcatImpl::exec(_megdnn_tensor_in srcs, _megdnn_tensor_in table, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + check_exec(dst.layout, table.layout, srcs.layout); +#define cb(DType) \ + if (dst.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + exec_internal(srcs, table, dst, workspace); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + megdnn_throw("bad type"); +#undef cb +} + +size_t ParamPackSplitImpl::get_workspace_in_bytes( + const TensorShape&, const TensorShape&, const TensorShapeArray& dsts) { + return sizeof(size_t) * dsts.size(); +} + +template +void ParamPackSplitImpl::exec_internal(_megdnn_tensor_in src, + _megdnn_tensor_in table, + _megdnn_tensor_out dsts, + _megdnn_workspace workspace) { + // inner and outer table must be int32 + megdnn_assert(table.layout.dtype == dtype::Int32()); + // dsts is src pointer, ndim must be 1 + megdnn_assert(dsts.layout.ndim == 1); + + auto out_size = dsts.layout.shape[0], + inp_size = src.layout.total_nr_elems(); + + auto stream = cuda_stream(this->handle()); + + auto total_workspace_size = sizeof(T*) * out_size; + auto dsts_cpu = static_cast(dsts.raw_ptr); + megdnn_assert_internal(dsts_cpu); + auto dsts_gpu = reinterpret_cast(workspace.raw_ptr); + + auto table_outer_gpu = table.ptr(); + auto table_inner_gpu = table_outer_gpu + inp_size; + + 
cuda_check(cudaMemcpyAsync(dsts_gpu, dsts_cpu, total_workspace_size, + cudaMemcpyHostToDevice, stream)); + + // param_pack_split_proxy() + param_pack::split_proxy(src.ptr(), dsts_gpu, inp_size, + table_outer_gpu, table_inner_gpu, stream); +} + +void ParamPackSplitImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in table, + _megdnn_tensor_out dsts, + _megdnn_workspace workspace) { + check_exec(src.layout, table.layout, dsts.layout); +#define cb(DType) \ + if (src.layout.dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + exec_internal(src, table, dsts, workspace); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + megdnn_throw("bad type"); +#undef cb +} + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/param_pack/opr_impl.h b/dnn/src/cuda/param_pack/opr_impl.h new file mode 100644 index 00000000..ab46434e --- /dev/null +++ b/dnn/src/cuda/param_pack/opr_impl.h @@ -0,0 +1,51 @@ +/** + * \file dnn/src/cuda/param_pack/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace cuda { + +class ParamPackConcatImpl final : public ParamPackConcat { +public: + using ParamPackConcat::ParamPackConcat; + void exec(_megdnn_tensor_in srcs, _megdnn_tensor_in table, + _megdnn_tensor_out dst, _megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes(const TensorShapeArray& srcs, + const TensorShape& table, + const TensorShape& dst) override; + +private: + template + void exec_internal(_megdnn_tensor_in srcs, _megdnn_tensor_in table, + _megdnn_tensor_out dst, _megdnn_workspace workspace); +}; + +class ParamPackSplitImpl final : public ParamPackSplit { +public: + using ParamPackSplit::ParamPackSplit; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in table, + _megdnn_tensor_out dsts, _megdnn_workspace workspace) override; + + size_t get_workspace_in_bytes(const TensorShape& src, + const TensorShape& table, + const TensorShapeArray& dsts) override; + +private: + template + void exec_internal(_megdnn_tensor_in src, _megdnn_tensor_in table, + _megdnn_tensor_out dsts, _megdnn_workspace workspace); +}; + +} // namespace cuda +} // namespace megdnn diff --git a/dnn/src/cuda/param_pack/param_pack.cu b/dnn/src/cuda/param_pack/param_pack.cu new file mode 100644 index 00000000..03e98509 --- /dev/null +++ b/dnn/src/cuda/param_pack/param_pack.cu @@ -0,0 +1,87 @@ +/** + * \file dnn/src/cuda/param_pack/param_pack.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megdnn/dtype.h" +#include "src/cuda/param_pack/param_pack.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace param_pack { + +template +__global__ void concat_kernel(const T** srcs, T* dst, + const int32_t* table_outer, + const int32_t* table_inner, + size_t total_size) { + size_t addr = threadIdx.x + blockIdx.x * blockDim.x; + if (addr < total_size) { + int32_t i = table_outer[addr]; + int32_t idx = table_inner[addr]; + if (idx != -1) + dst[addr] = srcs[i][idx]; + else + dst[addr] = 0; + } +} + +template +__global__ void split_kernel(const T* src, T** dsts, + const int32_t* table_outer, + const int32_t* table_inner, + size_t total_size) { + size_t addr = threadIdx.x + blockIdx.x * blockDim.x; + if (addr < total_size) { + int32_t i = table_outer[addr]; + int32_t idx = table_inner[addr]; + if (idx != -1) { + dsts[i][idx] = src[addr]; + } + } +} + +template +void split_proxy(const T* src, T** dsts, size_t total_size, + const int32_t* table_outer, const int32_t* table_inner, + cudaStream_t stream) { + size_t NR_BLOCKS = DIVUP(total_size, NR_THREADS); + split_kernel<<>>( + src, dsts, table_outer, table_inner, total_size); + after_kernel_launch(); +} + +template +void concat_proxy(const T** srcs, T* dst, size_t total_size, + const int32_t* table_outer, + const int32_t* table_inner, cudaStream_t stream) { + size_t NR_BLOCKS = DIVUP(total_size, NR_THREADS); + concat_kernel<<>>( + srcs, dst, table_outer, table_inner, total_size); + after_kernel_launch(); +} + +#define INST(T) \ + template void concat_proxy(const T**, T*, size_t, \ + const int32_t*, const int32_t*, \ + cudaStream_t); \ + template void split_proxy(const T*, T**, size_t, \ + const int32_t*, const int32_t*, \ + cudaStream_t); +#define cb(DType) INST(typename DTypeTrait::ctype) +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb +#undef INST + +} // namespace param_pack +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/param_pack/param_pack.cuh b/dnn/src/cuda/param_pack/param_pack.cuh new file mode 100644 index 00000000..4946f05b --- /dev/null +++ b/dnn/src/cuda/param_pack/param_pack.cuh @@ -0,0 +1,36 @@ +/** + * \file dnn/src/cuda/param_pack/param_pack.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once +#include + +#include +#include + +namespace megdnn { +namespace cuda { +namespace param_pack { + +template +void split_proxy(const T* src, T** dsts, size_t total_size, + const int32_t* table_outer, const int32_t* table_inner, + cudaStream_t stream); + +template +void concat_proxy(const T** srcs, T* dst, size_t total_size, + const int32_t* table_outer, + const int32_t* table_inner, cudaStream_t stream); + +} // namespace param_pack +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/pooling/opr_impl.cpp b/dnn/src/cuda/pooling/opr_impl.cpp new file mode 100644 index 00000000..3d9b351e --- /dev/null +++ b/dnn/src/cuda/pooling/opr_impl.cpp @@ -0,0 +1,96 @@ +/** + * \file dnn/src/cuda/pooling/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
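/*
 * CPU reference (a sketch, not the CUDA path) of the lookup-table contract
 * used by concat_kernel/split_kernel above: for every element of the packed
 * buffer, table_outer names the source tensor and table_inner the offset
 * inside it, with -1 in table_inner marking alignment padding that concat
 * zero-fills and split simply skips.
 */
#include <cstdint>
#include <vector>

template <typename T>
void concat_ref(const std::vector<const T*>& srcs, T* dst, size_t total_size,
                const int32_t* table_outer, const int32_t* table_inner) {
    for (size_t addr = 0; addr < total_size; ++addr) {
        int32_t which = table_outer[addr], idx = table_inner[addr];
        dst[addr] = (idx != -1) ? srcs[which][idx] : T(0);
    }
}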
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "src/cuda/pooling/opr_impl.h" + +#include "src/cuda/utils.h" +#include "./pooling2d_int8_cdiv4hwn4.cuh" + +namespace megdnn { +namespace cuda { + +void PoolingForwardImpl::setup_descs(const TensorLayout &src, + const TensorLayout &dst) +{ + src_desc.set(src, param().format); + dst_desc.set(dst, param().format); + pooling_desc.set(this->param()); +} + +void PoolingForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) +{ + check_exec(src.layout, dst.layout, workspace.size); + using Format = param::Pooling::Format; + if (param().format == Format::CHWN4) { + pooling2d::Param kern_param; + size_t c = src.layout[0], hi = src.layout[1], wi = src.layout[2], + n = src.layout[3], ho = dst.layout[1], wo = dst.layout[2]; + c = c * 4; + size_t ph = param().pad_h, pw = param().pad_w; + size_t window_h = param().window_h, window_w = param().window_w; + size_t sh = param().stride_h, sw = param().stride_w; + kern_param.n = n, kern_param.c = c, kern_param.hi = hi, + kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, + kern_param.ph = ph, kern_param.pw = pw, kern_param.window_h = window_h, + kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; + auto&& stream = cuda_stream(handle()); + return pooling2d::_do_pooling2d_int8_cdiv4hwn4( + src.compatible_ptr(), dst.compatible_ptr(), + kern_param, stream, static_cast(param().mode)); + } + auto handle = cudnn_handle(this->handle()); + setup_descs(src.layout, dst.layout); + dt_float32 alpha = 1.0f, beta = 0.0f; + cudnn_check(cudnnPoolingForward(handle, + pooling_desc.desc, + &alpha, + src_desc.desc, src.raw_ptr, + &beta, + dst_desc.desc, dst.raw_ptr)); +} + +void PoolingBackwardImpl::setup_descs(const TensorLayout &src, + const TensorLayout &dst, + const TensorLayout &diff, + const TensorLayout &grad) +{ + src_desc.set(src); + dst_desc.set(dst); + diff_desc.set(diff); + grad_desc.set(grad); + pooling_desc.set(this->param()); +} + +void PoolingBackwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in dst, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) +{ + check_exec(src.layout, dst.layout, diff.layout, grad.layout, workspace.size); + auto handle = cudnn_handle(this->handle()); + setup_descs(src.layout, dst.layout, diff.layout, grad.layout); + float alpha = 1.0f, beta = 0.0f; + cudnn_check(cudnnPoolingBackward(handle, + pooling_desc.desc, + &alpha, + dst_desc.desc, dst.raw_ptr, + diff_desc.desc, diff.raw_ptr, + src_desc.desc, src.raw_ptr, + &beta, + grad_desc.desc, grad.raw_ptr)); +} + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/pooling/opr_impl.h b/dnn/src/cuda/pooling/opr_impl.h new file mode 100644 index 00000000..86599fd7 --- /dev/null +++ b/dnn/src/cuda/pooling/opr_impl.h @@ -0,0 +1,61 @@ +/** + * \file dnn/src/cuda/pooling/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
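/*
 * Illustrative helper for the CHWN4 pooling path above (an assumption about
 * the layout implied by reading c from dim 0 and multiplying it by 4, not
 * MegDNN code): with tensors stored as (C/4, H, W, N, 4), the flat offset of
 * logical element (n, c, h, w) splits the channel into a group of four plus
 * a lane.
 */
#include <cstddef>

inline size_t chwn4_offset(size_t n, size_t c, size_t h, size_t w,
                           size_t H, size_t W, size_t N) {
    size_t c_group = c / 4, c_lane = c % 4;
    return (((c_group * H + h) * W + w) * N + n) * 4 + c_lane;
}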
+ */ +#pragma once +#include "megdnn/oprs.h" + +#include "src/cuda/cudnn_wrapper.h" + +namespace megdnn { +namespace cuda { + +class PoolingForwardImpl final: public PoolingForward { + public: + using PoolingForward::PoolingForward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &) override { + return 0; + } + private: + TensorDesc src_desc, dst_desc; + PoolingDesc pooling_desc; + void setup_descs(const TensorLayout &src, const TensorLayout &dst); +}; + +class PoolingBackwardImpl final: public PoolingBackward { + public: + using PoolingBackward::PoolingBackward; + void exec(_megdnn_tensor_in src, + _megdnn_tensor_in dst, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes(const TensorLayout &, + const TensorLayout &, + const TensorLayout &, + const TensorLayout &) override { + return 0; + } + private: + TensorDesc src_desc, dst_desc, diff_desc, grad_desc; + PoolingDesc pooling_desc; + void setup_descs(const TensorLayout &src, + const TensorLayout &dst, + const TensorLayout &diff, + const TensorLayout &grad); + +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cpp b/dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cpp new file mode 100644 index 00000000..46766cc9 --- /dev/null +++ b/dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cpp @@ -0,0 +1,27 @@ +/** + * \file dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./pooling2d_int8_cdiv4hwn4.cuh" +#include "src/cuda/query_blocksize.cuh" + +namespace megdnn { +namespace cuda { +namespace pooling2d { + +uint32_t _get_kern_block_size(const void* kern) { + uint32_t ret = query_blocksize_for_kernel(kern); + return ret; +} + +} // namespace pooling2d +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cu b/dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cu new file mode 100644 index 00000000..179a7884 --- /dev/null +++ b/dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cu @@ -0,0 +1,413 @@ +/** + * \file dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./pooling2d_int8_cdiv4hwn4.cuh" +#include "src/common/opr_param_defs_enumv.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace pooling2d; + +namespace { +// common macros +#define FEED1 Base::feed(x, 0); +#define FEED2 \ + Base::feed(x.x, 0); \ + Base::feed(x.y, 4); +#define FEED4 \ + FEED2; \ + Base::feed(x.z, 8); \ + Base::feed(x.w, 12); + +#define ANS1(cb) cb(Base::res[0], Base::res[1], Base::res[2], Base::res[3], i1); + +#define ANS2(cb) \ + ANS1(cb); \ + cb(Base::res[4], Base::res[5], Base::res[6], Base::res[7], i2); + +#define ANS4(cb) \ + ANS2(cb); \ + cb(Base::res[8], Base::res[9], Base::res[10], Base::res[11], i3); \ + cb(Base::res[12], Base::res[13], Base::res[14], Base::res[15], i4); + +__device__ __forceinline__ int pack_int8_to_int8x4(int8_t x, int8_t y, int8_t z, + int8_t w) { + int ix = static_cast(x), iy = static_cast(y), + iz = static_cast(z), iw = static_cast(w); + + asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(ix) : "r"(iy)); + asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(iz) : "r"(iw)); + asm volatile("prmt.b32 %0, %0, %1, 0x5410;" : "+r"(ix) : "r"(iz)); + return ix; +} + +template +struct MaxPoolerBase; + +template +struct MaxPoolerBase { + static constexpr int nr_results = sizeof(feed_type) / sizeof(int8_t); + int8_t res[nr_results]; + + __device__ MaxPoolerBase(int) {} + __device__ __forceinline__ void init() { +#pragma unroll + for (int i = 0; i < nr_results; ++i) { + res[i] = -128; + } + } + __device__ __forceinline__ void feed(int32_t x, int idx) { + int8_t ix = (x & 0xff); + int8_t iy = ((x >> 8) & 0xff); + int8_t iz = ((x >> 16) & 0xff); + int8_t iw = ((x >> 24) & 0xff); + res[idx] = res[idx] > ix ? res[idx] : ix; + res[idx + 1] = res[idx + 1] > iy ? res[idx + 1] : iy; + res[idx + 2] = res[idx + 2] > iz ? res[idx + 2] : iz; + res[idx + 3] = res[idx + 3] > iw ? 
res[idx + 3] : iw; + } +}; + +template +struct MaxPooler; + +#define SPEC_WITH_FEED_TYPE(_feed_type) \ + template <> \ + struct MaxPooler : MaxPoolerBase + +#define COMMON_DEFS(_feed_type) \ + using feed_type = _feed_type; \ + using Base = MaxPoolerBase; \ + using MaxPoolerBase::MaxPoolerBase; + +#define cb(_x, _y, _z, _w, _ret) \ + { _ret = pack_int8_to_int8x4(_x, _y, _z, _w); } + +SPEC_WITH_FEED_TYPE(int32_t) { + COMMON_DEFS(int32_t); + __device__ __forceinline__ void feed(int32_t x) { FEED1; } + + __device__ __forceinline__ int get_ans() { + int i1; + ANS1(cb); + return i1; + } +}; + +SPEC_WITH_FEED_TYPE(int2) { + COMMON_DEFS(int2); + __device__ __forceinline__ void feed(int2 x) { FEED2; } + __device__ __forceinline__ int2 get_ans() { + int i1, i2; + ANS2(cb); + return ::make_int2(i1, i2); + } +}; + +SPEC_WITH_FEED_TYPE(int4) { + COMMON_DEFS(int4); + __device__ __forceinline__ void feed(int4 x) { FEED4; } + + __device__ __forceinline__ int4 get_ans() { + int i1, i2, i3, i4; + ANS4(cb); + return ::make_int4(i1, i2, i3, i4); + } +}; + +#undef cb +#undef COMMON_DEFS +#undef SPEC_WITH_FEED_TYPE + +template +struct MeanIncludeRoundedPoolerBase; + +template +struct MeanIncludeRoundedPoolerBase { + static constexpr int nr_results = sizeof(feed_type) / sizeof(int8_t); + int32_t res[nr_results]; + const int count; + const float fi_count; + + __device__ MeanIncludeRoundedPoolerBase(int count) + : count{count}, fi_count{1.f / count} {} + __device__ __forceinline__ void init() { +#pragma unroll + for (int i = 0; i < nr_results; ++i) { + res[i] = 0; + } + } + + __device__ __forceinline__ void feed(int32_t x, int idx) { + int8_t ix = (x & 0xff); + int8_t iy = ((x >> 8) & 0xff); + int8_t iz = ((x >> 16) & 0xff); + int8_t iw = ((x >> 24) & 0xff); + res[idx] += static_cast(ix); + res[idx + 1] += static_cast(iy); + res[idx + 2] += static_cast(iz); + res[idx + 3] += static_cast(iw); + } +}; + +template +struct MeanIncludeRoundedPooler; + +#define SPEC_WITH_FEED_TYPE(_feed_type) \ + template <> \ + struct MeanIncludeRoundedPooler \ + : MeanIncludeRoundedPoolerBase + +#define COMMON_DEFS(_feed_type) \ + using feed_type = _feed_type; \ + using Base = MeanIncludeRoundedPoolerBase; \ + using MeanIncludeRoundedPoolerBase::MeanIncludeRoundedPoolerBase; + +#define cb(_x, _y, _z, _w, _ret) \ + { \ + float fx = roundf(static_cast(_x) * Base::fi_count); \ + float fy = roundf(static_cast(_y) * Base::fi_count); \ + float fz = roundf(static_cast(_z) * Base::fi_count); \ + float fw = roundf(static_cast(_w) * Base::fi_count); \ + _ret = transform_float4_to_int8x4(::make_float4(fx, fy, fz, fw)); \ + } + +SPEC_WITH_FEED_TYPE(int32_t) { + COMMON_DEFS(int32_t); + __device__ __forceinline__ void feed(int32_t x) { FEED1; } + + __device__ __forceinline__ int get_ans() { + int i1; + ANS1(cb); + return i1; + } +}; + +SPEC_WITH_FEED_TYPE(int2) { + COMMON_DEFS(int2); + __device__ __forceinline__ void feed(int2 x) { FEED2; } + __device__ __forceinline__ int2 get_ans() { + int i1, i2; + ANS2(cb); + return ::make_int2(i1, i2); + } +}; + +SPEC_WITH_FEED_TYPE(int4) { + COMMON_DEFS(int4); + __device__ __forceinline__ void feed(int4 x) { FEED4; } + + __device__ __forceinline__ int4 get_ans() { + int i1, i2, i3, i4; + ANS4(cb); + return ::make_int4(i1, i2, i3, i4); + } +}; + +#undef cb +#undef COMMON_DEFS +#undef SPEC_WITH_FEED_TYPE + +template +struct MeanExcludeRoundedPoolerBase; + +template +struct MeanExcludeRoundedPoolerBase { + static const int nr_results = sizeof(feed_type) / sizeof(int8_t); + int32_t res[nr_results]; + int count; + 
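    // Note: unlike MeanIncludeRoundedPoolerBase above, which divides the
    // window sum by the full window size (fi_count = 1.f / count), this
    // pooler divides by `count`, which the specializations below increment
    // once per feed() call, i.e. once per in-bounds window position. For a
    // 3x3 window at an image corner with 1-pixel padding, the include
    // variant divides by 9 while this exclude variant divides by 4.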
+ __device__ MeanExcludeRoundedPoolerBase(int /* count */) {} + __device__ __forceinline__ void init() { +#pragma unroll + for (int i = 0; i < nr_results; ++i) { + res[i] = 0; + } + count = 0; + } + + __device__ __forceinline__ void feed(int32_t x, int idx) { + int8_t ix = (x & 0xff); + int8_t iy = ((x >> 8) & 0xff); + int8_t iz = ((x >> 16) & 0xff); + int8_t iw = ((x >> 24) & 0xff); + res[idx] += static_cast(ix); + res[idx + 1] += static_cast(iy); + res[idx + 2] += static_cast(iz); + res[idx + 3] += static_cast(iw); + } +}; + +template +struct MeanExcludeRoundedPooler; + +#define SPEC_WITH_FEED_TYPE(_feed_type) \ + template <> \ + struct MeanExcludeRoundedPooler \ + : MeanExcludeRoundedPoolerBase + +#define COMMON_DEFS(_feed_type) \ + using feed_type = _feed_type; \ + using Base = MeanExcludeRoundedPoolerBase; \ + using MeanExcludeRoundedPoolerBase::MeanExcludeRoundedPoolerBase; + +#define cb(_x, _y, _z, _w, _ret) \ + { \ + float fx = roundf(static_cast(_x) / Base::count); \ + float fy = roundf(static_cast(_y) / Base::count); \ + float fz = roundf(static_cast(_z) / Base::count); \ + float fw = roundf(static_cast(_w) / Base::count); \ + _ret = transform_float4_to_int8x4(::make_float4(fx, fy, fz, fw)); \ + } + +SPEC_WITH_FEED_TYPE(int32_t) { + COMMON_DEFS(int32_t); + __device__ __forceinline__ void feed(int32_t x) { + FEED1; + count++; + } + + __device__ __forceinline__ int get_ans() { + int i1; + ANS1(cb); + return i1; + } +}; + +SPEC_WITH_FEED_TYPE(int2) { + COMMON_DEFS(int2); + __device__ __forceinline__ void feed(int2 x) { + FEED2; + count++; + } + __device__ __forceinline__ int2 get_ans() { + int i1, i2; + ANS2(cb); + return ::make_int2(i1, i2); + } +}; + +SPEC_WITH_FEED_TYPE(int4) { + COMMON_DEFS(int4); + __device__ __forceinline__ void feed(int4 x) { + FEED4; + count++; + } + + __device__ __forceinline__ int4 get_ans() { + int i1, i2, i3, i4; + ANS4(cb); + return ::make_int4(i1, i2, i3, i4); + } +}; + +#undef cb +#undef COMMON_DEFS +#undef SPEC_WITH_FEED_TYPE + +template +__global__ void pooling2d_device_template_int8_cdiv4hwn4( + const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int bidx = blockIdx.x; + const int bidy = blockIdx.y; + const int bidz = blockIdx.z; + + using ldg_type = typename Pooler::feed_type; + static int constexpr pack_size = 4; + static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t); + const int batch = (bidy * blockDim.x + tidx) * ldg_width; + const int packed_ch = bidz * blockDim.y + tidy; + const int npack = param.n * pack_size; + if (batch >= param.n || packed_ch >= param.c / pack_size) + return; + + const int ho = bidx / param.wo; + const int wo = bidx - param.wo * ho; + const int input_pixels = param.hi * param.wi; + const int output_pixels = param.ho * param.wo; + const int8_t* __restrict__ g_src_ptr = + src + batch * pack_size + packed_ch * input_pixels * npack; + int8_t* __restrict__ g_dst_ptr = dst + batch * pack_size + + packed_ch * output_pixels * npack + + (ho * param.wo + wo) * npack; + + Pooler pooler(param.window_h * param.window_w); + pooler.init(); + for (int fh = 0; fh < param.window_h; fh++) { + uint32_t ih = ho * param.sh + fh - param.ph; + for (int fw = 0; fw < param.window_w; fw++) { + uint32_t iw = wo * param.sw + fw - param.pw; + if (ih < param.hi && iw < param.wi) { + const int8_t* __restrict__ cur_src_ptr = + g_src_ptr + (ih * param.wi + iw) * npack; + ldg_type sval = + __ldg(reinterpret_cast(cur_src_ptr)); + pooler.feed(sval); + 
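                // Padding is handled implicitly: ih and iw are unsigned, so a
                // logically negative coordinate (e.g. when ho * sh + fh < ph)
                // wraps to a huge value and fails the `ih < param.hi && iw <
                // param.wi` test guarding this block; such positions are never
                // fed to the pooler, which is exactly what the
                // AVERAGE_COUNT_EXCLUDE_PADDING mode needs.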
} + } + } + ldg_type res = pooler.get_ans(); + *(reinterpret_cast(g_dst_ptr)) = res; +} +}; // namespace + +void megdnn::cuda::pooling2d::_do_pooling2d_int8_cdiv4hwn4( + const int8_t* d_src, int8_t* d_dst, const Param& param, + cudaStream_t stream, uint32_t mode) { + using Mode = megdnn::param_enumv::Pooling::Mode; + void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param); + uint32_t vthreads_x = 0, vthreads_y = param.c / 4; +#define dispatch_pooling_mode(_feed_type) \ + switch (mode) { \ + case Mode::MAX: \ + kern = pooling2d_device_template_int8_cdiv4hwn4< \ + MaxPooler>; \ + break; \ + case Mode::AVERAGE: \ + kern = pooling2d_device_template_int8_cdiv4hwn4< \ + MeanIncludeRoundedPooler>; \ + break; \ + case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: \ + kern = pooling2d_device_template_int8_cdiv4hwn4< \ + MeanExcludeRoundedPooler>; \ + break; \ + default: \ + megdnn_assert(false, "invalid pooling mode"); \ + } + if (param.n % 4 == 0) { + dispatch_pooling_mode(int4); + vthreads_x = param.n / 4; + } else if (param.n % 2 == 0) { + dispatch_pooling_mode(int2); + vthreads_x = param.n / 2; + } else { + dispatch_pooling_mode(int32_t); + vthreads_x = param.n; + } +#undef dispatch_pooling_mode + constexpr uint32_t threads_x = 16; + uint32_t nr_threads = + _get_kern_block_size(reinterpret_cast(kern)); + uint32_t nr_threads_x = std::min(threads_x, vthreads_x), + nr_threads_y = std::min(nr_threads / nr_threads_x, vthreads_y); + uint32_t nr_blocks_x = param.ho * param.wo, + nr_blocks_y = DIVUP(vthreads_x, nr_threads_x), + nr_blocks_z = DIVUP(vthreads_y, nr_threads_y); + dim3 threads{nr_threads_x, nr_threads_y, 1}; + dim3 blocks{nr_blocks_x, nr_blocks_y, nr_blocks_z}; + kern<<>>(d_src, d_dst, param); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cuh b/dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cuh new file mode 100644 index 00000000..6e709eed --- /dev/null +++ b/dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cuh @@ -0,0 +1,34 @@ +/** + * \file dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace pooling2d { + +struct Param { + int n, c, hi, wi, ho, wo, ph, pw, window_h, window_w, sh, sw; +}; + +uint32_t _get_kern_block_size(const void* kern); + +void _do_pooling2d_int8_cdiv4hwn4(const int8_t* d_src, int8_t* d_dst, + const Param& param, cudaStream_t stream, + uint32_t mode); + +} // namespace pooling2d +} // namespace cuda +} // namespace megdnn + + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/powc/kern.cu b/dnn/src/cuda/powc/kern.cu new file mode 100644 index 00000000..1882486b --- /dev/null +++ b/dnn/src/cuda/powc/kern.cu @@ -0,0 +1,231 @@ +/** + * \file dnn/src/cuda/powc/kern.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
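 *
 * (A brief aside on _do_pooling2d_int8_cdiv4hwn4 in the pooling launcher that
 * precedes this file: it picks the per-thread load width from the batch size
 * so that each thread issues a single 128-, 64- or 32-bit __ldg. A hedged
 * sketch, with a helper name of our own choosing:)
 * \code
 *   int ldg_batches(int n) {
 *       if (n % 4 == 0) return 4;  // int4 loads: 4 batches x 4 packed channels
 *       if (n % 2 == 0) return 2;  // int2 loads: 2 batches x 4 packed channels
 *       return 1;                  // int32 loads: 1 batch x 4 packed channels
 *   }
 * \endcode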
+ */ + +#include "./kern.cuh" +#include "megdnn/dtype.h" +#include "src/cuda/elemwise_helper.cuh" + +using namespace megdnn; +using namespace cuda; + +#include +#include + +// use a namespace (but not anonymous namespace) to avoid name confliction while +// maintaining readability of cuda kernel names +namespace cuda_kern { + +template +struct PowCIntSmall; + +template <> +struct PowCIntSmall<0> { + template + static __device__ __forceinline__ T apply(T) { + return static_cast(1); + } +}; +template <> +struct PowCIntSmall<1> { + template + static __device__ __forceinline__ T apply(T x) { + return x; + } +}; +template <> +struct PowCIntSmall<2> { + template + static __device__ __forceinline__ T apply(T x) { + return x * x; + } +}; +template <> +struct PowCIntSmall<3> { + template + static __device__ __forceinline__ T apply(T x) { + return x * x * x; + } +}; +template <> +struct PowCIntSmall<4> { + template + static __device__ __forceinline__ T apply(T x) { + x = x * x; + return x * x; + } +}; +template +struct PowCIntSmall { + template + static __device__ __forceinline__ T apply(T x) { + return PowCIntSmall<-n>::apply(static_cast(1) / x); + } +}; + +template +struct PowCIntOdd { + T exp; + + __device__ __forceinline__ T apply(T x) { + return static_cast(copysignf(powf(fabsf(x), exp), x)); + } +}; + +template +struct PowCIntEven { + T exp; + + __device__ __forceinline__ T apply(T x) { + return static_cast(powf(fabsf(x), exp)); + } +}; + +struct PowCFloatSqrt { + template + static __device__ __forceinline__ T apply(T x) { + return static_cast(sqrtf(x)); + } +}; + +struct PowCFloatCbrt { + template + static __device__ __forceinline__ T apply(T x) { + return static_cast(cbrtf(x)); + } +}; + +struct PowCFloatRSqrt { + template + static __device__ __forceinline__ T apply(T x) { + return static_cast(rsqrtf(x)); + } +}; + +struct PowCFloatRCbrt { + template + static __device__ __forceinline__ T apply(T x) { + return static_cast(rcbrtf(x)); + } +}; + +template +struct PowCFloat { + T exp; + + __device__ __forceinline__ T apply(T x) { + return static_cast(powf(x, exp)); + } +}; + +template +struct PowCOp { + T* dest; + PowOp pow_op; + + __device__ __forceinline__ void operator()(uint32_t idx, T src) { + dest[idx] = pow_op.apply(src); + } +}; + +} // namespace cuda_kern + +using namespace cuda_kern; + +namespace { + +template +void invoke(const TensorND& dest, const TensorND& src, PowOp pow_op, + cudaStream_t stream) { + ElemwiseOpParamN<1> param; + param[0] = src; + param.init_from_given_tensor(); + typedef PowCOp Op; + Op op; + op.dest = dest.ptr(); + op.pow_op = pow_op; + run_elemwise(param, stream, op); +} + +bool feq(float a, float b) { + return std::abs(a - b) < std::numeric_limits::epsilon(); +} + +template +void dispatch_op(const TensorND& dest, const TensorND& src, const float* exp_f, + const int* exp_i, cudaStream_t stream) { +#define CALL(_op) invoke(dest, src, _op, stream) + if (exp_f) { + float exp = *exp_f; +#define CALL_IF(_v, _op) \ + do { \ + if (feq(exp, _v)) { \ + CALL(_op); \ + return; \ + } \ + } while (0) + CALL_IF(.5f, PowCFloatSqrt()); + CALL_IF(1.f / 3.f, PowCFloatCbrt()); + CALL_IF(-.5f, PowCFloatRSqrt()); + CALL_IF(-1.f / 3.f, PowCFloatRCbrt()); + + PowCFloat op; + op.exp = exp; + CALL(op); + return; +#undef CALL_IF + } + + int exp = *exp_i; + switch (exp) { +#define CASE(v) \ + case v: \ + CALL(PowCIntSmall()); \ + return + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(-1); + CASE(-2); + CASE(-3); + CASE(-4); +#undef CASE + } + if (exp & 1) { + PowCIntOdd op; 
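        // Odd integer exponents must preserve the sign of a negative base
        // (e.g. (-2)^3 == -8), which PowCIntOdd does via copysignf around
        // powf(|x|, exp); even exponents always give |x|^exp, handled by
        // PowCIntEven in the else branch below.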
+ op.exp = exp; + CALL(op); + } else { + PowCIntEven op; + op.exp = exp; + CALL(op); + } +#undef CALL +} +} // anonymous namespace + +void cuda::powc_kern(const TensorND& dest, const TensorND& src, + const float* exp_f, const int* exp_i, + cudaStream_t stream) { + switch (src.layout.dtype.enumv().ev) { +#define cb(dt) \ + case DTypeTrait
::enumv: \ + return dispatch_op::ctype>(dest, src, exp_f, exp_i, \ + stream); + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + default: + megdnn_throw("unsupported dtype for PowC"); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/powc/kern.cuh b/dnn/src/cuda/powc/kern.cuh new file mode 100644 index 00000000..e9502b9b --- /dev/null +++ b/dnn/src/cuda/powc/kern.cuh @@ -0,0 +1,24 @@ +/** + * \file dnn/src/cuda/powc/kern.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "megdnn/basic_types.h" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { + +void powc_kern(const TensorND& dest, const TensorND& src, const float* exp_f, + const int* exp_i, cudaStream_t stream); + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/powc/opr_impl.cpp b/dnn/src/cuda/powc/opr_impl.cpp new file mode 100644 index 00000000..e6bb9235 --- /dev/null +++ b/dnn/src/cuda/powc/opr_impl.cpp @@ -0,0 +1,25 @@ +/** + * \file dnn/src/cuda/powc/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./opr_impl.h" +#include "./kern.cuh" + +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +void PowCImpl::do_exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + const float* exp_f, const int* exp_i) { + powc_kern(dst, src, exp_f, exp_i, cuda_stream(handle())); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/powc/opr_impl.h b/dnn/src/cuda/powc/opr_impl.h new file mode 100644 index 00000000..3f2e13f5 --- /dev/null +++ b/dnn/src/cuda/powc/opr_impl.h @@ -0,0 +1,29 @@ +/** + * \file dnn/src/cuda/powc/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/oprs/general.h" + +namespace megdnn { +namespace cuda { + +class PowCImpl final : public PowC { +public: + using PowC::PowC; + void do_exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, + const float* exp_f, const int* exp_i) override; +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/cuda/query_blocksize.cpp b/dnn/src/cuda/query_blocksize.cpp new file mode 100644 index 00000000..3a60c5be --- /dev/null +++ b/dnn/src/cuda/query_blocksize.cpp @@ -0,0 +1,57 @@ +/** + * \file dnn/src/cuda/query_blocksize.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
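 *
 * (A brief aside on the PowC dispatch that precedes this file: dispatch_op
 * assumes exactly one of exp_f / exp_i is non-null, and the kernels above
 * reduce to roughly the following CPU reference, ignoring dtype dispatch and
 * the fast paths for sqrt/cbrt and small integer exponents. The helper name
 * is ours.)
 * \code
 *   #include <cmath>
 *   void powc_ref(const float* src, float* dst, size_t n,
 *                 const float* exp_f, const int* exp_i) {
 *       float e = exp_f ? *exp_f : static_cast<float>(*exp_i);
 *       for (size_t i = 0; i < n; ++i)
 *           dst[i] = std::pow(src[i], e);
 *   }
 * \endcode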
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./query_blocksize.cuh" +#include "src/cuda/utils.h" + +#include +#include + +using namespace megdnn; +using namespace cuda; + +namespace { + +size_t hash_pair_combine(size_t x, size_t y) { + return y + 0x9e3779b9 + (x << 6) + (x >> 2); +} + +//! stupid committee has no pair hash. Let's do it for them +struct pairhash { +public: + template + size_t operator()(const std::pair& x) const { + return hash_pair_combine(std::hash{}(x.first), + std::hash{}(x.second)); + } +}; +} // anonymous namespace + +LaunchConfig cuda::query_launch_config_for_kernel(const void* kern, + const SmemGetter& smem) { + static std::mutex mtx; + static std::unordered_map, LaunchConfig, + pairhash> + cache; + std::lock_guard _lock{mtx}; + + int device = -1; + cuda_check(cudaGetDevice(&device)); + auto ins = cache.insert({{device, kern}, LaunchConfig{}}); + if (ins.second) { + ins.first->second = + detail::query_launch_config_for_kernel_uncached(kern, smem); + } + return ins.first->second; +} + +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} + diff --git a/dnn/src/cuda/query_blocksize.cuh b/dnn/src/cuda/query_blocksize.cuh new file mode 100644 index 00000000..0c438c3d --- /dev/null +++ b/dnn/src/cuda/query_blocksize.cuh @@ -0,0 +1,60 @@ +/** + * \file dnn/src/cuda/query_blocksize.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +namespace megdnn { +namespace cuda { + +struct LaunchConfig { + int grid_size; //!< minimal grid size + int block_size; //!< suggested block size +}; + +//! get shared mem size given block size +struct SmemGetter { + typedef int (*func_t)(int block_size, void* user_data); + func_t func; + void* user_data; + + SmemGetter(func_t func_ = 0, void* user_data_ = 0) + : func(func_), user_data(user_data_) {} +}; + +/*! + * \brief cudaOccupancyMaxPotentialBlockSize only available when compiled by + * nvcc; so we need to wrap this function and expose it to normal c++ + * + * Note that the result is cached for kernel ptr. + */ +LaunchConfig query_launch_config_for_kernel( + const void* kern, const SmemGetter& smem = SmemGetter()); + +//! 
return block size only +static inline int query_blocksize_for_kernel(const void* kern) { + return query_launch_config_for_kernel(kern).block_size; +} + +template +static inline int query_blocksize_for_kernel(T kern) { + return query_blocksize_for_kernel(reinterpret_cast(kern)); +} + +namespace detail { +LaunchConfig query_launch_config_for_kernel_uncached(const void* kern, + const SmemGetter& smem); +} + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} + diff --git a/dnn/src/cuda/query_blocksize_impl.cu b/dnn/src/cuda/query_blocksize_impl.cu new file mode 100644 index 00000000..fa0d61de --- /dev/null +++ b/dnn/src/cuda/query_blocksize_impl.cu @@ -0,0 +1,55 @@ +/** + * \file dnn/src/cuda/query_blocksize_impl.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/query_blocksize.cuh" +#include "src/cuda/utils.cuh" + +using namespace megdnn; +using namespace cuda; + +/* + * Note: cudaOccupancyMaxPotentialBlockSizeVariableSMem is only available when + * compiled by nvcc, but it is implemented as a __host__ __device__ function. So + * we implement a device wrapper + */ +namespace { + +struct SmemGetterWrapper { + SmemGetter getter; + + __device__ __host__ int operator()(int block_size) const { +#if __CUDA_ARCH__ + // device func should never be called + int* ptr = 0; + *ptr = 23; +#else + if (getter.func) { + return getter.func(block_size, getter.user_data); + } +#endif + return 0; + } +}; + +} // anonymous namespace + +LaunchConfig cuda::detail::query_launch_config_for_kernel_uncached( + const void* kern, const SmemGetter& smem) { + SmemGetterWrapper s; + s.getter = smem; + LaunchConfig ret; + cuda_check(cudaOccupancyMaxPotentialBlockSizeVariableSMem( + &ret.grid_size, &ret.block_size, kern, s)); + return ret; +} + +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} + diff --git a/dnn/src/cuda/reduce/opr_impl.cpp b/dnn/src/cuda/reduce/opr_impl.cpp new file mode 100644 index 00000000..b1ec8b54 --- /dev/null +++ b/dnn/src/cuda/reduce/opr_impl.cpp @@ -0,0 +1,162 @@ +/** + * \file dnn/src/cuda/reduce/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/cuda/reduce/opr_impl.h" +#include "src/cuda/reduce_helper.cuh" + +#include "src/cuda/handle.h" +#include "src/cuda/utils.h" + +#include "src/common/reduce_helper.h" + +namespace { + +using namespace megdnn; +using namespace cuda; + +template