/*
 * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee. Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users. These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

#ifndef _CUDA_AWBARRIER_HELPERS_H_
#define _CUDA_AWBARRIER_HELPERS_H_

#define _CUDA_AWBARRIER_NAMESPACE       nvcuda::experimental
#define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
#define _CUDA_AWBARRIER_END_NAMESPACE   } }

#define _CUDA_AWBARRIER_INTERNAL_NAMESPACE       _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
#define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
#define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE   } _CUDA_AWBARRIER_END_NAMESPACE

#if !defined(_CUDA_AWBARRIER_QUALIFIER)
# define _CUDA_AWBARRIER_QUALIFIER inline __device__
#endif
#if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
# define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
#endif

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
# define _CUDA_AWBARRIER_ARCH_700_OR_LATER
#endif

#define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)

/* SM 8.0 and later provide the hardware mbarrier instruction; older
 * architectures use the software emulation further below. */
#if (__CUDA_ARCH__ >= 800)
# define _CUDA_AWBARRIER_HAS_HW_MBARRIER 1
#else
# define _CUDA_AWBARRIER_HAS_HW_MBARRIER 0
#endif

#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
#endif

#if !defined(_CUDA_AWBARRIER_DEBUG)
# if defined(__CUDACC_DEBUG__)
#  define _CUDA_AWBARRIER_DEBUG 1
# else
#  define _CUDA_AWBARRIER_DEBUG 0
# endif
#endif

#if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
# if !defined(__CUDACC_RTC__)
#  include <cassert>
# endif
# define _CUDA_AWBARRIER_ASSERT(x) assert((x));
# define _CUDA_AWBARRIER_ABORT() assert(0);
#else
# define _CUDA_AWBARRIER_ASSERT(x)
# define _CUDA_AWBARRIER_ABORT() __trap();
#endif

#if defined(_MSC_VER) && !defined(_WIN64)
# define _CUDA_AWBARRIER_ASM_PTR_CONSTRAINT "r"
#else
# define _CUDA_AWBARRIER_ASM_PTR_CONSTRAINT "l"
#endif

#if defined(__CUDACC_RTC__)
typedef unsigned short     uint16_t;
typedef unsigned int       uint32_t;
typedef unsigned long long uint64_t;
typedef uint64_t           uintptr_t;
#else
# include <stdint.h>
#endif

#if defined(_CUDA_AWBARRIER_ARCH_700_OR_LATER)

_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE

extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);

/* Dispatches between the hardware mbarrier implementation (true, SM 8.0+)
 * and the software emulation in shared memory (false). */
template<bool>
struct ImplementationChooser;

/* Hardware mbarrier implementation (SM 8.0 and later). */
template<>
struct ImplementationChooser<true> {
    _CUDA_AWBARRIER_STATIC_QUALIFIER
    void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
        _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));

        asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
                :
                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
                : "memory");
    }

    _CUDA_AWBARRIER_STATIC_QUALIFIER
    void awbarrier_inval(uint64_t* barrier)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

        asm volatile ("mbarrier.inval.shared.b64 [%0];"
                :
                : "r"(__nvvm_get_smem_pointer(barrier))
                : "memory");
    }

    _CUDA_AWBARRIER_STATIC_QUALIFIER
    uint32_t awbarrier_token_pending_count(uint64_t token)
    {
        uint32_t pending_count;

        asm ("mbarrier.pending_count.b64 %0, %1;"
                : "=r"(pending_count)
                : "l"(token));
        return pending_count;
    }

    template<bool Drop>
    _CUDA_AWBARRIER_STATIC_QUALIFIER
    uint64_t awbarrier_arrive_drop(uint64_t* barrier)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

        uint64_t token;

        if (Drop) {
            asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
                    : "=l"(token)
                    : "r"(__nvvm_get_smem_pointer(barrier))
                    : "memory");
        } else {
            asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
                    : "=l"(token)
                    : "r"(__nvvm_get_smem_pointer(barrier))
                    : "memory");
        }

        return token;
    }

    template<bool Drop>
    _CUDA_AWBARRIER_STATIC_QUALIFIER
    uint64_t
    awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
        _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));

        uint64_t token;

        if (Drop) {
            asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
                    : "=l"(token)
                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
                    : "memory");
        } else {
            asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
                    : "=l"(token)
                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
                    : "memory");
        }

        return token;
    }

    _CUDA_AWBARRIER_STATIC_QUALIFIER
    bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

        uint16_t wait_complete;

        asm volatile ("{"
                "    .reg .pred %%p;"
                "    mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
                "    selp.u16 %0, 1, 0, %%p;"
                "}"
                : "=h"(wait_complete)
                : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
                : "memory");
        return bool(wait_complete);
    }
};

/* Software emulation for SM 7.x: the 64-bit barrier word is split into an
 * expected-arrival counter (low word) and a pending-arrival counter (high
 * word). The top bit of the pending counter serves as the phase bit that
 * awbarrier_test_wait compares against the token. */
template<>
struct ImplementationChooser<false> {
    union AWBarrier {
        struct {
            uint32_t expected;
            uint32_t pending;
        } split;
        uint64_t raw;
    };

    _CUDA_AWBARRIER_STATIC_QUALIFIER
    void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
        _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));

        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);

        awbarrier->split.expected = 0x40000000 - expected_count;
        awbarrier->split.pending = 0x80000000 - expected_count;
    }

    _CUDA_AWBARRIER_STATIC_QUALIFIER
    void awbarrier_inval(uint64_t* barrier)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
    }

    _CUDA_AWBARRIER_STATIC_QUALIFIER
    uint32_t awbarrier_token_pending_count(uint64_t token)
    {
        const uint32_t pending = token >> 32;
        return 0x80000000 - (pending & 0x7fffffff);
    }

    template<bool Drop>
    _CUDA_AWBARRIER_STATIC_QUALIFIER
    uint64_t awbarrier_arrive_drop(uint64_t* barrier)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);

        /* Wait for any in-progress phase reset to complete. */
        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);

        if (Drop) {
            (void)atomicAdd_block(&awbarrier->split.expected, 1);
        }

        __threadfence_block();

        const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
        const uint32_t new_pending = old_pending + 1;
        const bool reset = (old_pending ^ new_pending) & 0x80000000;

        if (reset) {
            __threadfence_block();

            uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
            new_expected &= ~0x40000000;
            if (new_expected & 0x20000000) {
                new_expected |= 0x40000000;
            }
            atomicAdd_block(&awbarrier->split.pending, new_expected);
        }

        return static_cast<uint64_t>(old_pending) << 32;
    }

    template<bool Drop>
    _CUDA_AWBARRIER_STATIC_QUALIFIER
    uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
        _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));

        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);

        /* Wait for any in-progress phase reset to complete. */
        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);

        if (Drop) {
            (void)atomicAdd_block(&awbarrier->split.expected, count);
        }

        return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
    }

    _CUDA_AWBARRIER_STATIC_QUALIFIER
    bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
    {
        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

        volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);

        return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
    }
};

_CUDA_AWBARRIER_QUALIFIER
void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
{
    ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_init(barrier, expected_count);
}
_CUDA_AWBARRIER_QUALIFIER
void awbarrier_inval(uint64_t* barrier)
{
    ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_inval(barrier);
}

_CUDA_AWBARRIER_QUALIFIER
uint32_t awbarrier_token_pending_count(uint64_t token)
{
    return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_token_pending_count(token);
}

template<bool Drop>
_CUDA_AWBARRIER_QUALIFIER
uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count)
{
    return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_arrive_drop_no_complete<Drop>(barrier, arrive_count);
}

template<bool Drop>
_CUDA_AWBARRIER_QUALIFIER
uint64_t awbarrier_arrive_drop(uint64_t* barrier)
{
    return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_arrive_drop<Drop>(barrier);
}

_CUDA_AWBARRIER_QUALIFIER
bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
{
    return ImplementationChooser<_CUDA_AWBARRIER_HAS_HW_MBARRIER>::awbarrier_test_wait(barrier, token);
}

_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE

#endif /* _CUDA_AWBARRIER_ARCH_700_OR_LATER */

#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
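
/*
 * Usage sketch (illustrative only; not part of the original header). It shows
 * how the internal helpers above could drive a single arrive/wait phase of a
 * block-wide barrier placed in shared memory, assuming this file is included
 * as "cuda_awbarrier_helpers.h". Real code would normally go through the
 * public nvcuda::experimental::awbarrier wrapper in cuda_awbarrier.h rather
 * than calling these internals directly; the kernel name below is
 * hypothetical.
 *
 *   #include "cuda_awbarrier_helpers.h"
 *
 *   __global__ void one_phase_kernel() {
 *       namespace awb = nvcuda::experimental::__awbarrier_internal;
 *       __shared__ uint64_t barrier;
 *
 *       // One thread initializes the barrier; expected_count is the number
 *       // of threads that will arrive in each phase.
 *       if (threadIdx.x == 0) {
 *           awb::awbarrier_init(&barrier, blockDim.x);
 *       }
 *       __syncthreads();
 *
 *       // Every thread arrives (Drop = false) and keeps its phase token.
 *       uint64_t token = awb::awbarrier_arrive_drop<false>(&barrier);
 *
 *       // ...independent work can overlap with other threads' arrivals...
 *
 *       // Poll until the phase identified by the token has completed.
 *       while (!awb::awbarrier_test_wait(&barrier, token)) {
 *       }
 *
 *       // Invalidate before the shared-memory slot is reused for other data.
 *       if (threadIdx.x == 0) {
 *           awb::awbarrier_inval(&barrier);
 *       }
 *   }
 */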