// Condition: MSVC (_MSC_VER) targeting x64, x86, ARM64, or ARM.
// The lines guarded by this #if are not visible in this excerpt.
23#if defined(_MSC_VER) && \
24 (defined(_M_AMD64) || defined(_M_IX86) || defined(_M_ARM64) || defined(_M_ARM))
// Library version, split into major.minor.patch components (1.5.1 here).
30#define DISPENSO_MAJOR_VERSION 1
31#define DISPENSO_MINOR_VERSION 5
32#define DISPENSO_PATCH_VERSION 1
// Feature switch for C++20 concepts: set to 1 when the language level is C++20 or newer and
// the __cpp_concepts feature-test macro reports at least 201907L.
35#if __cplusplus >= 202002L && defined(__cpp_concepts) && __cpp_concepts >= 201907L
36#define DISPENSO_HAS_CONCEPTS 1
// Fallback value; presumably sits under an #else that is not visible in this excerpt.
39#define DISPENSO_HAS_CONCEPTS 0
// DISPENSO_REQUIRES(...) wraps its arguments in a requires-clause when concepts are enabled.
56#if DISPENSO_HAS_CONCEPTS
57#define DISPENSO_REQUIRES(...) requires(__VA_ARGS__)
// Otherwise it expands to nothing, so constrained declarations stay valid without concepts
// (presumably under an #else that is not visible in this excerpt).
59#define DISPENSO_REQUIRES(...)
// DISPENSO_DEPRECATED(msg) marks a declaration with the standard [[deprecated]] attribute when
// __cplusplus reports C++17 or newer.
// NOTE(review): [[deprecated]] was standardized in C++14, so the 201703L threshold is stricter
// than the language requires -- confirm whether that is intentional.
73#if __cplusplus >= 201703L
74#define DISPENSO_DEPRECATED(msg) [[deprecated(msg)]]
// Fallback: expands to nothing (presumably under an #else not visible in this excerpt).
76#define DISPENSO_DEPRECATED(msg)
// DISPENSO_DLL_ACCESS: symbol export/import annotation, only given a non-empty value when
// DISPENSO_SHARED_LIB is defined.
79#if defined(DISPENSO_SHARED_LIB)
// dllexport when DISPENSO_LIB_EXPORT is defined, dllimport otherwise. The outer condition that
// selects this __declspec branch is not visible in this excerpt (presumably a Windows check).
82#if defined(DISPENSO_LIB_EXPORT)
83#define DISPENSO_DLL_ACCESS __declspec(dllexport)
85#define DISPENSO_DLL_ACCESS __declspec(dllimport)
// Clang/GCC: give the symbol default visibility.
88#elif defined(__clang__) || defined(__GNUC__)
89#define DISPENSO_DLL_ACCESS __attribute__((visibility("default")))
// If nothing above defined the macro, make it expand to nothing.
93#if !defined(DISPENSO_DLL_ACCESS)
94#define DISPENSO_DLL_ACCESS
// DISPENSO_NO_THREAD_SAFETY_ANALYSIS: under clang, applies the no_thread_safety_analysis
// attribute to the annotated declaration.
100#if defined(__clang__)
101#define DISPENSO_NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis))
// Fallback: expands to nothing (presumably under an #else not visible in this excerpt).
103#define DISPENSO_NO_THREAD_SAFETY_ANALYSIS
// Signed integer type with the same width as std::size_t.
106using ssize_t = std::make_signed<std::size_t>::type;
// DISPENSO_INLINE: strongest available "force inline" spelling per compiler.
// CUDA compiler: also marks the function as callable from both host and device.
108#if defined(__CUDACC__)
109#define DISPENSO_INLINE __host__ __device__ __forceinline__
// Clang/GCC: always_inline attribute plus the inline keyword.
110#elif defined(__clang__) || defined(__GNUC__)
111#define DISPENSO_INLINE __attribute__((always_inline)) inline
// MSVC and the Intel compiler: __forceinline.
112#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
113#define DISPENSO_INLINE __forceinline
// Fallback: plain inline (presumably under an #else not visible in this excerpt).
115#define DISPENSO_INLINE inline
// Special case for Apple arm64 targets. The lines it guards are not visible in this excerpt;
// presumably they choose the value of kCacheLineSize -- TODO confirm.
122#if defined(__APPLE__) && defined(__arm64__)
// Aligns the annotated entity to kCacheLineSize (defined on lines not visible here).
140#define DISPENSO_CACHELINE_ALIGNED alignas(kCacheLineSize)
// DISPENSO_THREAD_LOCAL: compiler-specific thread-local storage specifier.
// __declspec(thread) variant; its selecting #if is not visible in this excerpt.
149#define DISPENSO_THREAD_LOCAL __declspec(thread)
// GCC/Clang: __thread.
150#elif defined(__GNUC__) || defined(__clang__)
151#define DISPENSO_THREAD_LOCAL __thread
// Any other compiler stops the build; per the message, thread_local is an acceptable definition
// when no lighter-weight mechanism exists.
153#error Supply lightweight thread-locals for this compiler. Can define to thread_local if lightweight not available
// DISPENSO_EXPECT(a, b): branch-prediction hint. On GCC/Clang it forwards to __builtin_expect,
// which evaluates to `a` while telling the compiler that `a` is expected to equal `b`.
156#if (defined(__GNUC__) || defined(__clang__))
157#define DISPENSO_EXPECT(a, b) __builtin_expect(a, b)
// Fallback: evaluates to `a` and ignores the hint. `a` is not parenthesized in the expansion.
159#define DISPENSO_EXPECT(a, b) a
// Portable warning-suppression macros. GCC/Clang branch: built on _Pragma.
163#if (defined(__GNUC__) || defined(__clang__))
// DO_PRAGMA stringizes its argument so that a pragma can be produced from a macro expansion.
164#define DO_PRAGMA(X) _Pragma(#X)
165#define DISPENSO_DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push)
166#define DISPENSO_DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop)
// Takes the warning flag as a bare token sequence (e.g. -Wfoo) and stringizes it.
167#define DISPENSO_DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored #warningName)
// The two named suppressions below are only active under clang; for non-clang compilers in
// this branch they expand to nothing.
168#if !defined(__clang__)
169#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
170#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
// Clang definitions (presumably under an #else that is not visible in this excerpt).
172#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS \
173 DISPENSO_DISABLE_WARNING(-Wgnu-zero-variadic-macro-arguments)
174#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS \
175 DISPENSO_DISABLE_WARNING(-Wglobal-constructors)
// MSVC branch: uses __pragma(warning(...)); DISPENSO_DISABLE_WARNING takes a warning number.
177#elif defined(_MSC_VER)
178#define DISPENSO_DISABLE_WARNING_PUSH __pragma(warning(push))
179#define DISPENSO_DISABLE_WARNING_POP __pragma(warning(pop))
180#define DISPENSO_DISABLE_WARNING(warningNumber) __pragma(warning(disable : warningNumber))
181#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
182#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
// Fallback for other compilers: everything expands to nothing.
// NOTE(review): no fallback definition of DISPENSO_DISABLE_WARNING(...) is visible among these
// lines -- confirm that it is never used directly on such compilers.
184#define DISPENSO_DISABLE_WARNING_PUSH
185#define DISPENSO_DISABLE_WARNING_POP
186#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
187#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
// Implicit conversion to a const reference to T. The function body and the header of the
// enclosing type are not visible in this excerpt.
208 operator const T&()
const {
// Raw storage member: a char array of sizeof(T) bytes, aligned to alignof(T).
220 alignas(
alignof(T))
char b[
sizeof(T)];
// An atomic pointer to T whose storage is aligned to kCacheLineSize. It publicly inherits
// std::atomic<T*> and adds no members of its own. The template header introducing T is not
// visible in this excerpt.
224struct alignas(kCacheLineSize)
AlignedAtomic :
public std::atomic<T*> {};
// Allocates a block via ::malloc, requesting `alignment` bytes more than `bytes`, and returns an
// address derived from it. Several original lines of this function (between the mask
// computation and the return) are not visible in this excerpt, so the exact adjustment of
// `base` cannot be confirmed here.
//   bytes     - number of usable bytes requested.
//   alignment - requested alignment; raised to at least sizeof(uintptr_t).
// Returns the adjusted address as void*.
226inline void* alignedMalloc(
size_t bytes,
size_t alignment) {
// Presumably ensures there is room for a uintptr_t bookkeeping slot below the returned address.
227 alignment = std::max(alignment,
sizeof(uintptr_t));
// NOTE(review): no check of the ::malloc result is visible; confirm how allocation failure is
// handled. `bytes + alignment` is also computed without a visible overflow check.
228 char* ptr =
reinterpret_cast<char*
>(::malloc(bytes + alignment));
229 uintptr_t base =
reinterpret_cast<uintptr_t
>(ptr);
// Remember the address malloc returned before `base` is adjusted.
230 uintptr_t oldBase = base;
// Using alignment - 1 as a mask presumably assumes `alignment` is a power of two -- TODO confirm.
231 uintptr_t mask = alignment - 1;
// Points at the uintptr_t slot immediately below `base`. alignedFree reads that same slot to
// recover the pointer to pass to ::free; the store into it is not visible in this excerpt.
235 uintptr_t* recovery =
reinterpret_cast<uintptr_t*
>(base -
sizeof(uintptr_t));
237 return reinterpret_cast<void*
>(base);
// Convenience overload: allocates `bytes` bytes using kCacheLineSize as the alignment.
240inline void* alignedMalloc(
size_t bytes) {
241 return alignedMalloc(bytes, kCacheLineSize);
// Releases memory identified by `ptr`: reads the uintptr_t stored immediately below `ptr` and
// passes that value to ::free.
// NOTE(review): the lines between the signature and the first visible statement are not shown
// in this excerpt; confirm whether a null `ptr` is handled there.
244inline void alignedFree(
void* ptr) {
248 char* p =
reinterpret_cast<char*
>(ptr);
// Load the address saved in the slot just before the pointer handed to the caller.
249 uintptr_t recovered = *
reinterpret_cast<uintptr_t*
>(p -
sizeof(uintptr_t));
250 ::free(
reinterpret_cast<void*
>(recovered));
// Deleter functor that releases a T* through detail::alignedFree; makeAligned and make_shared
// use it as their smart-pointer deleter. The template header and one line of operator() are
// not visible in this excerpt (presumably the destructor call -- TODO confirm).
254struct AlignedFreeDeleter {
255 void operator()(T* ptr) {
257 detail::alignedFree(ptr);
// Specialization for void: the visible body only releases the memory via detail::alignedFree.
261struct AlignedFreeDeleter<void> {
262 void operator()(
void* ptr) {
263 detail::alignedFree(ptr);
// Deleter functor for arrays obtained from makeAlignedArray. It iterates `count` times and then
// releases the storage via detail::alignedFree. The template header, the `count` member, and
// the loop body are not visible in this excerpt (the loop presumably destroys each element --
// TODO confirm).
270struct AlignedArrayFreeDeleter {
272 void operator()(T* ptr) {
273 for (
size_t i = 0; i < count; ++i) {
276 detail::alignedFree(ptr);
// Creates an array of `n` objects of type T in storage obtained from detail::alignedMalloc with
// alignof(T) alignment, owned by a unique_ptr whose deleter is constructed with the count `n`.
// The template header and the loop body are not visible in this excerpt (the loop presumably
// constructs each element -- TODO confirm).
283std::unique_ptr<T[], AlignedArrayFreeDeleter<T>> makeAlignedArray(
size_t n) {
// NOTE(review): sizeof(T) * n is computed without a visible overflow check.
284 void* raw = detail::alignedMalloc(
sizeof(T) * n,
alignof(T));
285 T* arr =
static_cast<T*
>(raw);
286 for (
size_t i = 0; i < n; ++i) {
289 return std::unique_ptr<T[], AlignedArrayFreeDeleter<T>>(arr, AlignedArrayFreeDeleter<T>{n});
// Constructs a single T in storage from detail::alignedMalloc (sizeof(T) bytes, alignof(T)
// alignment), perfectly forwarding `args` to T's constructor, and returns it in a unique_ptr
// that uses AlignedFreeDeleter<T>.
293template <
typename T,
class... Args>
294std::unique_ptr<T, AlignedFreeDeleter<T>> makeAligned(Args&&... args) {
295 void* raw = detail::alignedMalloc(
sizeof(T),
alignof(T));
// Placement-new into the raw storage.
// NOTE(review): if T's constructor throws, `raw` is not released by any visible code.
296 T* obj =
new (raw) T(std::forward<Args>(args)...);
297 return std::unique_ptr<T, AlignedFreeDeleter<T>>(obj);
// Like std::make_shared in purpose, but the object lives in storage from alignedMalloc
// (sizeof(T) bytes, alignof(T) alignment) and the returned shared_ptr uses AlignedFreeDeleter<T>
// as its deleter. `args` are perfectly forwarded to T's constructor.
300template <
typename T,
class... Args>
301std::shared_ptr<T> make_shared(Args&&... args) {
302 void* tv = alignedMalloc(
sizeof(T),
alignof(T));
// Placement-new into the raw storage.
// NOTE(review): if T's constructor throws, `tv` is not released by any visible code.
303 T* t =
new (tv) T(std::forward<Args>(args)...);
304 return std::shared_ptr<T>(t, AlignedFreeDeleter<T>());
// constexpr helper that maps a uintptr_t value to a uintptr_t result. The body is not visible in
// this excerpt; presumably it rounds `val` to a cache-line boundary -- TODO confirm direction.
307inline constexpr uintptr_t alignToCacheLine(uintptr_t val) {
// cpuRelax(): per-architecture hint intended for use inside spin-wait loops. Each variant below
// is selected by the preceding preprocessor condition; closing braces and some bodies are not
// visible in this excerpt.
// x86 / x86-64 with GCC-style inline asm: issue the `pause` instruction; the "memory" clobber
// also makes the statement a compiler-level memory barrier.
314#if defined __x86_64__ || defined __i386__
315inline void cpuRelax() {
316 asm volatile(
"pause" :::
"memory");
// MSVC on x86 / x64: body not visible in this excerpt.
318#elif defined _MSC_VER && (defined _M_AMD64 || defined _M_IX86)
319inline void cpuRelax() {
// 64-bit ARM with GCC-style inline asm: issue the `yield` instruction.
322#elif defined __arm64__ || defined __aarch64__
323inline void cpuRelax() {
324 asm volatile(
"yield" :::
"memory");
// MSVC on ARM / ARM64: body not visible in this excerpt.
326#elif defined _MSC_VER && (defined _M_ARM64 || defined _M_ARM)
327inline void cpuRelax() {
// PowerPC: two spellings of the same `or` no-op form on register 27 (presumably the yield
// hint), one with the `r` register prefix and one without. The condition choosing between them
// is not visible in this excerpt.
330#elif defined __powerpc__ || defined __POWERPC__
332inline void cpuRelax() {
333 asm volatile(
"or r27,r27,r27" :::
"memory");
336inline void cpuRelax() {
337 asm volatile(
"or 27,27,27" :::
"memory");
// Fallback for any other target: a no-op.
342inline void cpuRelax() {}
// Fields of the static chunking result (the enclosing struct header is not visible here).
// As computed by staticChunkSize: transitionTaskIndex chunks of size ceilChunkSize plus the
// remaining chunks of size ceilChunkSize - 1 add up exactly to the item count.
352 ssize_t transitionTaskIndex;
// Chunk size rounded up, i.e. the larger of the two chunk sizes.
353 ssize_t ceilChunkSize;
// Computes how to split `items` across `chunks` chunks whose sizes differ by at most one.
// The declaration of `chunking` and the return statement are not visible in this excerpt.
// NOTE(review): the computation divides by `chunks`; confirm that chunks > 0 is guaranteed or
// checked on the lines not shown.
356inline StaticChunking staticChunkSize(ssize_t items, ssize_t chunks) {
// Ceiling of items / chunks.
359 chunking.ceilChunkSize = (items + chunks - 1) / chunks;
// How many items short of chunks * ceilChunkSize we are; that many chunks get one item fewer.
360 ssize_t numLeft = chunking.ceilChunkSize * chunks - items;
361 chunking.transitionTaskIndex = chunks - numLeft;
// Variant of staticChunkSize in which chunk sizes are multiples of `granularity`. `items` is
// first expressed in units of `granularity`, those units are split across `chunks` as in
// staticChunkSize, and the ceiling chunk size is scaled back to items.
// The declaration of `chunking` and the return statement are not visible in this excerpt.
371inline StaticChunking staticChunkSizeGranular(ssize_t items, ssize_t chunks, uint32_t granularity) {
373 assert(granularity >= 1);
// Granularity 1 is the plain case. Using <= also routes granularity 0 here when asserts are
// compiled out, so the division by granularity below is never reached with zero.
374 if (granularity <= 1) {
375 return staticChunkSize(items, chunks);
// Precondition (debug builds only): items must be an exact multiple of granularity.
377 assert(items %
static_cast<ssize_t
>(granularity) == 0);
// Number of granularity-sized units.
380 ssize_t gUnits = items /
static_cast<ssize_t
>(granularity);
// Ceiling of gUnits / chunks.
381 ssize_t ceilG = (gUnits + chunks - 1) / chunks;
382 ssize_t numLeft = ceilG * chunks - gUnits;
// Convert the ceiling chunk size from units back to items.
383 chunking.ceilChunkSize = ceilG *
static_cast<ssize_t
>(granularity);
// NOTE(review): this index is computed in units of granularity, so chunks past it are
// presumably smaller by `granularity` items rather than by one -- confirm against callers.
384 chunking.transitionTaskIndex = chunks - numLeft;
detail::AlignedAtomic< T > AlignedAtomic
Cache-line aligned atomic pointer.
detail::AlignedBuffer< T > AlignedBuffer
Buffer with proper alignment for type T.
detail::StaticChunking StaticChunking
Information for statically chunking a range across threads.