dispenso 1.6.0
A library for task parallelism
Loading...
Searching...
No Matches
platform.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
14#pragma once
15#include <algorithm>
16#include <atomic>
17#include <cassert>
18#include <cstdlib>
19#include <memory>
20#include <thread>
21#include <type_traits>
22
23#if defined(_MSC_VER) && \
24 (defined(_M_AMD64) || defined(_M_IX86) || defined(_M_ARM64) || defined(_M_ARM))
25#include <intrin.h>
26#endif
27
28namespace dispenso {
29
30#define DISPENSO_MAJOR_VERSION 1
31#define DISPENSO_MINOR_VERSION 5
32#define DISPENSO_PATCH_VERSION 1
33
34// C++20 concepts support detection
35#if __cplusplus >= 202002L && defined(__cpp_concepts) && __cpp_concepts >= 201907L
36#define DISPENSO_HAS_CONCEPTS 1
37#include <concepts>
38#else
39#define DISPENSO_HAS_CONCEPTS 0
40#endif
41
// DISPENSO_REQUIRES(expr...) attaches a requires-clause to a template when
// concepts are available (DISPENSO_HAS_CONCEPTS != 0) and vanishes otherwise.
#if !DISPENSO_HAS_CONCEPTS
#define DISPENSO_REQUIRES(...)
#else
#define DISPENSO_REQUIRES(...) requires(__VA_ARGS__)
#endif
61
// DISPENSO_DEPRECATED(msg) expands to [[deprecated(msg)]] when compiling as
// C++17 or newer, and to nothing for older language modes.
#if __cplusplus < 201703L
#define DISPENSO_DEPRECATED(msg)
#else
#define DISPENSO_DEPRECATED(msg) [[deprecated(msg)]]
#endif
78
// DISPENSO_DLL_ACCESS marks symbols that cross the shared-library boundary.
// It only expands to something when DISPENSO_SHARED_LIB is defined:
//   * Windows: dllexport when DISPENSO_LIB_EXPORT is defined, dllimport otherwise.
//   * clang/GCC elsewhere: default symbol visibility.
// In every other configuration it expands to nothing.
#if defined(DISPENSO_SHARED_LIB) && defined(_WIN32) && defined(DISPENSO_LIB_EXPORT)
#define DISPENSO_DLL_ACCESS __declspec(dllexport)
#elif defined(DISPENSO_SHARED_LIB) && defined(_WIN32)
#define DISPENSO_DLL_ACCESS __declspec(dllimport)
#elif defined(DISPENSO_SHARED_LIB) && (defined(__clang__) || defined(__GNUC__))
#define DISPENSO_DLL_ACCESS __attribute__((visibility("default")))
#endif

#ifndef DISPENSO_DLL_ACCESS
#define DISPENSO_DLL_ACCESS
#endif // DISPENSO_DLL_ACCESS
96
// Opts a single function out of Clang's thread-safety analysis.
// Only Clang receives the attribute; for every other compiler (MSVC, GCC, ...)
// the macro expands to nothing.
#ifndef __clang__
#define DISPENSO_NO_THREAD_SAFETY_ANALYSIS
#else
#define DISPENSO_NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis))
#endif
105
// Signed integer type with the same width as std::size_t.
using ssize_t = std::make_signed_t<std::size_t>;
107
// DISPENSO_INLINE requests forced inlining using whatever spelling the
// current compiler understands, falling back to plain `inline`.
// Under CUDA it also marks the function as callable from host and device.
#ifdef __CUDACC__
#define DISPENSO_INLINE __host__ __device__ __forceinline__
#elif defined(__GNUC__) || defined(__clang__)
#define DISPENSO_INLINE __attribute__((always_inline)) inline
#elif defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define DISPENSO_INLINE __forceinline
#else
#define DISPENSO_INLINE inline
#endif // PLATFORM
117
// Number of bytes (and the alignment) used to keep independent data apart so
// that it does not suffer from false sharing: 128 on Apple arm64, 64 elsewhere.
constexpr size_t kCacheLineSize =
#if defined(__APPLE__) && defined(__arm64__)
    128;
#else
    64;
#endif

// Place in front of a declaration to align it to kCacheLineSize.
#define DISPENSO_CACHELINE_ALIGNED alignas(kCacheLineSize)
141
// DISPENSO_THREAD_LOCAL selects the compiler-specific thread-local storage
// specifier. MSVC is tested first, then GCC/clang; any other compiler is a
// hard error until a definition is supplied for it.
// TODO(bbudge): Non-gcc/clang/msvc platforms.
#ifdef _MSC_VER
#define DISPENSO_THREAD_LOCAL __declspec(thread)
#elif defined(__clang__) || defined(__GNUC__)
#define DISPENSO_THREAD_LOCAL __thread
#else
#error Supply lightweight thread-locals for this compiler. Can define to thread_local if lightweight not available
#endif
155
// DISPENSO_EXPECT(a, b) evaluates to `a` while hinting to the optimizer that
// `a` is expected to equal `b`. Compilers without __builtin_expect just get
// `a`, parenthesized so that the expansion keeps the same precedence as the
// builtin form (e.g. `!DISPENSO_EXPECT(x || y, 0)` must negate the whole
// expression, not only `x`).
#if (defined(__GNUC__) || defined(__clang__))
#define DISPENSO_EXPECT(a, b) __builtin_expect(a, b)
#else
#define DISPENSO_EXPECT(a, b) (a)
#endif
161
// clang-format off
// Portable wrappers around the compiler-specific pragmas that save, restore
// and silence diagnostics:
//   DISPENSO_DISABLE_WARNING_PUSH / _POP   save / restore the diagnostic state
//   DISPENSO_DISABLE_WARNING(w)            silence warning `w` (a -W flag for
//                                          GCC/clang, a number for MSVC)
//   DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS,
//   DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
//                                          clang-only warnings; empty elsewhere
// Every macro is defined on every path, so code using them also compiles on
// compilers that are not recognized here (where they all expand to nothing).
#if (defined(__GNUC__) || defined(__clang__))
#define DO_PRAGMA(X) _Pragma(#X)
#define DISPENSO_DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push)
#define DISPENSO_DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop)
#define DISPENSO_DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored #warningName)
#if !defined(__clang__)
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS \
  DISPENSO_DISABLE_WARNING(-Wgnu-zero-variadic-macro-arguments)
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS \
  DISPENSO_DISABLE_WARNING(-Wglobal-constructors)
#endif
#elif defined(_MSC_VER)
#define DISPENSO_DISABLE_WARNING_PUSH __pragma(warning(push))
#define DISPENSO_DISABLE_WARNING_POP __pragma(warning(pop))
#define DISPENSO_DISABLE_WARNING(warningNumber) __pragma(warning(disable : warningNumber))
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_PUSH
#define DISPENSO_DISABLE_WARNING_POP
#define DISPENSO_DISABLE_WARNING(warningName)
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#endif
// clang-format on
190
198template <typename T>
200 public:
201 CacheAligned() = default;
203 CacheAligned(T t) : t_(t) {}
204 operator T&() {
205 return t_;
206 }
207
208 operator const T&() const {
209 return t_;
210 }
211
212 private:
213 alignas(kCacheLineSize) T t_;
214};
215
216namespace detail {
217
// Uninitialized byte storage with the size and alignment of T.
template <typename T>
struct AlignedBuffer {
  alignas(T) char b[sizeof(T)];
};
222
223template <typename T>
224struct alignas(kCacheLineSize) AlignedAtomic : public std::atomic<T*> {};
225
// Allocates `bytes` bytes whose address is a multiple of `alignment`.
//
// `alignment` must be a power of two; it is raised to at least
// sizeof(uintptr_t) so that the bookkeeping slot below always fits.
// The block is obtained from ::malloc with `alignment` extra bytes; the address
// malloc returned is stored in the uintptr_t immediately before the returned
// pointer, which is where alignedFree() looks for it.
//
// Returns nullptr if the request overflows size_t or if ::malloc fails.
// Release the result with alignedFree(), never with ::free().
inline void* alignedMalloc(size_t bytes, size_t alignment) {
  alignment = std::max(alignment, sizeof(uintptr_t));
  assert((alignment & (alignment - 1)) == 0 && "alignment must be a power of two");

  // bytes + alignment must not wrap around, or we would under-allocate.
  if (bytes > static_cast<size_t>(-1) - alignment) {
    return nullptr;
  }

  char* ptr = static_cast<char*>(::malloc(bytes + alignment));
  if (ptr == nullptr) {
    return nullptr;
  }

  const uintptr_t oldBase = reinterpret_cast<uintptr_t>(ptr);
  const uintptr_t mask = alignment - 1;
  // Always advance by at least one byte and round down, so that there is room
  // for the recovery slot in front of the aligned address.
  const uintptr_t base = (oldBase + alignment) & ~mask;

  uintptr_t* recovery = reinterpret_cast<uintptr_t*>(base - sizeof(uintptr_t));
  *recovery = oldBase;
  return reinterpret_cast<void*>(base);
}
239
240inline void* alignedMalloc(size_t bytes) {
241 return alignedMalloc(bytes, kCacheLineSize);
242}
243
// Releases a block obtained from alignedMalloc(). Passing nullptr is a no-op.
// The address to hand to ::free is read from the uintptr_t stored directly in
// front of `ptr`.
inline void alignedFree(void* ptr) {
  if (ptr != nullptr) {
    const uintptr_t* slot = static_cast<const uintptr_t*>(ptr) - 1;
    ::free(reinterpret_cast<void*>(*slot));
  }
}
252
253template <typename T>
254struct AlignedFreeDeleter {
255 void operator()(T* ptr) {
256 ptr->~T();
257 detail::alignedFree(ptr);
258 }
259};
260template <>
261struct AlignedFreeDeleter<void> {
262 void operator()(void* ptr) {
263 detail::alignedFree(ptr);
264 }
265};
266
267// Array deleter for aligned allocations. Destructor loop is elided by
268// the compiler for trivially destructible types.
269template <typename T>
270struct AlignedArrayFreeDeleter {
271 size_t count;
272 void operator()(T* ptr) {
273 for (size_t i = 0; i < count; ++i) {
274 ptr[i].~T();
275 }
276 detail::alignedFree(ptr);
277 }
278};
279
280// Allocate a value-initialized array of T with alignof(T) alignment.
281// Constructor/destructor loops are elided for trivial types.
282template <typename T>
283std::unique_ptr<T[], AlignedArrayFreeDeleter<T>> makeAlignedArray(size_t n) {
284 void* raw = detail::alignedMalloc(sizeof(T) * n, alignof(T));
285 T* arr = static_cast<T*>(raw);
286 for (size_t i = 0; i < n; ++i) {
287 new (&arr[i]) T();
288 }
289 return std::unique_ptr<T[], AlignedArrayFreeDeleter<T>>(arr, AlignedArrayFreeDeleter<T>{n});
290}
291
292// Allocate a single object of T with alignof(T) alignment.
293template <typename T, class... Args>
294std::unique_ptr<T, AlignedFreeDeleter<T>> makeAligned(Args&&... args) {
295 void* raw = detail::alignedMalloc(sizeof(T), alignof(T));
296 T* obj = new (raw) T(std::forward<Args>(args)...);
297 return std::unique_ptr<T, AlignedFreeDeleter<T>>(obj);
298}
299
300template <typename T, class... Args>
301std::shared_ptr<T> make_shared(Args&&... args) {
302 void* tv = alignedMalloc(sizeof(T), alignof(T));
303 T* t = new (tv) T(std::forward<Args>(args)...);
304 return std::shared_ptr<T>(t, AlignedFreeDeleter<T>());
305}
306
307inline constexpr uintptr_t alignToCacheLine(uintptr_t val) {
308 constexpr uintptr_t kMask = kCacheLineSize - 1;
309 val += kMask;
310 val &= ~kMask;
311 return val;
312}
313
// Issues the architecture's spin-wait hint instruction, intended for use
// inside busy-wait loops. The instruction is chosen at compile time; on
// architectures not listed here the function does nothing.
inline void cpuRelax() {
#if defined(__x86_64__) || defined(__i386__)
  asm volatile("pause" ::: "memory");
#elif defined(_MSC_VER) && (defined(_M_AMD64) || defined(_M_IX86))
  _mm_pause();
#elif defined(__arm64__) || defined(__aarch64__)
  asm volatile("yield" ::: "memory");
#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM))
  __yield();
#elif (defined(__powerpc__) || defined(__POWERPC__)) && defined(__APPLE__)
  // Apple's assembler wants register names spelled with the `r` prefix.
  asm volatile("or r27,r27,r27" ::: "memory");
#elif defined(__powerpc__) || defined(__POWERPC__)
  asm volatile("or 27,27,27" ::: "memory");
#else
  // TODO: provide reasonable relax on other archs.
#endif // ARCH
}
344
// Describes how to split a range into a fixed number of chunks.
//
// A single chunk size plus a remainder rarely balances well: rounding the size
// up can leave threads idle, while rounding it down can make the remainder
// several times larger than a regular chunk. Instead two sizes are used:
//   * chunks with index <  transitionTaskIndex get ceilChunkSize items;
//   * chunks with index >= transitionTaskIndex get one step less
//     (ceilChunkSize - 1 for the plain, non-granular computation).
struct StaticChunking {
  // Index of the first chunk that uses the smaller size.
  ssize_t transitionTaskIndex;
  // Size of the larger chunks: the per-chunk item count rounded up.
  ssize_t ceilChunkSize;
};
355
356inline StaticChunking staticChunkSize(ssize_t items, ssize_t chunks) {
357 assert(chunks > 0);
358 StaticChunking chunking;
359 chunking.ceilChunkSize = (items + chunks - 1) / chunks;
360 ssize_t numLeft = chunking.ceilChunkSize * chunks - items;
361 chunking.transitionTaskIndex = chunks - numLeft;
362 return chunking;
363}
364
365// Granularity-aware variant: ceilChunkSize is rounded UP to a multiple of
366// `granularity`, so each "ceil" chunk is granularity-aligned. The "floor"
367// chunks (those at index >= transitionTaskIndex) are ceilChunkSize - granularity,
368// also granularity-aligned. Caller must have already trimmed `items` to a
369// multiple of `granularity` so that all chunks (not just intermediate ones)
370// are granularity-multiples.
371inline StaticChunking staticChunkSizeGranular(ssize_t items, ssize_t chunks, uint32_t granularity) {
372 assert(chunks > 0);
373 assert(granularity >= 1);
374 if (granularity <= 1) {
375 return staticChunkSize(items, chunks);
376 }
377 assert(items % static_cast<ssize_t>(granularity) == 0);
378 StaticChunking chunking;
379 // Items measured in "granularity units".
380 ssize_t gUnits = items / static_cast<ssize_t>(granularity);
381 ssize_t ceilG = (gUnits + chunks - 1) / chunks;
382 ssize_t numLeft = ceilG * chunks - gUnits;
383 chunking.ceilChunkSize = ceilG * static_cast<ssize_t>(granularity);
384 chunking.transitionTaskIndex = chunks - numLeft;
385 return chunking;
386}
387
388} // namespace detail
389} // namespace dispenso
constexpr size_t kCacheLineSize
A constant that defines a safe number of bytes+alignment to avoid false sharing.
Definition platform.h:125
detail::AlignedAtomic< T > AlignedAtomic
Cache-line aligned atomic pointer.
Definition util.h:230
detail::AlignedBuffer< T > AlignedBuffer
Buffer with proper alignment for type T.
Definition util.h:213
detail::StaticChunking StaticChunking
Information for statically chunking a range across threads.
Definition util.h:264