dispenso 1.4.1
A library for task parallelism
Loading...
Searching...
No Matches
platform.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
14#pragma once
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <new>
#include <thread>
#include <type_traits>
#include <utility>

// <concepts> has to be included at global scope (never inside namespace dispenso), so it is pulled
// in here using the same condition that later defines DISPENSO_HAS_CONCEPTS.
#if __cplusplus >= 202002L && defined(__cpp_concepts) && __cpp_concepts >= 201907L
#include <concepts>
#endif
22
23namespace dispenso {
24
// Library version, split into its major / minor / patch components (currently 1.4.1).
#define DISPENSO_MAJOR_VERSION 1
#define DISPENSO_MINOR_VERSION 4
#define DISPENSO_PATCH_VERSION 1
28
// C++20 concepts support detection.
//
// DISPENSO_HAS_CONCEPTS is defined to 1 when the translation unit is compiled as C++20 or later
// and the compiler advertises concepts support via __cpp_concepts, and to 0 otherwise.
//
// NOTE: <concepts> is intentionally not included at this point. We are already inside
// namespace dispenso here, and including a standard header inside a namespace would declare its
// contents in the wrong scope. The header is included with the other standard headers at the top
// of this file instead.
#if __cplusplus >= 202002L && defined(__cpp_concepts) && __cpp_concepts >= 201907L
#define DISPENSO_HAS_CONCEPTS 1
#else
#define DISPENSO_HAS_CONCEPTS 0
#endif
36
// DISPENSO_REQUIRES(expr) expands to a C++20 requires-clause when DISPENSO_HAS_CONCEPTS is
// non-zero, and to nothing otherwise, so constrained declarations still compile pre-C++20.
#if !DISPENSO_HAS_CONCEPTS
#define DISPENSO_REQUIRES(...)
#else
#define DISPENSO_REQUIRES(...) requires(__VA_ARGS__)
#endif
56
// DISPENSO_DLL_ACCESS decorates symbols that must be visible across a shared-library boundary.
//  - Shared build on Windows: dllexport when DISPENSO_LIB_EXPORT is defined, dllimport otherwise.
//  - Shared build with clang/gcc: default symbol visibility.
//  - Anything else: expands to nothing (unless the macro was already defined).
#if defined(DISPENSO_SHARED_LIB) && defined(_WIN32)

#if defined(DISPENSO_LIB_EXPORT)
#define DISPENSO_DLL_ACCESS __declspec(dllexport)
#else
#define DISPENSO_DLL_ACCESS __declspec(dllimport)
#endif // DISPENSO_LIB_EXPORT

#elif defined(DISPENSO_SHARED_LIB) && (defined(__clang__) || defined(__GNUC__))
#define DISPENSO_DLL_ACCESS __attribute__((visibility("default")))
#endif // DISPENSO_SHARED_LIB / PLATFORM

#ifndef DISPENSO_DLL_ACCESS
#define DISPENSO_DLL_ACCESS
#endif // DISPENSO_DLL_ACCESS
74
// Signed integer type with the same width as std::size_t.
using ssize_t = std::make_signed_t<std::size_t>;
76
// DISPENSO_INLINE requests forced inlining using the compiler-specific spelling, and falls back
// to plain `inline` on compilers that are not recognized.
#if defined(__GNUC__) || defined(__clang__)
#define DISPENSO_INLINE __attribute__((always_inline)) inline
#elif defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define DISPENSO_INLINE __forceinline
#else
#define DISPENSO_INLINE inline
#endif // PLATFORM
84
// A constant that defines a safe number of bytes+alignment to avoid false sharing.
constexpr size_t kCacheLineSize{64};
90
// DISPENSO_THREAD_LOCAL selects the compiler-specific thread-local storage qualifier.
// The MSVC check comes first, so any compiler defining _MSC_VER uses __declspec(thread).
// TODO(bbudge): Non-gcc/clang/msvc platforms.
#ifdef _MSC_VER
#define DISPENSO_THREAD_LOCAL __declspec(thread)
#elif defined(__clang__) || defined(__GNUC__)
#define DISPENSO_THREAD_LOCAL __thread
#else
#error Supply lightweight thread-locals for this compiler. Can define to thread_local if lightweight not available
#endif
104
// DISPENSO_EXPECT(a, b) evaluates to `a` while hinting to the compiler that `a` is expected to
// equal `b`. On compilers without __builtin_expect the hint is dropped and only `a` is evaluated.
// The fallback is parenthesized so that an argument such as `x || y` cannot re-associate with
// operators surrounding the macro invocation.
#if (defined(__GNUC__) || defined(__clang__))
#define DISPENSO_EXPECT(a, b) __builtin_expect(a, b)
#else
#define DISPENSO_EXPECT(a, b) (a)
#endif
110
// Portable helpers for locally silencing compiler warnings:
//   DISPENSO_DISABLE_WARNING_PUSH / DISPENSO_DISABLE_WARNING_POP  save / restore diagnostic state.
//   DISPENSO_DISABLE_WARNING(x)   disables warning `x` (a -W flag for gcc/clang, a number for MSVC).
//   DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS,
//   DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS   named shortcuts; only clang gives them an effect.
// Every macro is defined on every branch (expanding to nothing where unsupported), so code that
// uses them compiles regardless of the compiler.
// clang-format off
#if (defined(__GNUC__) || defined(__clang__))
#define DO_PRAGMA(X) _Pragma(#X)
#define DISPENSO_DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push)
#define DISPENSO_DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop)
#define DISPENSO_DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored #warningName)
#if !defined(__clang__)
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS \
  DISPENSO_DISABLE_WARNING(-Wgnu-zero-variadic-macro-arguments)
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS \
  DISPENSO_DISABLE_WARNING(-Wglobal-constructors)
#endif
#elif defined(_MSC_VER)
#define DISPENSO_DISABLE_WARNING_PUSH __pragma(warning(push))
#define DISPENSO_DISABLE_WARNING_POP __pragma(warning(pop))
#define DISPENSO_DISABLE_WARNING(warningNumber) __pragma(warning(disable : warningNumber))
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_PUSH
#define DISPENSO_DISABLE_WARNING_POP
// Previously missing on this branch: without it, any use of DISPENSO_DISABLE_WARNING(...) would
// fail to compile on compilers that are neither gcc/clang nor MSVC.
#define DISPENSO_DISABLE_WARNING(warningName)
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#endif
// clang-format on
139
147template <typename T>
149 public:
150 CacheAligned() = default;
152 CacheAligned(T t) : t_(t) {}
153 operator T&() {
154 return t_;
155 }
156
157 operator const T&() const {
158 return t_;
159 }
160
161 private:
162 alignas(kCacheLineSize) T t_;
163};
164
165namespace detail {
166
// Raw, uninitialized storage with the size and alignment of a T.
template <typename T>
struct AlignedBuffer {
  alignas(T) char b[sizeof(T)];
};
171
172template <typename T>
173struct alignas(kCacheLineSize) AlignedAtomic : public std::atomic<T*> {};
174
/**
 * Allocate at least `bytes` bytes whose address is a multiple of `alignment`.
 *
 * The request is over-allocated by `alignment` bytes; the original malloc pointer is stashed in
 * the uintptr_t-sized slot immediately before the returned address so that alignedFree can
 * recover it.
 *
 * @param bytes Number of usable bytes requested.
 * @param alignment Required alignment; must be a power of two. Values smaller than
 *     sizeof(uintptr_t) are raised to sizeof(uintptr_t) so the recovery slot always fits.
 * @return Aligned pointer to release with alignedFree, or nullptr if the size computation
 *     overflows or the underlying malloc fails.
 */
inline void* alignedMalloc(size_t bytes, size_t alignment) {
  alignment = std::max(alignment, sizeof(uintptr_t));
  // The mask arithmetic below is only valid for power-of-two alignments; anything else could place
  // the recovery slot before the start of the allocation.
  assert((alignment & (alignment - 1)) == 0 && "alignment must be a power of two");

  const size_t total = bytes + alignment;
  if (total < bytes) {
    // Unsigned wrap-around: the request cannot be satisfied.
    return nullptr;
  }

  char* ptr = static_cast<char*>(::malloc(total));
  if (!ptr) {
    return nullptr;
  }

  const uintptr_t oldBase = reinterpret_cast<uintptr_t>(ptr);
  const uintptr_t mask = alignment - 1;
  // Step past the start by a full alignment, then round down: the result is aligned and leaves
  // room for the recovery slot in front of it.
  const uintptr_t base = (oldBase + alignment) & ~mask;

  uintptr_t* recovery = reinterpret_cast<uintptr_t*>(base - sizeof(uintptr_t));
  *recovery = oldBase;
  return reinterpret_cast<void*>(base);
}
188
189inline void* alignedMalloc(size_t bytes) {
190 return alignedMalloc(bytes, kCacheLineSize);
191}
192
// Release memory whose original allocation address is stored in the uintptr_t-sized slot
// immediately preceding `ptr`. A null `ptr` is ignored.
inline void alignedFree(void* ptr) {
  if (ptr == nullptr) {
    return;
  }
  const char* userBytes = static_cast<const char*>(ptr);
  const uintptr_t* slot = reinterpret_cast<const uintptr_t*>(userBytes - sizeof(uintptr_t));
  ::free(reinterpret_cast<void*>(*slot));
}
201
202template <typename T>
203struct AlignedFreeDeleter {
204 void operator()(T* ptr) {
205 ptr->~T();
206 detail::alignedFree(ptr);
207 }
208};
209template <>
210struct AlignedFreeDeleter<void> {
211 void operator()(void* ptr) {
212 detail::alignedFree(ptr);
213 }
214};
215
216template <typename T, class... Args>
217std::shared_ptr<T> make_shared(Args&&... args) {
218 void* tv = alignedMalloc(sizeof(T), alignof(T));
219 T* t = new (tv) T(std::forward<Args>(args)...);
220 return std::shared_ptr<T>(t, AlignedFreeDeleter<T>());
221}
222
223inline constexpr uintptr_t alignToCacheLine(uintptr_t val) {
224 constexpr uintptr_t kMask = kCacheLineSize - 1;
225 val += kMask;
226 val &= ~kMask;
227 return val;
228}
229
// Emit the architecture's spin-wait hint instruction (x86 `pause`, ARM64 `yield`, PowerPC
// `or 27,27,27`). On architectures without a known hint this is a no-op.
inline void cpuRelax() {
#if defined(__x86_64__) || defined(__i386__)
  asm volatile("pause" ::: "memory");
#elif defined(__arm64__) || defined(__aarch64__)
  asm volatile("yield" ::: "memory");
#elif defined(__powerpc__) || defined(__POWERPC__)
#if defined(__APPLE__)
  // Apple's assembler spells registers with an `r` prefix.
  asm volatile("or r27,r27,r27" ::: "memory");
#else
  asm volatile("or 27,27,27" ::: "memory");
#endif // APPLE
#else
  // TODO: provide reasonable relax on other archs.
#endif // ARCH
}
252
// Static chunking of a range rarely works well with one chunk size plus a remainder: rounding the
// chunk size up can leave threads idle, while rounding it down can leave a remainder several
// times larger than any other chunk. Instead, two chunk sizes are used. The first
// transitionTaskIndex chunks hold ceilChunkSize items each, and the remaining
// (chunks - transitionTaskIndex) chunks hold ceilChunkSize - 1 items each.
struct StaticChunking {
  ssize_t transitionTaskIndex; // Number of leading chunks that use ceilChunkSize.
  ssize_t ceilChunkSize; // ceil(items / chunks).
};

// Compute the two-size split described above for `items` items spread over `chunks` chunks.
// `chunks` must be positive (checked by assert).
inline StaticChunking staticChunkSize(ssize_t items, ssize_t chunks) {
  assert(chunks > 0);
  const ssize_t ceilSize = (items + chunks - 1) / chunks;
  // How many items short we are of filling every chunk to ceilSize; exactly that many chunks
  // receive one item fewer.
  const ssize_t shortfall = ceilSize * chunks - items;
  return StaticChunking{chunks - shortfall, ceilSize};
}
272
273} // namespace detail
274} // namespace dispenso
constexpr size_t kCacheLineSize
A constant that defines a safe number of bytes+alignment to avoid false sharing.
Definition platform.h:89
detail::AlignedAtomic< T > AlignedAtomic
Cache-line aligned atomic pointer.
Definition util.h:230
detail::AlignedBuffer< T > AlignedBuffer
Buffer with proper alignment for type T.
Definition util.h:213
detail::StaticChunking StaticChunking
Information for statically chunking a range across threads.
Definition util.h:264