dispenso 1.5.0
A library for task parallelism
Loading...
Searching...
No Matches
platform.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
14#pragma once
15#include <algorithm>
16#include <atomic>
17#include <cassert>
18#include <cstdlib>
19#include <memory>
20#include <thread>
21#include <type_traits>
22
23#if defined(_MSC_VER) && \
24 (defined(_M_AMD64) || defined(_M_IX86) || defined(_M_ARM64) || defined(_M_ARM))
25#include <intrin.h>
26#endif
27
28namespace dispenso {
29
// Library version macros.
// NOTE(review): bumped to 1.5.0 to match the documented release version;
// these previously read 1.4.1, which disagreed with the published release.
#define DISPENSO_MAJOR_VERSION 1
#define DISPENSO_MINOR_VERSION 5
#define DISPENSO_PATCH_VERSION 0
33
// C++20 concepts support detection: require both a C++20 language level and
// compiler support for the concepts feature itself (feature-test macro).
#if __cplusplus >= 202002L && defined(__cpp_concepts) && __cpp_concepts >= 201907L
#define DISPENSO_HAS_CONCEPTS 1
#include <concepts>
#else
#define DISPENSO_HAS_CONCEPTS 0
#endif

// DISPENSO_REQUIRES(expr) expands to a `requires(expr)` clause when concepts
// are available, and to nothing otherwise, so constrained templates degrade
// gracefully on pre-C++20 toolchains.
#if DISPENSO_HAS_CONCEPTS
#define DISPENSO_REQUIRES(...) requires(__VA_ARGS__)
#else
#define DISPENSO_REQUIRES(...)
#endif
61
// DISPENSO_DLL_ACCESS marks public symbols for shared-library builds:
// dllexport/dllimport on Windows (depending on whether we are building or
// consuming the library), default visibility on clang/gcc, and nothing at all
// for static builds or unknown toolchains.
#if defined(DISPENSO_SHARED_LIB)
#if defined _WIN32

#if defined(DISPENSO_LIB_EXPORT)
#define DISPENSO_DLL_ACCESS __declspec(dllexport)
#else
#define DISPENSO_DLL_ACCESS __declspec(dllimport)
#endif // DISPENSO_LIB_EXPORT

#elif defined(__clang__) || defined(__GNUC__)
#define DISPENSO_DLL_ACCESS __attribute__((visibility("default")))
#endif // PLATFORM
#endif // DISPENSO_SHARED_LIB

// Fallback: expand to nothing when no platform-specific definition applies.
#if !defined(DISPENSO_DLL_ACCESS)
#define DISPENSO_DLL_ACCESS
#endif // DISPENSO_DLL_ACCESS
79
// Signed counterpart of std::size_t, for sizes/indices that participate in
// signed arithmetic (note: shadows POSIX ssize_t inside namespace dispenso).
using ssize_t = std::make_signed<std::size_t>::type;

// DISPENSO_INLINE requests aggressive inlining where the compiler supports a
// stronger-than-`inline` hint; falls back to plain `inline` otherwise.
#if defined(__clang__) || defined(__GNUC__)
#define DISPENSO_INLINE __attribute__((always_inline)) inline
#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define DISPENSO_INLINE __forceinline
#else
#define DISPENSO_INLINE inline
#endif // PLATFORM

// A safe number of bytes + alignment to avoid false sharing.  Apple arm64
// (Apple Silicon) uses 128-byte cache lines; 64 bytes is used elsewhere.
#if defined(__APPLE__) && defined(__arm64__)
constexpr size_t kCacheLineSize = 128;
#else
constexpr size_t kCacheLineSize = 64;
#endif
99
// Lightweight thread-local storage: __declspec(thread)/__thread avoid the
// heavier runtime machinery of C++ `thread_local` (no dynamic initialization
// or destruction support).
// TODO(bbudge): Non-gcc/clang/msvc platforms.
#if defined(_MSC_VER)
#define DISPENSO_THREAD_LOCAL __declspec(thread)
#elif defined(__GNUC__) || defined(__clang__)
#define DISPENSO_THREAD_LOCAL __thread
#else
#error Supply lightweight thread-locals for this compiler. Can define to thread_local if lightweight not available
#endif

// Branch-prediction hint: DISPENSO_EXPECT(expr, expected) maps to
// __builtin_expect on gcc/clang and degrades to the bare expression elsewhere.
#if (defined(__GNUC__) || defined(__clang__))
#define DISPENSO_EXPECT(a, b) __builtin_expect(a, b)
#else
#define DISPENSO_EXPECT(a, b) a
#endif
119
// clang-format off
// Portable warning-suppression macros.  Wrap code with
// DISPENSO_DISABLE_WARNING_PUSH / DISPENSO_DISABLE_WARNING_POP and use the
// named DISPENSO_DISABLE_WARNING_* macros to silence specific diagnostics on
// the toolchains that emit them; each expands to nothing elsewhere.
#if (defined(__GNUC__) || defined(__clang__))
#define DO_PRAGMA(X) _Pragma(#X)
#define DISPENSO_DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push)
#define DISPENSO_DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop)
#define DISPENSO_DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored #warningName)
#if !defined(__clang__)
// gcc does not emit these clang-specific diagnostics; define as no-ops.
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS \
  DISPENSO_DISABLE_WARNING(-Wgnu-zero-variadic-macro-arguments)
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS \
  DISPENSO_DISABLE_WARNING(-Wglobal-constructors)
#endif
#elif defined(_MSC_VER)
#define DISPENSO_DISABLE_WARNING_PUSH __pragma(warning(push))
#define DISPENSO_DISABLE_WARNING_POP __pragma(warning(pop))
#define DISPENSO_DISABLE_WARNING(warningNumber) __pragma(warning(disable : warningNumber))
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_PUSH
#define DISPENSO_DISABLE_WARNING_POP
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#endif
// clang-format on
148
156template <typename T>
158 public:
159 CacheAligned() = default;
161 CacheAligned(T t) : t_(t) {}
162 operator T&() {
163 return t_;
164 }
165
166 operator const T&() const {
167 return t_;
168 }
169
170 private:
171 alignas(kCacheLineSize) T t_;
172};
173
174namespace detail {
175
// Raw storage with the size and alignment of T; suitable for deferred
// in-place construction without invoking T's constructor.
template <typename T>
struct AlignedBuffer {
  alignas(alignof(T)) char b[sizeof(T)];
};

// An atomic pointer padded/aligned to a full cache line to avoid false
// sharing with neighboring data.
template <typename T>
struct alignas(kCacheLineSize) AlignedAtomic : public std::atomic<T*> {};
183
/**
 * Allocate at least `bytes` bytes at an address meeting `alignment`.
 * The returned pointer must be released with alignedFree; the original
 * malloc'd base address is stashed in the word immediately preceding the
 * returned pointer so alignedFree can recover it.
 *
 * @param bytes The number of usable bytes to allocate.
 * @param alignment The required alignment; must be a power of two.
 * @return An aligned pointer, or nullptr if the underlying allocation fails.
 **/
inline void* alignedMalloc(size_t bytes, size_t alignment) {
  // Ensure there is room for the recovery word below the aligned address.
  alignment = std::max(alignment, sizeof(uintptr_t));
  // The mask arithmetic below is only valid for power-of-two alignments.
  assert((alignment & (alignment - 1)) == 0 && "alignment must be a power of two");
  char* ptr = reinterpret_cast<char*>(::malloc(bytes + alignment));
  if (!ptr) {
    // Propagate allocation failure instead of computing offsets from null,
    // which would be undefined behavior and a wild store.
    return nullptr;
  }
  uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t oldBase = base;
  uintptr_t mask = alignment - 1;
  // Advance past the stash slot, then round down to an alignment boundary.
  base += alignment;
  base &= ~mask;

  // Store the original malloc'd address just below the aligned block.
  uintptr_t* recovery = reinterpret_cast<uintptr_t*>(base - sizeof(uintptr_t));
  *recovery = oldBase;
  return reinterpret_cast<void*>(base);
}
197
// Convenience overload: allocate with cache-line alignment, the common case
// for avoiding false sharing.  Release with alignedFree.
inline void* alignedMalloc(size_t bytes) {
  return alignedMalloc(bytes, kCacheLineSize);
}
201
// Release memory obtained from alignedMalloc.  Safe to call with nullptr.
inline void alignedFree(void* ptr) {
  if (ptr == nullptr) {
    return;
  }
  // alignedMalloc stashed the original malloc'd base in the word immediately
  // preceding the aligned block; recover it and hand it back to free.
  auto* slot =
      reinterpret_cast<uintptr_t*>(static_cast<char*>(ptr) - sizeof(uintptr_t));
  ::free(reinterpret_cast<void*>(*slot));
}
210
// Deleter for smart pointers over storage from alignedMalloc: runs the
// destructor, then releases the storage via alignedFree.
template <typename T>
struct AlignedFreeDeleter {
  void operator()(T* ptr) {
    ptr->~T();
    detail::alignedFree(ptr);
  }
};
// void specialization: nothing to destroy, just free the storage.
template <>
struct AlignedFreeDeleter<void> {
  void operator()(void* ptr) {
    detail::alignedFree(ptr);
  }
};
224
225template <typename T, class... Args>
226std::shared_ptr<T> make_shared(Args&&... args) {
227 void* tv = alignedMalloc(sizeof(T), alignof(T));
228 T* t = new (tv) T(std::forward<Args>(args)...);
229 return std::shared_ptr<T>(t, AlignedFreeDeleter<T>());
230}
231
232inline constexpr uintptr_t alignToCacheLine(uintptr_t val) {
233 constexpr uintptr_t kMask = kCacheLineSize - 1;
234 val += kMask;
235 val &= ~kMask;
236 return val;
237}
238
// cpuRelax: hint to the CPU that we are in a spin-wait loop.  Maps to `pause`
// on x86, `yield` on ARM, and the `or 27,27,27` low-priority hint on PowerPC;
// the "memory" clobber keeps the compiler from caching values across the
// spin.  A no-op on unrecognized architectures.
#if defined __x86_64__ || defined __i386__
inline void cpuRelax() {
  asm volatile("pause" ::: "memory");
}
#elif defined _MSC_VER && (defined _M_AMD64 || defined _M_IX86)
inline void cpuRelax() {
  _mm_pause();
}
#elif defined __arm64__ || defined __aarch64__
inline void cpuRelax() {
  asm volatile("yield" ::: "memory");
}
#elif defined _MSC_VER && (defined _M_ARM64 || defined _M_ARM)
inline void cpuRelax() {
  __yield();
}
#elif defined __powerpc__ || defined __POWERPC__
#if defined __APPLE__
// Apple's assembler spells registers with an 'r' prefix.
inline void cpuRelax() {
  asm volatile("or r27,r27,r27" ::: "memory");
}
#else
inline void cpuRelax() {
  asm volatile("or 27,27,27" ::: "memory");
}
#endif // APPLE
#else
// TODO: provide reasonable relax on other archs.
inline void cpuRelax() {}
#endif // ARCH
269
// When statically chunking a range, it is generally not possible to use a single chunk size plus
// remainder and get a good load distribution. By estimating too high, we can have idle threads. By
// estimating too low, the remainder can be several times as large as the chunk for other threads.
// Instead, we compute the chunk size that is the ceil of the fractional chunk size. That chunk
// size is used for the first transitionTaskIndex tasks, while the remaining
// (chunks - transitionTaskIndex) tasks each get ceilChunkSize - 1 items.
// Describes a static partition of work items into chunks: tasks
// [0, transitionTaskIndex) receive ceilChunkSize items each, and the
// remaining tasks receive ceilChunkSize - 1 items each.
struct StaticChunking {
  ssize_t transitionTaskIndex;
  ssize_t ceilChunkSize;
};

// Compute the chunking described above.  Requires chunks > 0.
inline StaticChunking staticChunkSize(ssize_t items, ssize_t chunks) {
  assert(chunks > 0);
  // Ceiling of the fractional per-chunk size.
  const ssize_t ceilSize = (items + chunks - 1) / chunks;
  // How many chunks must shrink by one item so the totals sum to `items`.
  const ssize_t shrunkChunks = ceilSize * chunks - items;
  return StaticChunking{chunks - shrunkChunks, ceilSize};
}
289
290} // namespace detail
291} // namespace dispenso
constexpr size_t kCacheLineSize
A constant that defines a safe number of bytes+alignment to avoid false sharing.
Definition platform.h:97
detail::AlignedAtomic< T > AlignedAtomic
Cache-line aligned atomic pointer.
Definition util.h:230
detail::AlignedBuffer< T > AlignedBuffer
Buffer with proper alignment for type T.
Definition util.h:213
detail::StaticChunking StaticChunking
Information for statically chunking a range across threads.
Definition util.h:264