dispenso 1.4.1
A library for task parallelism
Loading...
Searching...
No Matches
platform.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
14#pragma once
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <new>
#include <thread>
#include <type_traits>
22
23namespace dispenso {
24
// Library version: major.minor.patch (currently 1.4.1).
#define DISPENSO_MAJOR_VERSION 1
#define DISPENSO_MINOR_VERSION 4
#define DISPENSO_PATCH_VERSION 1

// DISPENSO_DLL_ACCESS controls symbol visibility when dispenso is built or
// consumed as a shared library.  On Windows it expands to
// dllexport/dllimport depending on whether we are building the library
// itself (DISPENSO_LIB_EXPORT defined) or linking against it; on
// gcc/clang it marks symbols with default (public) visibility.
#if defined(DISPENSO_SHARED_LIB)
#if defined _WIN32

#if defined(DISPENSO_LIB_EXPORT)
#define DISPENSO_DLL_ACCESS __declspec(dllexport)
#else
#define DISPENSO_DLL_ACCESS __declspec(dllimport)
#endif // DISPENSO_LIB_EXPORT

#elif defined(__clang__) || defined(__GNUC__)
#define DISPENSO_DLL_ACCESS __attribute__((visibility("default")))
#endif // PLATFORM
#endif // DISPENSO_SHARED_LIB

// Static builds (and unrecognized platforms) get an empty access macro.
#if !defined(DISPENSO_DLL_ACCESS)
#define DISPENSO_DLL_ACCESS
#endif // DISPENSO_DLL_ACCESS

// Signed counterpart of std::size_t, defined portably (MSVC has no
// ssize_t); used throughout dispenso for indices and chunk arithmetic.
using ssize_t = std::make_signed<std::size_t>::type;

// DISPENSO_INLINE strongly requests inlining on compilers that provide a
// force-inline attribute, and falls back to plain `inline` elsewhere.
#if defined(__clang__) || defined(__GNUC__)
#define DISPENSO_INLINE __attribute__((always_inline)) inline
#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define DISPENSO_INLINE __forceinline
#else
#define DISPENSO_INLINE inline
#endif // PLATFORM
56
// A constant that defines a safe number of bytes + alignment to avoid
// false sharing between threads (64 bytes matches the cache line size of
// common x86-64 and ARM64 parts).
constexpr size_t kCacheLineSize = 64;

// Lightweight thread-local storage.  __declspec(thread) / __thread avoid
// the heavier C++11 thread_local machinery (no dynamic initialization or
// registered destructors), which matters on hot paths.
// TODO(bbudge): Non-gcc/clang/msvc platforms.
#if defined(_MSC_VER)
#define DISPENSO_THREAD_LOCAL __declspec(thread)
#elif defined(__GNUC__) || defined(__clang__)
#define DISPENSO_THREAD_LOCAL __thread
#else
#error Supply lightweight thread-locals for this compiler. Can define to thread_local if lightweight not available
#endif

// Branch-prediction hint: DISPENSO_EXPECT(expr, expected) maps to
// __builtin_expect where available and is a transparent no-op otherwise.
#if (defined(__GNUC__) || defined(__clang__))
#define DISPENSO_EXPECT(a, b) __builtin_expect(a, b)
#else
#define DISPENSO_EXPECT(a, b) a
#endif

// Portable push/pop/disable of specific compiler warnings.  The named
// DISPENSO_DISABLE_WARNING_* macros expand to the right pragma on
// compilers that emit that warning and to nothing elsewhere.
// clang-format off
#if (defined(__GNUC__) || defined(__clang__))
#define DO_PRAGMA(X) _Pragma(#X)
#define DISPENSO_DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push)
#define DISPENSO_DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop)
#define DISPENSO_DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored #warningName)
#if !defined(__clang__)
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS \
  DISPENSO_DISABLE_WARNING(-Wgnu-zero-variadic-macro-arguments)
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS \
  DISPENSO_DISABLE_WARNING(-Wglobal-constructors)
#endif
#elif defined(_MSC_VER)
#define DISPENSO_DISABLE_WARNING_PUSH __pragma(warning(push))
#define DISPENSO_DISABLE_WARNING_POP __pragma(warning(pop))
#define DISPENSO_DISABLE_WARNING(warningNumber) __pragma(warning(disable : warningNumber))
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_PUSH
#define DISPENSO_DISABLE_WARNING_POP
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#endif
// clang-format on
111
112template <typename T>
114 public:
115 CacheAligned() = default;
116 CacheAligned(T t) : t_(t) {}
117 operator T&() {
118 return t_;
119 }
120
121 operator const T&() const {
122 return t_;
123 }
124
125 private:
126 alignas(kCacheLineSize) T t_;
127};
128
129namespace detail {
130
/**
 * Raw byte storage large enough, and suitably aligned, to hold a T —
 * without constructing one.  Useful for placement-new patterns.
 **/
template <typename T>
struct AlignedBuffer {
  // alignas(T) is equivalent to alignas(alignof(T)).
  alignas(T) char b[sizeof(T)];
};
135
// A std::atomic<T*> padded out to a full cache line so that adjacent
// atomics never share a line (prevents false sharing under contention).
template <typename T>
struct alignas(kCacheLineSize) AlignedAtomic : public std::atomic<T*> {};
138
/**
 * Allocate at least `bytes` bytes whose address is a multiple of
 * `alignment`.  The block must be released with alignedFree.
 *
 * @param bytes The number of usable bytes to allocate.
 * @param alignment Required alignment; must be a power of two.
 * @return An aligned pointer, or nullptr if the underlying malloc fails.
 **/
inline void* alignedMalloc(size_t bytes, size_t alignment) {
  // Guarantee room for the recovery word stored just below the aligned
  // address (see below).
  alignment = std::max(alignment, sizeof(uintptr_t));
  // The mask arithmetic below is only valid for power-of-two alignments.
  assert((alignment & (alignment - 1)) == 0 && "alignment must be a power of two");
  char* ptr = reinterpret_cast<char*>(::malloc(bytes + alignment));
  if (!ptr) {
    // Fix: the original code continued on malloc failure and wrote the
    // recovery word through a pointer derived from nullptr (UB).
    return nullptr;
  }
  uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t oldBase = base;
  uintptr_t mask = alignment - 1;
  // Advance past the recovery slot, then round down to the alignment
  // boundary; base always lands in (ptr, ptr + alignment].
  base += alignment;
  base &= ~mask;

  // Stash the original malloc() result immediately before the aligned
  // region so alignedFree can recover and free it.
  uintptr_t* recovery = reinterpret_cast<uintptr_t*>(base - sizeof(uintptr_t));
  *recovery = oldBase;
  return reinterpret_cast<void*>(base);
}
152
// Convenience overload: allocate with cache-line alignment so the block
// cannot falsely share a line with a neighbor.  Release with alignedFree.
inline void* alignedMalloc(size_t bytes) {
  return alignedMalloc(bytes, kCacheLineSize);
}
156
/**
 * Release a block previously obtained from alignedMalloc.  Passing
 * nullptr is a no-op, mirroring ::free semantics.
 **/
inline void alignedFree(void* ptr) {
  if (ptr) {
    // alignedMalloc stashed the original malloc() result in the word
    // immediately preceding the aligned address; recover it and free it.
    uintptr_t* slot = reinterpret_cast<uintptr_t*>(static_cast<char*>(ptr) - sizeof(uintptr_t));
    ::free(reinterpret_cast<void*>(*slot));
  }
}
165
// Deleter for smart pointers whose storage came from alignedMalloc:
// runs the destructor, then releases the aligned allocation.
template <typename T>
struct AlignedFreeDeleter {
  void operator()(T* ptr) {
    ptr->~T();
    detail::alignedFree(ptr);
  }
};
// void specialization: nothing to destroy, just release the memory.
template <>
struct AlignedFreeDeleter<void> {
  void operator()(void* ptr) {
    detail::alignedFree(ptr);
  }
};
179
180template <typename T, class... Args>
181std::shared_ptr<T> make_shared(Args&&... args) {
182 void* tv = alignedMalloc(sizeof(T), alignof(T));
183 T* t = new (tv) T(std::forward<Args>(args)...);
184 return std::shared_ptr<T>(t, AlignedFreeDeleter<T>());
185}
186
187inline constexpr uintptr_t alignToCacheLine(uintptr_t val) {
188 constexpr uintptr_t kMask = kCacheLineSize - 1;
189 val += kMask;
190 val &= ~kMask;
191 return val;
192}
193
// cpuRelax(): hint to the CPU that the caller is spin-waiting, so it can
// reduce power and ease contention (e.g. with an SMT sibling).  The
// "memory" clobber also stops the compiler from hoisting loads out of a
// spin loop around the call.
#if defined __x86_64__ || defined __i386__
inline void cpuRelax() {
  // x86 spin-wait hint.
  asm volatile("pause" ::: "memory");
}
#elif defined __arm64__ || defined __aarch64__
inline void cpuRelax() {
  // AArch64 spin-wait hint.
  asm volatile("yield" ::: "memory");
}
#elif defined __powerpc__ || defined __POWERPC__
#if defined __APPLE__
inline void cpuRelax() {
  // Apple's assembler spells registers with an 'r' prefix; this encodes
  // the PowerPC low-priority (yield) hint.
  asm volatile("or r27,r27,r27" ::: "memory");
}
#else
inline void cpuRelax() {
  // ELF assemblers take bare register numbers for the same hint.
  asm volatile("or 27,27,27" ::: "memory");
}
#endif // APPLE
#else
// TODO: provide reasonable relax on other archs.
inline void cpuRelax() {}
#endif // ARCH
216
217// When statically chunking a range, it is generally not possible to use a single chunk size plus
218// remainder and get a good load distribution. By estimating too high, we can have idle threads. By
219// estimating too low, the remainder can be several times as large as the chunk for other threads.
220// Instead, we compute the chunk size that is the ceil of the fractional chunk size. That can be
221// used for the first transitionIndex values, while the remaining (chunks - transitionTaskIndex)
222// values will be ceilChunkSize - 1.
struct StaticChunking {
  ssize_t transitionTaskIndex;
  ssize_t ceilChunkSize;
};

/**
 * Split `items` work items across `chunks` chunks as evenly as possible:
 * tasks with index < transitionTaskIndex receive ceilChunkSize items, and
 * every remaining task receives ceilChunkSize - 1.
 *
 * @param items Total number of items to distribute.
 * @param chunks Number of chunks; must be positive.
 * @return The transition index and the larger (ceil) chunk size.
 **/
inline StaticChunking staticChunkSize(ssize_t items, ssize_t chunks) {
  assert(chunks > 0);
  const ssize_t ceilSize = (items + chunks - 1) / chunks;
  // Number of chunks that must be one smaller to land exactly on `items`.
  const ssize_t shortChunks = ceilSize * chunks - items;
  return StaticChunking{chunks - shortChunks, ceilSize};
}
236
237} // namespace detail
238} // namespace dispenso
constexpr size_t kCacheLineSize
A constant that defines a safe number of bytes+alignment to avoid false sharing.
Definition platform.h:61
detail::AlignedBuffer< T > AlignedBuffer
Buffer with proper alignment for type T.
Definition util.h:213
detail::StaticChunking StaticChunking
Information for statically chunking a range across threads.
Definition util.h:264