dispenso
A library for task parallelism
 
Loading...
Searching...
No Matches
platform.h
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
12#pragma once
13#include <algorithm>
14#include <atomic>
15#include <cassert>
16#include <cstdlib>
17#include <memory>
18#include <thread>
19#include <type_traits>
20
namespace dispenso {

// Library version. NOTE: macros ignore the enclosing namespace; these are
// globally visible as DISPENSO_MAJOR_VERSION / DISPENSO_MINOR_VERSION.
#define DISPENSO_MAJOR_VERSION 1
#define DISPENSO_MINOR_VERSION 3

// DISPENSO_DLL_ACCESS: symbol import/export annotation for shared-library
// builds.  dllexport/dllimport on Windows (direction chosen by
// DISPENSO_LIB_EXPORT), default visibility on gcc/clang, and empty for static
// builds or unrecognized toolchains.
#if defined(DISPENSO_SHARED_LIB)
#if defined _WIN32

#if defined(DISPENSO_LIB_EXPORT)
#define DISPENSO_DLL_ACCESS __declspec(dllexport)
#else
#define DISPENSO_DLL_ACCESS __declspec(dllimport)
#endif // DISPENSO_LIB_EXPORT

#elif defined(__clang__) || defined(__GNUC__)
#define DISPENSO_DLL_ACCESS __attribute__((visibility("default")))
#endif // PLATFORM
#endif // DISPENSO_SHARED_LIB

// Fallback: expand to nothing when no shared-library annotation was selected.
#if !defined(DISPENSO_DLL_ACCESS)
#define DISPENSO_DLL_ACCESS
#endif // DISPENSO_DLL_ACCESS

// Signed counterpart of std::size_t (POSIX ssize_t is not available on MSVC).
using ssize_t = std::make_signed<std::size_t>::type;

// DISPENSO_INLINE: strongest portable "force inline" hint for each compiler;
// degrades to plain `inline` where no such attribute exists.
#if defined(__clang__) || defined(__GNUC__)
#define DISPENSO_INLINE __attribute__((always_inline)) inline
#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define DISPENSO_INLINE __forceinline
#else
#define DISPENSO_INLINE inline
#endif // PLATFORM

// Assumed cache line size, used for alignment/padding to avoid false sharing.
// NOTE(review): 64 bytes is correct for most x86/ARM cores, but some platforms
// (e.g. Apple M-series) use 128-byte lines -- confirm if those are targets.
constexpr size_t kCacheLineSize = 64;

// DISPENSO_THREAD_LOCAL: lightweight thread-local storage qualifier
// (__thread/__declspec(thread) avoid the dynamic-initialization machinery of
// C++ `thread_local`, so access is cheaper but only trivial init is allowed).
// TODO(bbudge): Non-gcc/clang/msvc platforms.
#if defined(_MSC_VER)
#define DISPENSO_THREAD_LOCAL __declspec(thread)
#elif defined(__GNUC__) || defined(__clang__)
#define DISPENSO_THREAD_LOCAL __thread
#else
#error Supply lightweight thread-locals for this compiler. Can define to thread_local if lightweight not available
#endif

// DISPENSO_EXPECT(a, b): branch-prediction hint; evaluates to `a` while
// telling the compiler `a` is most likely equal to `b`.  Plain `a` where the
// builtin is unavailable.
#if (defined(__GNUC__) || defined(__clang__))
#define DISPENSO_EXPECT(a, b) __builtin_expect(a, b)
#else
#define DISPENSO_EXPECT(a, b) a
#endif

// Portable push/pop/disable of compiler diagnostics.  The two named
// suppression macros cover clang-only warnings, so they intentionally expand
// to nothing on gcc and MSVC.
// clang-format off
#if (defined(__GNUC__) || defined(__clang__))
#define DO_PRAGMA(X) _Pragma(#X)
#define DISPENSO_DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push)
#define DISPENSO_DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop)
#define DISPENSO_DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored #warningName)
#if !defined(__clang__)
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS \
  DISPENSO_DISABLE_WARNING(-Wgnu-zero-variadic-macro-arguments)
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS \
  DISPENSO_DISABLE_WARNING(-Wglobal-constructors)
#endif
#elif defined(_MSC_VER)
#define DISPENSO_DISABLE_WARNING_PUSH __pragma(warning(push))
#define DISPENSO_DISABLE_WARNING_POP __pragma(warning(pop))
#define DISPENSO_DISABLE_WARNING(warningNumber) __pragma(warning(disable : warningNumber))
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#else
#define DISPENSO_DISABLE_WARNING_PUSH
#define DISPENSO_DISABLE_WARNING_POP
#define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS
#define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS
#endif
// clang-format on
108
109template <typename T>
111 public:
112 CacheAligned() = default;
113 CacheAligned(T t) : t_(t) {}
114 operator T&() {
115 return t_;
116 }
117
118 operator const T&() const {
119 return t_;
120 }
121
122 private:
123 alignas(kCacheLineSize) T t_;
124};
125
126namespace detail {
127
// Raw storage sized and aligned for exactly one T, without constructing it.
// Intended for manual placement-new of a T into `b` at a later time.
template <typename T>
struct AlignedBuffer {
  alignas(alignof(T)) char b[sizeof(T)];
};
132
// std::atomic<T*> padded out to a full cache line so that adjacent atomics
// (e.g. in an array) do not false-share.
template <typename T>
struct alignas(kCacheLineSize) AlignedAtomic : public std::atomic<T*> {};
135
// Allocate `bytes` of storage whose address is a multiple of `alignment`
// (which must be a power of two).  The block must be released with
// alignedFree, never plain ::free.
//
// Implementation: over-allocates by `alignment`, rounds the address up, and
// stashes the original malloc'd address in the uintptr_t slot immediately
// below the returned pointer so alignedFree can recover it.
//
// Returns nullptr if the underlying allocation fails or the size computation
// would overflow.
inline void* alignedMalloc(size_t bytes, size_t alignment) {
  // The mask arithmetic below requires a power-of-two alignment.
  assert((alignment & (alignment - 1)) == 0 && "alignment must be a power of two");
  // Guarantee room for the recovery word below the aligned address.
  alignment = std::max(alignment, sizeof(uintptr_t));
  const size_t total = bytes + alignment;
  if (total < bytes) {
    // size_t overflow in the over-allocation; cannot satisfy the request.
    return nullptr;
  }
  char* ptr = reinterpret_cast<char*>(::malloc(total));
  if (ptr == nullptr) {
    // Fix: the original wrote the recovery word through a pointer derived
    // from null when malloc failed (undefined behavior); propagate failure.
    return nullptr;
  }
  uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t oldBase = base;
  uintptr_t mask = alignment - 1;
  // Round up to the next multiple of `alignment` strictly above `base`,
  // leaving at least sizeof(uintptr_t) bytes of slack below the result.
  base += alignment;
  base &= ~mask;

  // Stash the original allocation address for alignedFree.
  uintptr_t* recovery = reinterpret_cast<uintptr_t*>(base - sizeof(uintptr_t));
  *recovery = oldBase;
  return reinterpret_cast<void*>(base);
}
149
// Convenience overload: allocation aligned to a cache line.  Release with
// alignedFree.
inline void* alignedMalloc(size_t bytes) {
  return alignedMalloc(bytes, kCacheLineSize);
}
153
// Release a block previously obtained from alignedMalloc.  Safe to call with
// nullptr (no-op), mirroring ::free.
inline void alignedFree(void* ptr) {
  if (ptr == nullptr) {
    return;
  }
  // alignedMalloc stashed the address actually returned by ::malloc in the
  // uintptr_t slot immediately below the aligned pointer; recover and free it.
  char* aligned = reinterpret_cast<char*>(ptr);
  uintptr_t original = *reinterpret_cast<uintptr_t*>(aligned - sizeof(uintptr_t));
  ::free(reinterpret_cast<void*>(original));
}
162
// Smart-pointer deleter for objects whose storage came from alignedMalloc:
// runs the destructor, then releases through alignedFree (a plain `delete`
// would mismatch the allocator).
template <typename T>
struct AlignedFreeDeleter {
  void operator()(T* ptr) {
    ptr->~T();
    detail::alignedFree(ptr);
  }
};
// void specialization: no destructor to run, just release the raw storage.
template <>
struct AlignedFreeDeleter<void> {
  void operator()(void* ptr) {
    detail::alignedFree(ptr);
  }
};
176
// Like std::make_shared, but the storage honors alignof(T) even when it
// exceeds the default operator-new alignment (e.g. cache-line-aligned types),
// and is paired with AlignedFreeDeleter for release.
// NOTE(review): if T's constructor throws, the alignedMalloc buffer leaks;
// presumably acceptable for the types used here -- confirm.
template <typename T, class... Args>
std::shared_ptr<T> make_shared(Args&&... args) {
  void* tv = alignedMalloc(sizeof(T), alignof(T));
  T* t = new (tv) T(std::forward<Args>(args)...);
  return std::shared_ptr<T>(t, AlignedFreeDeleter<T>());
}
183
184inline constexpr uintptr_t alignToCacheLine(uintptr_t val) {
185 constexpr uintptr_t kMask = kCacheLineSize - 1;
186 val += kMask;
187 val &= ~kMask;
188 return val;
189}
190
// cpuRelax(): hint that the caller is in a spin-wait loop, reducing power use
// and easing contention with the sibling hyperthread.  The "memory" clobber
// makes each call a compiler barrier so spin loops re-read shared state.
#if defined __x86_64__ || defined __i386__
inline void cpuRelax() {
  // x86 PAUSE instruction.
  asm volatile("pause" ::: "memory");
}
#elif defined __arm64__ || defined __aarch64__
inline void cpuRelax() {
  // AArch64 YIELD instruction.
  asm volatile("yield" ::: "memory");
}
#elif defined __powerpc__ || defined __POWERPC__
#if defined __APPLE__
inline void cpuRelax() {
  // PowerPC "low priority" hint; Apple's assembler wants named registers.
  asm volatile("or r27,r27,r27" ::: "memory");
}
#else
inline void cpuRelax() {
  // Same hint, numeric register syntax for non-Apple assemblers.
  asm volatile("or 27,27,27" ::: "memory");
}
#endif // APPLE
#else
// TODO: provide reasonable relax on other archs.
inline void cpuRelax() {}
#endif // ARCH
213
// When statically chunking a range, it is generally not possible to use a single chunk size plus
// remainder and get a good load distribution. By estimating too high, we can have idle threads. By
// estimating too low, the remainder can be several times as large as the chunk for other threads.
// Instead, we compute the chunk size that is the ceil of the fractional chunk size. That can be
// used for the first transitionTaskIndex chunks, while the remaining (chunks - transitionTaskIndex)
// chunks each get ceilChunkSize - 1 items.
struct StaticChunking {
  ssize_t transitionTaskIndex; // Count of leading chunks that carry ceilChunkSize items.
  ssize_t ceilChunkSize; // ceil(items / chunks); later chunks carry one item fewer.
};
224
225inline StaticChunking staticChunkSize(ssize_t items, ssize_t chunks) {
226 assert(chunks > 0);
227 StaticChunking chunking;
228 chunking.ceilChunkSize = (items + chunks - 1) / chunks;
229 ssize_t numLeft = chunking.ceilChunkSize * chunks - items;
230 chunking.transitionTaskIndex = chunks - numLeft;
231 return chunking;
232}
233
234} // namespace detail
235} // namespace dispenso
detail::OpResult< T > OpResult
Definition pipeline.h:29