dispenso 1.5.1
A library for task parallelism
Loading...
Searching...
No Matches
parallel_for.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
14#pragma once
15
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <new>
#include <tuple>
#include <type_traits>
#include <utility>

#include <dispenso/detail/can_invoke.h>
#include <dispenso/detail/per_thread_info.h>
#include <dispenso/task_set.h>
23
24namespace dispenso {
25
#if DISPENSO_HAS_CONCEPTS
// Constrains functors callable as f(begin, end) over a chunk of indices.
template <typename F, typename IntegerT>
concept ParallelForRangeFunc = std::invocable<F, IntegerT, IntegerT>;

// Constrains functors callable as f(i) for a single index.
template <typename F, typename IntegerT>
concept ParallelForIndexFunc = std::invocable<F, IntegerT>;

// Constrains functors callable as f(state, begin, end) with per-worker state.
template <typename F, typename StateRef, typename IntegerT>
concept ParallelForStateRangeFunc = std::invocable<F, StateRef, IntegerT, IntegerT>;

// Constrains functors callable as f(state, i) with per-worker state.
template <typename F, typename StateRef, typename IntegerT>
concept ParallelForStateIndexFunc = std::invocable<F, StateRef, IntegerT>;
#endif // DISPENSO_HAS_CONCEPTS
63
/**
 * Chunking strategy for parallel_for: kStatic divides the range into one
 * contiguous chunk per worker up front; kAuto sizes chunks dynamically and
 * workers claim them on demand via a shared counter.
 **/
enum class ParForChunking { kStatic, kAuto };

/**
 * Options for controlling the behavior of the parallel_for overloads.
 **/
struct ParForOptions {
  /**
   * Maximum number of threads to use (the pool size still applies). Setting
   * this to 0 forces serial execution on the calling thread.
   **/
  uint32_t maxThreads = std::numeric_limits<int32_t>::max();
  /**
   * Whether the calling thread participates in the work and blocks until the
   * loop is complete. When false, all data referenced by the loop (including
   * the state container, if any) must remain alive until the owning TaskSet
   * is waited on.
   **/
  bool wait = true;
  /**
   * Chunking strategy used by the (start, end) overloads that build their
   * own ChunkedRange.
   **/
  ParForChunking defaultChunking = ParForChunking::kStatic;
  /**
   * Minimum number of items each chunk should contain; values below 1 are
   * treated as 1.
   **/
  uint32_t minItemsPerChunk = 1;
  /**
   * When true, existing elements of a caller-provided state container are
   * kept and reused; when false the container is cleared and every state is
   * regenerated.
   **/
  bool reuseExistingState = false;
};
114
/**
 * Describes a range of integers [start, end) to be processed in chunks.
 * chunk == 0 is the "auto" sentinel (size computed at schedule time) and
 * chunk == kStatic requests static distribution (one contiguous chunk per
 * worker).
 **/
template <typename IntegerT = ssize_t>
struct ChunkedRange {
  // We need to utilize 64-bit integers to avoid overflow, e.g. passing -2**30, 2**30 as int32 will
  // result in overflow unless we cast to 64-bit. Note that if we have a range of e.g. -2**63+1 to
  // 2**63-1, we cannot hold the result in an int64_t. We could in a uint64_t, but it is quite
  // tricky to make this work. However, I do not expect ranges larger than can be held in int64_t
  // since people want their computations to finish before the heat death of the sun (slight
  // exaggeration).
  using size_type = std::conditional_t<std::is_signed<IntegerT>::value, int64_t, uint64_t>;

  // Tag types selecting static vs. automatic chunk sizing at construction.
  struct Static {};
  struct Auto {};
  // Sentinel chunk value meaning "static chunking".
  static constexpr IntegerT kStatic = std::numeric_limits<IntegerT>::max();

  // Construct with an explicit per-chunk item count c.
  ChunkedRange(IntegerT s, IntegerT e, IntegerT c) : start(s), end(e), chunk(c) {}
  // Construct with static chunking (one contiguous chunk per worker).
  ChunkedRange(IntegerT s, IntegerT e, Static) : ChunkedRange(s, e, kStatic) {}
  // Construct with automatic chunk sizing (chunk == 0 sentinel).
  ChunkedRange(IntegerT s, IntegerT e, Auto) : ChunkedRange(s, e, 0) {}

  bool isStatic() const {
    return chunk == kStatic;
  }

  bool isAuto() const {
    return chunk == 0;
  }

  bool empty() const {
    return end <= start;
  }

  size_type size() const {
    return static_cast<size_type>(end) - start;
  }

  /**
   * Compute (chunkSize, numChunks) for dynamic scheduling.
   *
   * @param numLaunched number of tasks launched to the pool
   * @param oneOnCaller whether the calling thread also participates
   * @param minChunkSize lower bound on the chunk size when auto-chunking
   * @return tuple of (items per chunk, number of chunks)
   *
   * Must not be called for static ranges; those go through the static
   * distribution path instead (enforced via std::abort below).
   **/
  template <typename OtherInt>
  std::tuple<size_type, size_type>
  calcChunkSize(OtherInt numLaunched, bool oneOnCaller, size_type minChunkSize) const {
    size_type workingThreads = static_cast<size_type>(numLaunched) + size_type{oneOnCaller};
    assert(workingThreads > 0);

    if (!chunk) {
      // Auto chunking: target up to 16 chunks per worker, growing chunks
      // until they meet minChunkSize. dynFactor is clamped to at least 1 so
      // roughChunks can never be zero: the previous do/while divided by zero
      // when size() < workingThreads (initial dynFactor of 0), or when
      // dynFactor decremented to zero before chunkSize reached minChunkSize.
      size_type dynFactor =
          std::max<size_type>(size_type{1}, std::min<size_type>(16, size() / workingThreads));
      size_type chunkSize;
      while (true) {
        size_type roughChunks = dynFactor * workingThreads;
        chunkSize = (size() + roughChunks - 1) / roughChunks;
        if (chunkSize >= minChunkSize || dynFactor == 1) {
          break;
        }
        --dynFactor;
      }
      return {chunkSize, (size() + chunkSize - 1) / chunkSize};
    } else if (chunk == kStatic) {
      // This should never be called. The static distribution versions of the parallel_for
      // functions should be invoked instead.
      std::abort();
    }
    return {chunk, (size() + chunk - 1) / chunk};
  }

  IntegerT start;
  IntegerT end;
  IntegerT chunk;
};
206
214template <typename IntegerA, typename IntegerB>
215inline ChunkedRange<std::common_type_t<IntegerA, IntegerB>>
216makeChunkedRange(IntegerA start, IntegerB end, ParForChunking chunking = ParForChunking::kStatic) {
217 using IntegerT = std::common_type_t<IntegerA, IntegerB>;
218 return (chunking == ParForChunking::kStatic)
219 ? ChunkedRange<IntegerT>(start, end, typename ChunkedRange<IntegerT>::Static())
220 : ChunkedRange<IntegerT>(start, end, typename ChunkedRange<IntegerT>::Auto());
221}
222
230template <typename IntegerA, typename IntegerB, typename IntegerC>
231inline ChunkedRange<std::common_type_t<IntegerA, IntegerB>>
232makeChunkedRange(IntegerA start, IntegerB end, IntegerC chunkSize) {
233 return ChunkedRange<std::common_type_t<IntegerA, IntegerB>>(start, end, chunkSize);
234}
235
236namespace detail {
237
238struct NoOpIter {
239 using difference_type = std::ptrdiff_t;
240 using value_type = int;
241 using pointer = int*;
242 using reference = int&;
243 using iterator_category = std::random_access_iterator_tag;
244
245 int& operator*() const {
246 static DISPENSO_THREAD_LOCAL int dummy = 0;
247 return dummy;
248 }
249 NoOpIter& operator++() {
250 return *this;
251 }
252 NoOpIter operator++(int) {
253 return *this;
254 }
255 NoOpIter& operator--() {
256 return *this;
257 }
258 NoOpIter operator--(int) {
259 return *this;
260 }
261 NoOpIter& operator+=(difference_type) {
262 return *this;
263 }
264 NoOpIter& operator-=(difference_type) {
265 return *this;
266 }
267 NoOpIter operator+(difference_type) const {
268 return *this;
269 }
270 NoOpIter operator-(difference_type) const {
271 return *this;
272 }
273 difference_type operator-(const NoOpIter&) const {
274 return 0;
275 }
276 bool operator==(const NoOpIter&) const {
277 return true;
278 }
279 bool operator!=(const NoOpIter&) const {
280 return false;
281 }
282 bool operator<(const NoOpIter&) const {
283 return false;
284 }
285 int& operator[](difference_type) const {
286 static DISPENSO_THREAD_LOCAL int dummy = 0;
287 return dummy;
288 }
289};
290
291struct NoOpContainer {
292 size_t size() const {
293 return 0;
294 }
295
296 bool empty() const {
297 return true;
298 }
299
300 void clear() {}
301
302 NoOpIter begin() {
303 return {};
304 }
305
306 void emplace_back(int) {}
307
308 int& front() {
309 static int i;
310 return i;
311 }
312};
313
// Generator used by the state-less overloads; produces a throwaway int state.
struct NoOpStateGen {
  int operator()() const {
    return int{};
  }
};
319
/**
 * Ensure that states holds at least numNeeded elements, creating any missing
 * ones with defaultState(). When reuseExistingState is false the container is
 * emptied first, so every element is freshly generated.
 **/
template <typename StateContainer, typename StateGen>
void initStates(
    StateContainer& states,
    const StateGen& defaultState,
    size_t numNeeded,
    bool reuseExistingState) {
  if (!reuseExistingState) {
    states.clear();
  }
  size_t present = states.size();
  while (present < numNeeded) {
    states.emplace_back(defaultState());
    ++present;
  }
}
338
/**
 * Static scheduling backend for the state-based parallel_for overloads. The
 * range is split up-front into one contiguous chunk per worker (no shared
 * atomic counter), so workers never contend once launched.
 *
 * @param taskSet task set to schedule into
 * @param states per-worker state container (grown via initStates)
 * @param defaultState nullary generator producing new state elements
 * @param range the chunked range to process
 * @param f functor invoked as f(state, begin, end)
 * @param maxThreads cap on the number of workers to use
 * @param wait when true, the calling thread runs the final chunk itself and
 *        then blocks on taskSet.wait()
 * @param reuseExistingState when true, existing state elements are kept
 **/
template <
    typename TaskSetT,
    typename IntegerT,
    typename F,
    typename StateContainer,
    typename StateGen>
void parallel_for_staticImpl(
    TaskSetT& taskSet,
    StateContainer& states,
    const StateGen& defaultState,
    const ChunkedRange<IntegerT>& range,
    F&& f,
    ssize_t maxThreads,
    bool wait,
    bool reuseExistingState) {
  using size_type = typename ChunkedRange<IntegerT>::size_type;

  // When waiting, the calling thread also works, so one extra worker is available.
  size_type numThreads = std::min<size_type>(taskSet.numPoolThreads() + wait, maxThreads);
  // Reduce threads used if they exceed work to be done.
  numThreads = std::min(numThreads, range.size());

  detail::initStates(states, defaultState, static_cast<size_t>(numThreads), reuseExistingState);

  // staticChunkSize appears to distribute range.size() items over numThreads
  // tasks: tasks before transitionTaskIndex get ceilChunkSize items, later
  // ones one fewer (inferred from usage below — see detail::staticChunkSize).
  auto chunking =
      detail::staticChunkSize(static_cast<ssize_t>(range.size()), static_cast<ssize_t>(numThreads));
  IntegerT chunkSize = static_cast<IntegerT>(chunking.ceilChunkSize);

  // "Perfectly chunked" means every task receives exactly ceilChunkSize items.
  bool perfectlyChunked = static_cast<size_type>(chunking.transitionTaskIndex) == numThreads;

  // Number of tasks to schedule (all but the last one if wait is true)
  size_type numToSchedule = wait ? numThreads - 1 : numThreads;

  if (numToSchedule > 0) {
    // Precompute range boundaries for the generator
    // First loop: indices [0, transitionTaskIndex) use ceilChunkSize
    // Second loop: indices [transitionTaskIndex, numToSchedule) use ceilChunkSize - 1
    size_type transitionIdx = perfectlyChunked ? numToSchedule : chunking.transitionTaskIndex;
    IntegerT smallChunkSize = static_cast<IntegerT>(chunkSize - !perfectlyChunked);

    taskSet.scheduleBulk(
        static_cast<size_t>(numToSchedule),
        [&, chunkSize, smallChunkSize, transitionIdx](size_t idx) {
          // Calculate start position for this chunk
          IntegerT start;
          IntegerT thisChunkSize;
          if (static_cast<size_type>(idx) < transitionIdx) {
            IntegerT i = static_cast<IntegerT>(idx);
            start = static_cast<IntegerT>(range.start + static_cast<IntegerT>(i * chunkSize));
            thisChunkSize = chunkSize;
          } else {
            // After transition, chunks are smaller by 1
            IntegerT ti = static_cast<IntegerT>(transitionIdx);
            IntegerT ri = static_cast<IntegerT>(idx - transitionIdx);
            start = static_cast<IntegerT>(
                range.start + static_cast<IntegerT>(ti * chunkSize) +
                static_cast<IntegerT>(ri * smallChunkSize));
            thisChunkSize = smallChunkSize;
          }
          IntegerT end = static_cast<IntegerT>(start + thisChunkSize);

          // Each task is bound to its own state element by position.
          auto stateIt = states.begin();
          std::advance(stateIt, static_cast<ptrdiff_t>(idx));

          return [it = stateIt, start, end, f]() {
            // Mark recursion so nested parallel_for calls run serially.
            auto recurseInfo = detail::PerPoolPerThreadInfo::parForRecurse();
            f(*it, start, end);
          };
        });
  }

  if (wait) {
    // Execute the last chunk on the calling thread
    auto stateIt = states.begin();
    std::advance(stateIt, static_cast<ptrdiff_t>(numThreads - 1));
    // Calculate start of last chunk
    size_type transitionIdx = perfectlyChunked ? numThreads - 1 : chunking.transitionTaskIndex;
    IntegerT smallChunkSize = static_cast<IntegerT>(chunkSize - !perfectlyChunked);
    IntegerT lastStart;
    if (numThreads - 1 < transitionIdx) {
      IntegerT i = static_cast<IntegerT>(numThreads - 1);
      lastStart = static_cast<IntegerT>(range.start + static_cast<IntegerT>(i * chunkSize));
    } else {
      IntegerT ti = static_cast<IntegerT>(transitionIdx);
      IntegerT ri = static_cast<IntegerT>(numThreads - 1 - transitionIdx);
      lastStart = static_cast<IntegerT>(
          range.start + static_cast<IntegerT>(ti * chunkSize) +
          static_cast<IntegerT>(ri * smallChunkSize));
    }
    // The caller's chunk always ends exactly at range.end.
    f(*stateIt, lastStart, range.end);
    taskSet.wait();
  }
}
431
// Result of adjustChunkSizing: the (possibly reduced) thread cap and whether
// scheduling should fall back to static chunking.
template <typename IntegerT>
struct ChunkSizingResult {
  // Maximum number of threads worth using for the range after adjustment.
  typename ChunkedRange<IntegerT>::size_type maxThreads;
  // True if static chunking should be used instead of dynamic chunk claiming.
  bool isStatic;
};
437
/**
 * Clamp the requested thread budget and decide between static and dynamic
 * chunking for a given range, pool size, and options.
 *
 * @param range the range to be processed
 * @param maxThreads requested thread cap
 * @param isStatic whether the range itself requests static chunking
 * @param minItemsPerChunk minimum items each chunk should hold (>= 1)
 * @param poolThreads number of threads in the underlying pool
 * @param wait whether the calling thread participates in the work
 * @return the adjusted thread cap plus the final static/dynamic decision
 **/
template <typename IntegerT>
ChunkSizingResult<IntegerT> adjustChunkSizing(
    const ChunkedRange<IntegerT>& range,
    typename ChunkedRange<IntegerT>::size_type maxThreads,
    bool isStatic,
    uint32_t minItemsPerChunk,
    typename ChunkedRange<IntegerT>::size_type poolThreads,
    bool wait) {
  using size_type = typename ChunkedRange<IntegerT>::size_type;

  // Step 1: never use more threads than the pool actually has
  // (+1 when waiting, because the calling thread also works).
  maxThreads = std::min<size_type>(maxThreads, poolThreads + wait);

  if (minItemsPerChunk > 1) {
    // Step 2a: reduce threads so each gets at least minItemsPerChunk items
    size_type maxWorkers = range.size() / minItemsPerChunk;
    if (maxWorkers < maxThreads) {
      maxThreads = maxWorkers;
    }
    // Step 2b: if dynamic chunks would still be too small, use static scheduling
    if (maxThreads > 0 && range.size() / (maxThreads + wait) < minItemsPerChunk && range.isAuto()) {
      isStatic = true;
    }
  } else if (range.size() <= poolThreads + wait) {
    // Step 3: fewer items than threads — static is better (no atomic overhead)
    if (range.isAuto()) {
      isStatic = true;
    } else if (!range.isStatic()) {
      // Explicit chunk size: stay dynamic, but never budget more workers than
      // there are items to hand out.
      maxThreads = range.size() - wait;
    }
  }

  return {maxThreads, isStatic};
}
484
/**
 * Dynamic (work-claiming) backend for the state-based parallel_for overloads.
 * Workers repeatedly fetch_add the shared counter to claim chunk cur, running
 * f over [start + cur * chunkSize, start + (cur + 1) * chunkSize) — the final
 * chunk is clamped to end — until the counter passes numChunks.
 *
 * @param taskSet task set to schedule into
 * @param states per-worker state container; element i is bound to launched
 *        task i, and (when wait is true) element numToLaunch to the caller
 * @param start first index of the range
 * @param end one-past-last index of the range
 * @param f functor invoked as f(state, chunkBegin, chunkEnd)
 * @param numToLaunch number of tasks to schedule on the pool
 * @param chunkSize items per chunk
 * @param numChunks total number of chunks
 * @param index shared atomic chunk counter; must start at 0
 * @param exitAction invoked exactly once per worker with its final
 *        (exhausted) counter value; the non-waiting caller uses this to free
 *        the heap-allocated counter after the last worker exits
 * @param wait when true, the caller also runs the worker loop and then
 *        blocks on taskSet.wait()
 **/
template <
    typename TaskSetT,
    typename IntegerT,
    typename F,
    typename StateContainer,
    typename IndexRef,
    typename ExitAction>
void parallel_for_dynamicImpl(
    TaskSetT& taskSet,
    StateContainer& states,
    IntegerT start,
    IntegerT end,
    F&& f,
    size_t numToLaunch,
    typename ChunkedRange<IntegerT>::size_type chunkSize,
    typename ChunkedRange<IntegerT>::size_type numChunks,
    IndexRef& index,
    ExitAction exitAction,
    bool wait) {
  auto worker = [start, end, &index, f, chunkSize, numChunks, exitAction](auto& s) {
    // Mark recursion so nested parallel_for calls run serially.
    auto recurseInfo = detail::PerPoolPerThreadInfo::parForRecurse();
    while (true) {
      // Claim the next chunk. A relaxed RMW only hands out disjoint chunk
      // indices; visibility of results is established by task completion.
      auto cur = index.fetch_add(1, std::memory_order_relaxed);
      if (cur >= numChunks) {
        exitAction(cur);
        break;
      }
      auto sidx = static_cast<IntegerT>(start + cur * chunkSize);
      if (cur + 1 == numChunks) {
        // Final chunk: clamp to the true end of the range.
        f(s, sidx, end);
      } else {
        f(s, sidx, static_cast<IntegerT>(sidx + chunkSize));
      }
    }
  };

  taskSet.scheduleBulk(static_cast<size_t>(numToLaunch), [&states, &worker](size_t i) {
    // Bind launched task i to state element i by position.
    auto it = states.begin();
    std::advance(it, static_cast<ptrdiff_t>(i));
    return [&s = *it, worker]() { worker(s); };
  });

  if (wait) {
    // The caller works too, using the state element just past the launched ones.
    auto it = states.begin();
    std::advance(it, static_cast<ptrdiff_t>(numToLaunch));
    worker(*it);
    taskSet.wait();
  }
}
545
546} // namespace detail
547
564template <
565 typename TaskSetT,
566 typename IntegerT,
567 typename F,
568 typename StateContainer,
569 typename StateGen>
571 TaskSetT& taskSet,
572 StateContainer& states,
573 const StateGen& defaultState,
574 const ChunkedRange<IntegerT>& range,
575 F&& f,
576 ParForOptions options = {}) {
577 if (range.empty()) {
578 if (options.wait) {
579 taskSet.wait();
580 }
581 return;
582 }
583
584 using size_type = typename ChunkedRange<IntegerT>::size_type;
585
586 uint32_t minItemsPerChunk = std::max<uint32_t>(1, options.minItemsPerChunk);
587 size_type maxThreads = std::max<int32_t>(options.maxThreads, 1);
588 bool isStatic = range.isStatic();
589
590 const size_type N = taskSet.numPoolThreads();
591 if (N == 0 || !options.maxThreads || range.size() <= minItemsPerChunk ||
592 detail::PerPoolPerThreadInfo::isParForRecursive(&taskSet.pool())) {
593 detail::initStates(states, defaultState, 1, options.reuseExistingState);
594 f(*states.begin(), range.start, range.end);
595 if (options.wait) {
596 taskSet.wait();
597 }
598 return;
599 }
600
601 auto chunkSizing =
602 detail::adjustChunkSizing(range, maxThreads, isStatic, minItemsPerChunk, N, options.wait);
603 maxThreads = chunkSizing.maxThreads;
604 isStatic = chunkSizing.isStatic;
605
606 // If adjustment reduced threads below 2, run inline — not worth parallelizing.
607 if (maxThreads < 2) {
608 detail::initStates(states, defaultState, 1, options.reuseExistingState);
609 f(*states.begin(), range.start, range.end);
610 if (options.wait) {
611 taskSet.wait();
612 }
613 return;
614 }
615
616 if (isStatic) {
617 detail::parallel_for_staticImpl(
618 taskSet,
619 states,
620 defaultState,
621 range,
622 std::forward<F>(f),
623 static_cast<ssize_t>(maxThreads),
624 options.wait,
625 options.reuseExistingState);
626 return;
627 }
628
629 const size_type numToLaunch = std::min<size_type>(maxThreads - options.wait, N);
630
631 detail::initStates(
632 states,
633 defaultState,
634 static_cast<size_t>(numToLaunch + options.wait),
635 options.reuseExistingState);
636
637 if (numToLaunch == 1 && !options.wait) {
638 taskSet.schedule(
639 [&s = states.front(), range, f = std::move(f)]() { f(s, range.start, range.end); });
640 return;
641 }
642
643 auto chunkInfo = range.calcChunkSize(numToLaunch, options.wait, minItemsPerChunk);
644 auto chunkSize = std::get<0>(chunkInfo);
645 auto numChunks = std::get<1>(chunkInfo);
646
647 if (options.wait) {
648 alignas(kCacheLineSize) std::atomic<decltype(numChunks)> index(0);
649 detail::parallel_for_dynamicImpl(
650 taskSet,
651 states,
652 range.start,
653 range.end,
654 std::forward<F>(f),
655 static_cast<size_t>(numToLaunch),
656 chunkSize,
657 numChunks,
658 index,
659 [](auto) {},
660 options.wait);
661 } else {
662 using SizeType = decltype(numChunks);
663 struct ChunkIndex {
664 std::atomic<SizeType> index;
665 };
666 static_assert(sizeof(ChunkIndex) <= kCacheLineSize, "ChunkIndex must fit in one cache line");
667 char* mem = allocSmallBuffer<kCacheLineSize>();
668 auto* ci = new (mem) ChunkIndex{{0}};
669 SizeType lastExit = numChunks + static_cast<SizeType>(numToLaunch) - 1;
670 detail::parallel_for_dynamicImpl(
671 taskSet,
672 states,
673 range.start,
674 range.end,
675 std::forward<F>(f),
676 static_cast<size_t>(numToLaunch),
677 chunkSize,
678 numChunks,
679 ci->index,
680 [ci, lastExit](auto cur) {
681 if (cur == lastExit) {
682 deallocSmallBuffer<kCacheLineSize>(ci);
683 }
684 },
685 options.wait);
686 }
687}
688
698template <typename TaskSetT, typename IntegerT, typename F>
699DISPENSO_REQUIRES(ParallelForRangeFunc<F, IntegerT>)
700void parallel_for(
701 TaskSetT& taskSet,
702 const ChunkedRange<IntegerT>& range,
703 F&& f,
704 ParForOptions options = {}) {
705 detail::NoOpContainer container;
706 parallel_for(
707 taskSet,
708 container,
709 detail::NoOpStateGen(),
710 range,
711 [f = std::move(f)](int /*noop*/, auto i, auto j) { f(i, j); },
712 options);
713}
714
724template <typename IntegerT, typename F>
725DISPENSO_REQUIRES(ParallelForRangeFunc<F, IntegerT>)
726void parallel_for(const ChunkedRange<IntegerT>& range, F&& f, ParForOptions options = {}) {
727 TaskSet taskSet(globalThreadPool());
728 options.wait = true;
729 parallel_for(taskSet, range, std::forward<F>(f), options);
730}
731
749template <typename F, typename IntegerT, typename StateContainer, typename StateGen>
751 StateContainer& states,
752 const StateGen& defaultState,
753 const ChunkedRange<IntegerT>& range,
754 F&& f,
755 ParForOptions options = {}) {
756 TaskSet taskSet(globalThreadPool());
757 options.wait = true;
758 parallel_for(taskSet, states, defaultState, range, std::forward<F>(f), options);
759}
760
771template <
772 typename TaskSetT,
773 typename IntegerA,
774 typename IntegerB,
775 typename F,
776 std::enable_if_t<std::is_integral<IntegerA>::value, bool> = true,
777 std::enable_if_t<std::is_integral<IntegerB>::value, bool> = true,
778 std::enable_if_t<detail::CanInvoke<F(IntegerA)>::value, bool> = true>
780 TaskSetT& taskSet,
781 IntegerA start,
782 IntegerB end,
783 F&& f,
784 ParForOptions options = {}) {
785 using IntegerT = std::common_type_t<IntegerA, IntegerB>;
786
787 auto range = makeChunkedRange(start, end, options.defaultChunking);
788 parallel_for(
789 taskSet,
790 range,
791 [f = std::move(f)](IntegerT s, IntegerT e) {
792 for (IntegerT i = s; i < e; ++i) {
793 f(i);
794 }
795 },
796 options);
797}
798
800template <
801 typename TaskSetT,
802 typename IntegerA,
803 typename IntegerB,
804 typename F,
805 std::enable_if_t<std::is_integral<IntegerA>::value, bool> = true,
806 std::enable_if_t<std::is_integral<IntegerB>::value, bool> = true,
807 std::enable_if_t<detail::CanInvoke<F(IntegerA, IntegerB)>::value, bool> = true>
808void parallel_for(
809 TaskSetT& taskSet,
810 IntegerA start,
811 IntegerB end,
812 F&& f,
813 ParForOptions options = {}) {
814 auto range = makeChunkedRange(start, end, options.defaultChunking);
815 parallel_for(taskSet, range, std::forward<F>(f), options);
816}
817
828template <
829 typename IntegerA,
830 typename IntegerB,
831 typename F,
832 std::enable_if_t<std::is_integral<IntegerA>::value, bool> = true,
833 std::enable_if_t<std::is_integral<IntegerB>::value, bool> = true>
834void parallel_for(IntegerA start, IntegerB end, F&& f, ParForOptions options = {}) {
835 TaskSet taskSet(globalThreadPool());
836 options.wait = true;
837 parallel_for(taskSet, start, end, std::forward<F>(f), options);
838}
839
858template <
859 typename TaskSetT,
860 typename IntegerA,
861 typename IntegerB,
862 typename F,
863 typename StateContainer,
864 typename StateGen,
865 std::enable_if_t<std::is_integral<IntegerA>::value, bool> = true,
866 std::enable_if_t<std::is_integral<IntegerB>::value, bool> = true,
867 std::enable_if_t<
868 detail::CanInvoke<F(typename StateContainer::reference, IntegerA)>::value,
869 bool> = true>
871 TaskSetT& taskSet,
872 StateContainer& states,
873 const StateGen& defaultState,
874 IntegerA start,
875 IntegerB end,
876 F&& f,
877 ParForOptions options = {}) {
878 using IntegerT = std::common_type_t<IntegerA, IntegerB>;
879 auto range = makeChunkedRange(start, end, options.defaultChunking);
880 parallel_for(
881 taskSet,
882 states,
883 defaultState,
884 range,
885 [f = std::move(f)](auto& state, IntegerT s, IntegerT e) {
886 for (IntegerT i = s; i < e; ++i) {
887 f(state, i);
888 }
889 },
890 options);
891}
892
894template <
895 typename TaskSetT,
896 typename IntegerA,
897 typename IntegerB,
898 typename F,
899 typename StateContainer,
900 typename StateGen,
901 std::enable_if_t<std::is_integral<IntegerA>::value, bool> = true,
902 std::enable_if_t<std::is_integral<IntegerB>::value, bool> = true,
903 std::enable_if_t<
904 detail::CanInvoke<F(typename StateContainer::reference, IntegerA, IntegerB)>::value,
905 bool> = true>
906void parallel_for(
907 TaskSetT& taskSet,
908 StateContainer& states,
909 const StateGen& defaultState,
910 IntegerA start,
911 IntegerB end,
912 F&& f,
913 ParForOptions options = {}) {
914 auto range = makeChunkedRange(start, end, options.defaultChunking);
915 parallel_for(taskSet, states, defaultState, range, std::forward<F>(f), options);
916}
917
937template <
938 typename IntegerA,
939 typename IntegerB,
940 typename F,
941 typename StateContainer,
942 typename StateGen,
943 std::enable_if_t<std::is_integral<IntegerA>::value, bool> = true,
944 std::enable_if_t<std::is_integral<IntegerB>::value, bool> = true>
946 StateContainer& states,
947 const StateGen& defaultState,
948 IntegerA start,
949 IntegerB end,
950 F&& f,
951 ParForOptions options = {}) {
952 TaskSet taskSet(globalThreadPool());
953 options.wait = true;
954 parallel_for(taskSet, states, defaultState, start, end, std::forward<F>(f), options);
955}
956
957} // namespace dispenso
void parallel_for(TaskSetT &taskSet, StateContainer &states, const StateGen &defaultState, const ChunkedRange< IntegerT > &range, F &&f, ParForOptions options={})
ChunkedRange< std::common_type_t< IntegerA, IntegerB > > makeChunkedRange(IntegerA start, IntegerB end, ParForChunking chunking=ParForChunking::kStatic)
constexpr size_t kCacheLineSize
A constant that defines a safe number of bytes+alignment to avoid false sharing.
Definition platform.h:97
ParForChunking defaultChunking