#include <thread>
#include <memory>
#include <vector>
/// One accumulator slot of the reduction; each worker thread writes its own slot.
struct partial_sums_t
{
    /// Running sum held in this slot.
    double val;

    /// Filler that extends the slot to 64 bytes in total.
    /// NOTE(review): 64 is presumably the cache-line size, used to keep
    /// neighbouring slots from sharing a line -- confirm for the target platform.
    char padding[64u - sizeof(val)];
};
/// Callable handed to the barrier as its completion function.
/// Every invocation bumps a shared counter, so the test can compare the number
/// of invocations against the number of phases it expects.
struct barrier_completion_functor
{
    /// Total number of invocations since the counter was last reset.
    static unsigned count;

    /// Records one invocation.
    void operator()() const noexcept
    {
        count += 1u;
    }
};

// Storage for the invocation counter; starts at zero.
unsigned barrier_completion_functor::count{0u};
bool latch_barrier_test(double seed, std::size_t element_count)
{
double expected_average = (seed + seed + element_count - 1) / 2.0;
constexpr unsigned thread_count = 4u;
constexpr unsigned reduction_count = 2u;
unsigned expected_barrier_completion_count = 0;
auto result = std::make_unique<double[]>(element_count);
auto partial_sums = std::make_unique<partial_sums_t[]>(thread_count);
latch ltch_countdown_wait{thread_count};
latch ltch{thread_count};
barrier<barrier_completion_functor> bar{thread_count};
if (ltch.try_wait())
return false;
std::vector<std::thread> threads;
threads.reserve(thread_count);
barrier_completion_functor::count = 0;
auto thread_fn = [<ch_countdown_wait, <ch, &bar, &result, &partial_sums, &expected_barrier_completion_count, thread_count, seed, element_count](unsigned t)
{
if (t == 0)
ltch_countdown_wait.wait();
else if (t == 1)
ltch_countdown_wait.count_down(2);
else
ltch_countdown_wait.count_down();
for (std::size_t i = t; i < element_count; i += thread_count)
result[i] = seed + i;
ltch.arrive_and_wait();
std::size_t block = element_count / thread_count;
std::size_t off;
if (t >= element_count % thread_count)
off = block * t + (element_count % thread_count);
else
{
++block;
off = block * t;
}
double partial_sum = 0.0;
for (std::size_t i = off; i < off + block; ++i)
partial_sum += result[i];
partial_sums[t].val = partial_sum;
if (t < thread_count / 2)
bar.arrive_and_wait();
else
bar.wait(bar.arrive());
if (!t)
++expected_barrier_completion_count;
std::size_t stage = 1;
while (stage < thread_count)
{
std::size_t stage_next = stage + stage;
std::size_t other = t + stage;
if ((t & (stage_next - 1)) == 0 && other < thread_count)
partial_sums[t].val += partial_sums[other].val;
stage = stage_next;
bar.arrive_and_wait();
if (!t)
++expected_barrier_completion_count;
}
if (!t)
bar.arrive_and_drop();
else
bar.arrive_and_wait();
if (!t)
++expected_barrier_completion_count;
if (!t)
++expected_barrier_completion_count;
else
bar.arrive_and_wait();
};
for (unsigned t = 1u; t < thread_count; ++t)
threads.emplace_back(thread_fn, t);
thread_fn(0);
for (auto& thread:threads)
thread.join();
partial_sums[0].val /= element_count;
return expected_barrier_completion_count == barrier_completion_functor::count && partial_sums[0].val == expected_average;
}
An interface for the portable implementation of threads as defined in the C11 standard, supplemented with ...
Implements a reusable barrier synchronization primitive.
Definition: chsvthrd.h:1943
Implements a single-use barrier synchronization.
Definition: chsvthrd.h:1851
#define verify(expression)
Verifies that a parametrically specified predicate expression is satisfied and calls std::abort,...
Definition: chsverr.h:752