Автоматизация ввода-вывода CUDA/Пример внедрения автоматизации
Перейти к навигации
Перейти к поиску
#include <vector>
template <class T>
cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const std::vector<T>& v)
{
is << v.size();
for (auto val:v)
is << val;
return is;
}
template <class T>
struct device_vector
{
std::size_t m_count = std::size_t();
T* m_pElements = nullptr;
public:
typedef T value_type, *pointer, &reference;
using custom_cuda_device_read = std::true_type;
device_vector() = default;
__device__ explicit device_vector(std::size_t c, const T& init = T())
{
m_pElements = new T[c];
if (m_pElements == nullptr) return;
fill(m_pElements, m_pElements + c, init);
m_count = c;
}
__device__ device_vector(const T* data, std::size_t c)
{
m_pElements = new T[c];
if (m_pElements == nullptr) return;
copy(data, data + c, m_pElements);
m_count = c;
}
__device__ ~device_vector()
{
if (m_pElements != nullptr)
delete [] m_pElements;
}
__device__ device_vector(const device_vector& right) {*this = right;}
__device__ device_vector(device_vector&& right) {*this = std::move(right);}
__device__ device_vector& operator=(device_vector&& right)
{
if (this == &right)
return *this;
if (m_pElements != nullptr)
delete [] m_pElements;
m_pElements = right.m_pElements;
right.m_pElements = nullptr;
m_count = right.m_count;
right.m_count = std::size_t();
return *this;
}
__device__ device_vector& operator=(const device_vector& right)
{
if (this == &right)
return *this;
auto data = new T[right.size()];
memcpy(data, right.data(), c);
return *this = device_vector(data, c);
}
__device__ const T* data() const
{
return m_pElements;
}
__device__ std::size_t size() const
{
return m_count;
}
__device__ const T& operator[](std::size_t i) const
{
return m_pElements[i];
}
__device__ T& operator[](std::size_t i)
{
return m_pElements[i];
}
};
template <class T> struct is_device_vector:std::false_type {};
template <class T> struct is_device_vector<device_vector<T>>:std::true_type {};
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<is_device_vector<T>::value, T>
{
using size_type = typename std::vector<typename T::value_type>::size_type;
auto count = read_as<size_type>(is);
auto v = T(count);
for (size_type i = 0; i < count; ++i)
v[i] = read_as<typename T::value_type>(is);
return v;
}
template <class T>
__global__ void sum_krnl(cuda_input_buffer buf, T* result)
{
auto iThread = std::size_t(blockIdx.x * blockDim.x + threadIdx.x);
static device_vector<T>* pv;
static bool fStop = false;
if (iThread == 0)
{
auto is = cuda_input_buffer_stream_d(buf);
pv = new device_vector<T>(read_as<device_vector<T>>(is));
}
__syncthreads();
auto& v = *pv;
auto iElement = iThread * 2;
std::size_t iter = 0;
while (!fStop)
{
auto iNextElement = iElement + (std::size_t(1) << iter++);
if (iNextElement < v.size())
v[iElement] += v[iNextElement];
else if (iThread == 0)
fStop = true;
__syncthreads();
}
__syncthreads();
if (iThread == 0)
{
*result = v[iElement];
delete pv;
}
}
#include <iostream>
#include <random>
int main()
{
auto sum = [](const auto& v)
{
cuda_input_buffer_stream os;
os << v;
using value_type = typename std::decay_t<decltype(v)>::value_type;
value_type* pResult;
auto err = cudaDeviceReset();
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
err = cudaMalloc(&pResult, sizeof(value_type));
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
value_type result;
sum_krnl<value_type><<<1, 1000>>>(std::move(os).get_cuda_buf(), pResult);
err = cudaMemcpy(&result, pResult, sizeof(value_type), cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
err = cudaFree(pResult);
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
return result;
};
std::vector<int> v_int = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
std::cout << "Average of ints: " << sum(v_int) / v_int.size() << "\n";
std::minstd_rand rnd;
std::vector<float> v_flt;
for (auto i = 0; i < 1000; ++i)
v_flt.emplace_back(std::generate_canonical<float, 16>(rnd));
std::cout << "Average of floats: " << sum(v_flt) / v_flt.size() << "\n";
return 0;
}