Автоматизация ввода-вывода CUDA/Пример внедрения автоматизации: различия между версиями
Перейти к навигации
Перейти к поиску
Alinap95 (обсуждение | вклад) (Новая страница: «<source lang="cpp"> #include <vector> #include <cuda/vector.cuh> #include <cuda/automation> class SomeClass; #ifdef __CUDACC__ class SomeClassCuda; template <…») |
Alinap95 (обсуждение | вклад) |
||
Строка 1: | Строка 1: | ||
<source lang="cpp"> | <source lang="cpp"> | ||
#include <vector> | #include <vector> | ||
template <class T> | template <class T> | ||
cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const std::vector<T>& v) | |||
{ | { | ||
is << v.size(); | |||
for (auto val:v) | |||
is << val; | |||
return is; | return is; | ||
} | } | ||
template <class T> | template <class T> | ||
struct device_vector | |||
{ | { | ||
std::size_t m_count = std::size_t(); | |||
T* m_pElements = nullptr; | |||
public: | |||
typedef T value_type, *pointer, &reference; | |||
using custom_cuda_device_read = std::true_type; | |||
device_vector() = default; | |||
__device__ explicit device_vector(std::size_t c, const T& init = T()) | |||
{ | |||
m_pElements = new T[c]; | |||
if (m_pElements == nullptr) return; | |||
fill(m_pElements, m_pElements + c, init); | |||
m_count = c; | |||
} | |||
__device__ device_vector(const T* data, std::size_t c) | |||
{ | |||
m_pElements = new T[c]; | |||
if (m_pElements == nullptr) return; | |||
copy(data, data + c, m_pElements); | |||
m_count = c; | |||
} | |||
__device__ ~device_vector() | |||
{ | |||
if (m_pElements != nullptr) | |||
delete [] m_pElements; | |||
} | |||
__device__ device_vector(const device_vector& right) {*this = right;} | |||
__device__ device_vector(device_vector&& right) {*this = std::move(right);} | |||
__device__ device_vector& operator=(device_vector&& right) | |||
{ | |||
if (this == &right) | |||
return *this; | |||
if (m_pElements != nullptr) | |||
delete [] m_pElements; | |||
m_pElements = right.m_pElements; | |||
right.m_pElements = nullptr; | |||
m_count = right.m_count; | |||
right.m_count = std::size_t(); | |||
return *this; | |||
} | |||
__device__ device_vector& operator=(const device_vector& right) | |||
{ | |||
if (this == &right) | |||
return *this; | |||
auto data = new T[right.size()]; | |||
memcpy(data, right.data(), c); | |||
return *this = device_vector(data, c); | |||
} | |||
__device__ const T* data() const | |||
{ | |||
return m_pElements; | |||
} | |||
__device__ std::size_t size() const | |||
{ | |||
return m_count; | |||
} | |||
__device__ const T& operator[](std::size_t i) const | |||
{ | |||
return m_pElements[i]; | |||
} | |||
__device__ T& operator[](std::size_t i) | |||
{ | |||
return m_pElements[i]; | |||
} | |||
}; | |||
template <class T> struct is_device_vector:std::false_type {}; | |||
template <class T> struct is_device_vector<device_vector<T>>:std::true_type {}; | |||
class | template <class T> | ||
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<is_device_vector<T>::value, T> | |||
{ | { | ||
using size_type = typename std::vector<typename T::value_type>::size_type; | |||
auto count = read_as<size_type>(is); | |||
auto v = T(count); | |||
for (size_type i = 0; i < count; ++i) | |||
v[i] = read_as<typename T::value_type>(is); | |||
return v; | |||
} | |||
} | |||
template <class T> | template <class T> | ||
__global__ void sum_krnl(cuda_input_buffer buf, T* result) | |||
{ | { | ||
auto iThread = std::size_t(blockIdx.x * blockDim.x + threadIdx.x); | |||
static device_vector<T>* pv; | |||
static bool fStop = false; | |||
if (iThread == 0) | |||
{ | |||
auto is = cuda_input_buffer_stream_d(buf); | |||
pv = new device_vector<T>(read_as<device_vector<T>>(is)); | |||
} | |||
__syncthreads(); | |||
auto& v = *pv; | |||
auto iElement = iThread * 2; | |||
std::size_t iter = 0; | |||
while (!fStop) | |||
{ | |||
auto iNextElement = iElement + (std::size_t(1) << iter++); | |||
if (iNextElement < v.size()) | |||
v[iElement] += v[iNextElement]; | |||
else if (iThread == 0) | |||
fStop = true; | |||
__syncthreads(); | |||
} | |||
__syncthreads(); | |||
if (iThread == 0) | |||
{ | |||
*result = v[iElement]; | |||
delete pv; | |||
} | |||
} | } | ||
#include <iostream> | |||
#include <random> | |||
# | |||
int main() | |||
{ | { | ||
auto sum = [](const auto& v) | |||
{ | |||
cuda_input_buffer_stream os; | |||
return | os << v; | ||
using value_type = typename std::decay_t<decltype(v)>::value_type; | |||
value_type* pResult; | |||
auto err = cudaDeviceReset(); | |||
if (err != cudaSuccess) | |||
throw std::runtime_error("CUDA exception"); | |||
err = cudaMalloc(&pResult, sizeof(value_type)); | |||
if (err != cudaSuccess) | |||
throw std::runtime_error("CUDA exception"); | |||
value_type result; | |||
sum_krnl<value_type><<<1, 1000>>>(std::move(os).get_cuda_buf(), pResult); | |||
err = cudaMemcpy(&result, pResult, sizeof(value_type), cudaMemcpyDeviceToHost); | |||
if (err != cudaSuccess) | |||
throw std::runtime_error("CUDA exception"); | |||
err = cudaDeviceSynchronize(); | |||
if (err != cudaSuccess) | |||
throw std::runtime_error("CUDA exception"); | |||
err = cudaFree(pResult); | |||
if (err != cudaSuccess) | |||
throw std::runtime_error("CUDA exception"); | |||
return result; | |||
}; | |||
std::vector<int> v_int = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; | |||
std::cout << "Average of ints: " << sum(v_int) / v_int.size() << "\n"; | |||
std::minstd_rand rnd; | |||
std::vector<float> v_flt; | |||
for (auto i = 0; i < 1000; ++i) | |||
v_flt.emplace_back(std::generate_canonical<float, 16>(rnd)); | |||
std::cout << "Average of floats: " << sum(v_flt) / v_flt.size() << "\n"; | |||
return 0; | |||
} | } | ||
</source> | </source> |
Текущая версия на 18:24, 23 июня 2019
#include <stdexcept>
#include <string>
#include <vector>
// Serializes a host-side std::vector into the CUDA input buffer stream:
// the element count is written first, followed by every element in order.
// This layout must match what read_as<device_vector<T>> expects on the
// device side.  Returns the stream to allow chaining.
template <class T>
cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const std::vector<T>& v)
{
    is << v.size();
    for (std::size_t i = 0; i != v.size(); ++i)
        is << v[i];
    return is;
}
// Minimal owning dynamic array for CUDA device code (std::vector cannot be
// used inside __device__ functions).  Storage comes from the device heap via
// device-side operator new[].
template <class T>
struct device_vector
{
    std::size_t m_count = std::size_t();  // element count; stays 0 on allocation failure
    T* m_pElements = nullptr;             // owned device-heap buffer
public:
    typedef T value_type, *pointer, &reference;
    // Marker consumed by the automation framework's read_as dispatch
    // (see is_device_vector / read_as below in this file).
    using custom_cuda_device_read = std::true_type;
    device_vector() = default;
    // Allocates c elements, each initialized from init.  Device-side new
    // returns nullptr when the device heap is exhausted; in that case the
    // vector is left empty (m_count == 0) rather than trapping.
    __device__ explicit device_vector(std::size_t c, const T& init = T())
    {
        m_pElements = new T[c];
        if (m_pElements == nullptr) return;
        fill(m_pElements, m_pElements + c, init);
        m_count = c;
    }
    // Deep-copies c elements from data (assumed device-accessible memory).
    __device__ device_vector(const T* data, std::size_t c)
    {
        m_pElements = new T[c];
        if (m_pElements == nullptr) return;
        copy(data, data + c, m_pElements);
        m_count = c;
    }
    __device__ ~device_vector()
    {
        if (m_pElements != nullptr)
            delete [] m_pElements;
    }
    __device__ device_vector(const device_vector& right) {*this = right;}
    __device__ device_vector(device_vector&& right) {*this = std::move(right);}
    // Move assignment: frees our buffer, steals right's, leaves right empty.
    __device__ device_vector& operator=(device_vector&& right)
    {
        if (this == &right)
            return *this;
        if (m_pElements != nullptr)
            delete [] m_pElements;
        m_pElements = right.m_pElements;
        right.m_pElements = nullptr;
        m_count = right.m_count;
        right.m_count = std::size_t();
        return *this;
    }
    // Copy assignment.  BUG FIX: the original body referenced an undeclared
    // variable `c` and leaked a temporary buffer — it memcpy'd into a fresh
    // allocation and then passed that allocation to the (data, count)
    // constructor, which deep-copies AGAIN, so the first buffer was never
    // freed.  Delegating straight to the deep-copying constructor and the
    // move assignment above is equivalent, leak-free, and compiles.
    __device__ device_vector& operator=(const device_vector& right)
    {
        if (this == &right)
            return *this;
        return *this = device_vector(right.data(), right.size());
    }
    __device__ const T* data() const
    {
        return m_pElements;
    }
    __device__ std::size_t size() const
    {
        return m_count;
    }
    __device__ const T& operator[](std::size_t i) const
    {
        return m_pElements[i];
    }
    __device__ T& operator[](std::size_t i)
    {
        return m_pElements[i];
    }
};
// Trait: true exactly when T is a specialization device_vector<U>.
// Used by the SFINAE constraint on read_as below.
template <class T>
struct is_device_vector : std::false_type
{
};

template <class T>
struct is_device_vector<device_vector<T>> : std::true_type
{
};
// Deserializes a device_vector<T::value_type> from a device-side input
// stream.  Wire format (must match the host-side operator<< for
// std::vector): element count first, then each element in order.
// Enabled only for device_vector specializations via is_device_vector.
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<is_device_vector<T>::value, T>
{
    using element_type = typename T::value_type;
    // The count was serialized as std::vector's size_type on the host.
    using size_type = typename std::vector<element_type>::size_type;
    const auto n = read_as<size_type>(is);
    auto result = T(n);
    for (size_type idx = 0; idx != n; ++idx)
        result[idx] = read_as<element_type>(is);
    return result;
}
// Sums every element of a device_vector<T> serialized into `buf` and stores
// the total in *result.
// Preconditions (unchanged from the original sample):
//  - must be launched as a SINGLE block: __syncthreads() is the only
//    barrier used, and the static pointer below is shared grid-wide;
//  - `buf` holds the wire format produced by operator<<(stream, vector).
template <class T>
__global__ void sum_krnl(cuda_input_buffer buf, T* result)
{
    // Promote to std::size_t before multiplying to avoid 32-bit overflow.
    auto iThread = std::size_t(blockIdx.x) * blockDim.x + threadIdx.x;
    // Deserialized once by thread 0, then shared by the whole block.
    static device_vector<T>* pv;
    if (iThread == 0)
    {
        auto is = cuda_input_buffer_stream_d(buf);
        pv = new device_vector<T>(read_as<device_vector<T>>(is));
    }
    __syncthreads();
    auto& v = *pv;
    // Tree reduction.  BUG FIX: the original kept every thread accumulating
    // into the fixed slot 2*iThread with a growing stride, so within one
    // pass a thread read a neighbour's slot while that neighbour was
    // writing it — a data race with no barrier between the conflicting
    // accesses.  Here only every (2*stride)-th slot is written during a
    // pass, and the slots read at distance `stride` are not written by
    // anyone in that same pass.
    for (std::size_t stride = 1; stride < v.size(); stride <<= 1)
    {
        auto i = iThread * (stride * 2);
        if (i + stride < v.size())
            v[i] += v[i + stride];
        __syncthreads();   // loop bound is uniform, so every thread gets here
    }
    if (iThread == 0)
    {
        // v[0] now holds the grand total; guard the empty-vector case.
        *result = v.size() != std::size_t() ? v[0] : T();
        delete pv;
    }
}
#include <iostream>
#include <random>
// Host driver: serializes a std::vector into a CUDA input buffer, launches
// the single-block reduction kernel, and copies the scalar sum back.
int main()
{
    auto sum = [](const auto& v)
    {
        using value_type = typename std::decay_t<decltype(v)>::value_type;
        cuda_input_buffer_stream os;
        os << v;
        // BUG FIX: every CUDA call used to throw a bare "CUDA exception",
        // discarding the diagnostic; include cudaGetErrorString output.
        auto check = [](cudaError_t err)
        {
            if (err != cudaSuccess)
                throw std::runtime_error(std::string("CUDA exception: ") + cudaGetErrorString(err));
        };
        // NOTE(review): resetting the device on every call is heavyweight
        // and destroys all prior allocations; kept for behavioral
        // compatibility with the original sample.
        check(cudaDeviceReset());
        value_type* pResult;
        check(cudaMalloc(&pResult, sizeof(value_type)));
        // Single block by design: the kernel uses __syncthreads() as its
        // only barrier (see sum_krnl's preconditions).
        sum_krnl<value_type><<<1, 1000>>>(std::move(os).get_cuda_buf(), pResult);
        // BUG FIX: kernel launches do not return an error; a bad launch
        // configuration was previously silent.
        check(cudaGetLastError());
        // Synchronize before the copy so in-kernel faults are reported
        // here rather than being folded into the memcpy's status.
        check(cudaDeviceSynchronize());
        value_type result;
        check(cudaMemcpy(&result, pResult, sizeof(value_type), cudaMemcpyDeviceToHost));
        check(cudaFree(pResult));
        return result;
    };
    std::vector<int> v_int = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    // Integer division: 45 / 10 == 4.
    std::cout << "Average of ints: " << sum(v_int) / v_int.size() << "\n";
    std::minstd_rand rnd;   // default-seeded => deterministic sequence
    std::vector<float> v_flt;
    for (auto i = 0; i < 1000; ++i)
        v_flt.emplace_back(std::generate_canonical<float, 16>(rnd));
    std::cout << "Average of floats: " << sum(v_flt) / v_flt.size() << "\n";
    return 0;
}