Автоматизация ввода-вывода CUDA/Пример внедрения автоматизации: различия между версиями

Материал из CAMaaS preliminary wiki
Перейти к навигации Перейти к поиску
(Новая страница: «<source lang="cpp"> #include <vector> #include <cuda/vector.cuh> #include <cuda/automation> class SomeClass; #ifdef __CUDACC__ class SomeClassCuda; template <…»)
 
 
Строка 1: Строка 1:
<source lang="cpp">
<source lang="cpp">
#include <vector>
#include <vector>
#include <cuda/vector.cuh>
#include <cuda/automation>


class SomeClass;
#ifdef __CUDACC__
class SomeClassCuda;
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is)->std::enable_if_t<std::is_same<SomeClassCuda, T>::value, T>;
template <class T>
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is)->std::enable_if_t<std::is_same<ComplexObject, T>::value, T>;
cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const std::vector<T>& v)
 
#endif  //__CUDACC__
 
struct ComplexObject
{
{
int first;
is << v.size();
int second;
for (auto val:v)
#ifdef __CUDACC__
is << val;
template <class T>
friend __device__ auto read_as(cuda_input_buffer_stream_d& is)->std::enable_if_t<std::is_same<ComplexObject, T>::value, T>;
friend cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const ComplexObject& obj);
#endif  //__CUDACC__
};
 
#ifdef __CUDACC__
 
__host__ cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const ComplexObject& obj)
{
is << first;
is << second;
return is;
return is;
}
}


template <class T>
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<std::is_same<ComplexObject, T>::value, T>
struct device_vector
{
{
ComplexObject result;
std::size_t m_count = std::size_t();
T* m_pElements = nullptr;
public:
typedef T value_type, *pointer, &reference;
using custom_cuda_device_read = std::true_type;
device_vector() = default;
__device__ explicit device_vector(std::size_t c, const T& init = T())
{
m_pElements = new T[c];
if (m_pElements == nullptr) return;
fill(m_pElements, m_pElements + c, init);
m_count = c;
}
__device__ device_vector(const T* data, std::size_t c)
{
m_pElements = new T[c];
if (m_pElements == nullptr) return;
copy(data, data + c, m_pElements);
m_count = c;
}
__device__ ~device_vector()
{
if (m_pElements != nullptr)
delete [] m_pElements;
}
__device__ device_vector(const device_vector& right) {*this = right;}
__device__ device_vector(device_vector&& right) {*this = std::move(right);}
__device__ device_vector& operator=(device_vector&& right)
{
if (this == &right)
return *this;
if (m_pElements != nullptr)
delete [] m_pElements;
m_pElements = right.m_pElements;
right.m_pElements = nullptr;
m_count = right.m_count;
right.m_count = std::size_t();
return *this;
}
__device__ device_vector& operator=(const device_vector& right)
{
if (this == &right)
return *this;
auto data = new T[right.size()];
memcpy(data, right.data(), c);
return *this = device_vector(data, c);
}
__device__ const T* data() const
{
return m_pElements;
}
__device__ std::size_t size() const
{
return m_count;
}
__device__ const T& operator[](std::size_t i) const
{
return m_pElements[i];
}
__device__ T& operator[](std::size_t i)
{
return m_pElements[i];
}
};


result.first = read_as<int>(is);
template <class T> struct is_device_vector:std::false_type {};
result.first = read_as<int>(is);
template <class T> struct is_device_vector<device_vector<T>>:std::true_type {};
return result;
}


class SomeClassCuda
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<is_device_vector<T>::value, T>
{
{
cu::vector<ComplexObject> container;
using size_type = typename std::vector<typename T::value_type>::size_type;
public:
auto count = read_as<size_type>(is);
 
auto v = T(count);
typedef std::true_type custom_cuda_device_read;
for (size_type i = 0; i < count; ++i)
v[i] = read_as<typename T::value_type>(is);
template <class T>
return v;
friend __device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<std::is_same<SomeClassCuda, T>::value, T>;
}
};


template <class T>
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<std::is_same<SomeClassCuda, T>::value, T>
__global__ void sum_krnl(cuda_input_buffer buf, T* result)
{
{
SomeClassCuda result;
auto iThread = std::size_t(blockIdx.x * blockDim.x + threadIdx.x);
 
static device_vector<T>* pv;
std::size_t containerSize = read_as<std::size_t>(is);
static bool fStop = false;
for (std::size_t i = 0; i < containerSize; ++i)
if (iThread == 0)
result.container.emplace_back(read_as<ComplexObject>(is));
{
auto is = cuda_input_buffer_stream_d(buf);
return result;
pv = new device_vector<T>(read_as<device_vector<T>>(is));
}
__syncthreads();
auto& v = *pv;
auto iElement = iThread * 2;
std::size_t iter = 0;
while (!fStop)
{
auto iNextElement = iElement + (std::size_t(1) << iter++);
if (iNextElement < v.size())
v[iElement] += v[iNextElement];
else if (iThread == 0)
fStop = true;
__syncthreads();
}
__syncthreads();
if (iThread == 0)
{
*result = v[iElement];
delete pv;
}
}
}
#endif //__CUDACC__


class SomeClass
#include <iostream>
{
#include <random>
std::vector<ComplexObject> container;
#ifdef __CUDACC__
friend cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const SomeClass& obj);
#endif //__CUDACC__
};


#ifdef __CUDACC__
int main()
__host__ cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const SomeClass& obj)
{
{
is << obj.container.size();
auto sum = [](const auto& v)
for (std::size_t i = 0; i < obj.container.size(); ++i)
{
is << obj.container[i];
cuda_input_buffer_stream os;
return is;
os << v;
using value_type = typename std::decay_t<decltype(v)>::value_type;
value_type* pResult;
auto err = cudaDeviceReset();
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
err = cudaMalloc(&pResult, sizeof(value_type));
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
value_type result;
sum_krnl<value_type><<<1, 1000>>>(std::move(os).get_cuda_buf(), pResult);
err = cudaMemcpy(&result, pResult, sizeof(value_type), cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
err = cudaFree(pResult);
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
return result;
};
std::vector<int> v_int = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
std::cout << "Average of ints: " << sum(v_int) / v_int.size() << "\n";
std::minstd_rand rnd;
std::vector<float> v_flt;
for (auto i = 0; i < 1000; ++i)
v_flt.emplace_back(std::generate_canonical<float, 16>(rnd));
std::cout << "Average of floats: " << sum(v_flt) / v_flt.size() << "\n";
return 0;
}
}
#endif //__CUDACC__
</source>
</source>

Текущая версия на 18:24, 23 июня 2019

#include <vector>

// Serializes a host std::vector into a CUDA input-buffer stream:
// the element count goes first, followed by every element in order,
// so the device-side read_as<device_vector<T>> can mirror the layout.
template <class T>
cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const std::vector<T>& v)
{
	const auto count = v.size();
	is << count;
	for (typename std::vector<T>::size_type i = 0; i != count; ++i)
		is << v[i];
	return is;
}

// Minimal device-side dynamic array with value semantics; serves as the
// deserialization target for read_as<device_vector<T>> below.
// NOTE(review): `fill` and `copy` are called unqualified — presumably
// device-usable helpers supplied by <cuda/vector.cuh>; confirm they are
// visible in device code (std::fill/std::copy are host-only pre-libcu++).
template <class T>
struct device_vector
{
	std::size_t m_count = std::size_t();
	T* m_pElements = nullptr;
public:
	typedef T value_type, *pointer, &reference;
	// Opt-in tag consumed by the automation framework's read_as machinery.
	using custom_cuda_device_read = std::true_type;
	device_vector() = default;
	// Allocates c elements initialized to `init`. Device-side `new[]`
	// returns nullptr on failure, in which case the vector stays empty.
	__device__ explicit device_vector(std::size_t c, const T& init = T())
	{
		m_pElements = new T[c];
		if (m_pElements == nullptr) return;
		fill(m_pElements, m_pElements + c, init);
		m_count = c;
	}
	// Copies c elements from `data`; stays empty on allocation failure.
	__device__ device_vector(const T* data, std::size_t c)
	{
		m_pElements = new T[c];
		if (m_pElements == nullptr) return;
		copy(data, data + c, m_pElements);
		m_count = c;
	}
	__device__ ~device_vector()
	{
		if (m_pElements != nullptr)
			delete [] m_pElements;
	}
	// Copy/move construction delegate to the corresponding assignments.
	__device__ device_vector(const device_vector& right) {*this = right;}
	__device__ device_vector(device_vector&& right) {*this = std::move(right);}
	__device__ device_vector& operator=(device_vector&& right)
	{
		if (this == &right)
			return *this;
		if (m_pElements != nullptr)
			delete [] m_pElements;
		m_pElements = right.m_pElements;
		right.m_pElements = nullptr;
		m_count = right.m_count;
		right.m_count = std::size_t();
		return *this;
	}
	__device__ device_vector& operator=(const device_vector& right)
	{
		if (this == &right)
			return *this;
		// BUGFIX: the previous version referenced an undeclared variable `c`,
		// passed an element count (not byte count) to memcpy, and leaked the
		// freshly allocated buffer because the (data, count) constructor
		// copies again. Build a copy via that constructor and move-assign it.
		return *this = device_vector(right.data(), right.size());
	}
	__device__ const T* data() const
	{
		return m_pElements;
	}
	__device__ std::size_t size() const
	{
		return m_count;
	}
	__device__ const T& operator[](std::size_t i) const
	{
		return m_pElements[i];
	}
	__device__ T& operator[](std::size_t i)
	{
		return m_pElements[i];
	}
};

// Trait: true exactly when T is an instantiation of device_vector.
// Gates the device_vector overload of read_as via enable_if.
template <class> struct is_device_vector : std::integral_constant<bool, false> {};
template <class U> struct is_device_vector<device_vector<U>> : std::integral_constant<bool, true> {};

// Deserializes a device_vector from the device-side input stream, mirroring
// the host operator<< layout: element count first, then each element read
// back through read_as<value_type>.
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<is_device_vector<T>::value, T>
{
	// std::vector's size_type is used so the count is read back with the
	// exact type the host serialized (v.size()).
	using size_type = typename std::vector<typename T::value_type>::size_type;
	const auto n = read_as<size_type>(is);
	T result(n);
	for (size_type idx = 0; idx != n; ++idx)
		result[idx] = read_as<typename T::value_type>(is);
	return result;
}

// Single-block tree reduction: deserializes a device_vector<T> from `buf`
// and stores the sum of its elements in *result.
// Expected launch: <<<1, N>>> with exactly one block — the kernel relies on
// __syncthreads() and kernel-scope `static` storage, neither of which
// coordinates across blocks.
template <class T>
__global__ void sum_krnl(cuda_input_buffer buf, T* result)
{
	// Flat global thread index (equals threadIdx.x for a single-block grid).
	auto iThread = std::size_t(blockIdx.x * blockDim.x + threadIdx.x);
	// Grid-lifetime state shared by all threads of the (single) block.
	static device_vector<T>* pv;
	static bool fStop = false;
	if (iThread == 0)
	{
		// Thread 0 alone deserializes the whole vector from the buffer.
		auto is = cuda_input_buffer_stream_d(buf);
		pv = new device_vector<T>(read_as<device_vector<T>>(is));
	}
	__syncthreads();	// publish pv to the rest of the block before use
	auto& v = *pv;
	// Each thread owns slot 2*iThread and accumulates neighbours into it at
	// doubling offsets (1, 2, 4, ...); the total is intended to end up in
	// v[0], thread 0's slot.
	auto iElement = iThread * 2;
	std::size_t iter = 0;
	while (!fStop)
	{
		auto iNextElement = iElement + (std::size_t(1) << iter++);
		// NOTE(review): within one iteration a thread may read v[iNextElement]
		// while another thread writes that same slot (no barrier between the
		// read and the write). Correctness appears to depend on lockstep
		// read-before-write ordering, which Volta+ independent thread
		// scheduling does not guarantee — verify with compute-sanitizer
		// racecheck.
		if (iNextElement < v.size())
			v[iElement] += v[iNextElement];
		else if (iThread == 0)
			fStop = true;
		// The barrier also fences fStop, so every thread observes the same
		// value at the next loop test and all threads exit together.
		__syncthreads();
		
	}
	__syncthreads();
	if (iThread == 0)
	{
		// Thread 0's iElement is 0, so this reads the accumulated total,
		// then releases the device-side vector allocated above.
		*result = v[iElement];
		delete pv;
	}
}

#include <iostream>
#include <random>

// Demonstrates the CUDA I/O automation: serializes host vectors into a
// cuda_input_buffer, runs the single-block reduction kernel over them, and
// prints the average of each vector's elements.
int main()
{
	// Sums the elements of a host std::vector on the GPU; throws
	// std::runtime_error on any CUDA failure.
	auto sum = [](const auto& v)
	{
		cuda_input_buffer_stream os;
		os << v;	// count + elements, matching the device-side read_as layout
		using value_type = typename std::decay_t<decltype(v)>::value_type;
		value_type* pResult;
		// NOTE(review): resetting the device on every call destroys all prior
		// context state; tolerable in this self-contained example only.
		auto err = cudaDeviceReset();
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		err = cudaMalloc(&pResult, sizeof(value_type));
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		value_type result;
		// One block of 1000 threads: sum_krnl uses __syncthreads(), which
		// only synchronizes within a single block.
		sum_krnl<value_type><<<1, 1000>>>(std::move(os).get_cuda_buf(), pResult);
		// BUGFIX: a kernel launch reports nothing directly — check the launch
		// configuration via cudaGetLastError(), then synchronize to surface
		// asynchronous execution errors BEFORE copying the result back
		// (previously the sync ran after the memcpy, too late to matter).
		err = cudaGetLastError();
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		err = cudaDeviceSynchronize();
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		err = cudaMemcpy(&result, pResult, sizeof(value_type), cudaMemcpyDeviceToHost);
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		err = cudaFree(pResult);
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		return result;
	};
	std::vector<int> v_int = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
	// Integer division: 45 / 10 prints 4 — intentional for this demo.
	std::cout << "Average of ints: " << sum(v_int) / v_int.size() << "\n";
	std::minstd_rand rnd;
	std::vector<float> v_flt;
	for (auto i = 0; i < 1000; ++i)
		v_flt.emplace_back(std::generate_canonical<float, 16>(rnd));
	std::cout << "Average of floats: " << sum(v_flt) / v_flt.size() << "\n";
	return 0;
}