Автоматизация ввода-вывода CUDA/Пример внедрения автоматизации

Материал из CAMaaS preliminary wiki
Перейти к навигации Перейти к поиску
#include <vector>

// Serializes a std::vector into the host-side input stream.
//
// Layout written to the stream: the element count (v.size(), i.e.
// std::vector<T>::size_type) followed by every element in index order.
// The device-side read_as overload for device_vector in this file reads the
// count with the same size_type, so the two sides must stay in sync.
//
// is: stream to append to.
// v:  vector to serialize; it is not modified.
// Returns `is` so that insertions can be chained.
template <class T>
cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const std::vector<T>& v)
{
	is << v.size();
	// Iterate by const reference: the elements are only forwarded to the
	// stream, so copying each one first is wasted work for non-trivial T.
	for (const auto& val : v)
		is << val;
	return is;
}

// Minimal owning array for use inside device code.
//
// Storage is obtained with device-side new[] and released with delete[] in
// the destructor. Every constructor that allocates leaves the object empty
// (size() == 0, data() == nullptr) when the allocation returns nullptr, so
// callers can detect a failed allocation by comparing size() with the
// requested count.
template <class T>
struct device_vector
{
	// Number of valid elements; only set after a successful allocation.
	std::size_t m_count = std::size_t();
	// Owned element storage, or nullptr when nothing is allocated.
	T* m_pElements = nullptr;
public:
	typedef T value_type, *pointer, &reference;
	// NOTE(review): presumably a marker consumed by the I/O framework to select
	// a custom device-side read routine for this type; its consumer is not
	// visible in this listing, so confirm.
	using custom_cuda_device_read = std::true_type;
	device_vector() = default;
	// Creates `c` elements, each assigned from `init`.
	__device__ explicit device_vector(std::size_t c, const T& init = T())
	{
		m_pElements = new T[c];
		if (m_pElements == nullptr) return;
		// Explicit loop: keeps the container self-contained instead of relying
		// on an unqualified fill() helper that is not declared in this file.
		for (std::size_t i = 0; i < c; ++i)
			m_pElements[i] = init;
		m_count = c;
	}
	// Creates a copy of the `c` elements starting at `data`.
	__device__ device_vector(const T* data, std::size_t c)
	{
		m_pElements = new T[c];
		if (m_pElements == nullptr) return;
		// Element-wise assignment (not memcpy) so non-trivial T is copied correctly.
		for (std::size_t i = 0; i < c; ++i)
			m_pElements[i] = data[i];
		m_count = c;
	}
	__device__ ~device_vector()
	{
		if (m_pElements != nullptr)
			delete [] m_pElements;
	}
	// Both constructors start from the default member initializers (empty
	// vector) and then delegate to the matching assignment operator.
	__device__ device_vector(const device_vector& right) {*this = right;}
	__device__ device_vector(device_vector&& right) {*this = std::move(right);}
	// Takes over the storage of `right` and leaves `right` empty.
	__device__ device_vector& operator=(device_vector&& right)
	{
		if (this == &right)
			return *this;
		if (m_pElements != nullptr)
			delete [] m_pElements;
		m_pElements = right.m_pElements;
		right.m_pElements = nullptr;
		m_count = right.m_count;
		right.m_count = std::size_t();
		return *this;
	}
	// Replaces the contents with a deep copy of `right`.
	// If the copy cannot be allocated, *this ends up empty.
	__device__ device_vector& operator=(const device_vector& right)
	{
		if (this == &right)
			return *this;
		// Build the copy in a temporary and move it in. The temporary owns the
		// new buffer, so nothing leaks, and the old storage is released by the
		// move assignment.
		return *this = device_vector(right.data(), right.size());
	}
	// Pointer to the first element (nullptr when nothing is allocated).
	__device__ const T* data() const
	{
		return m_pElements;
	}
	// Number of elements.
	__device__ std::size_t size() const
	{
		return m_count;
	}
	// Unchecked element access; `i` must be less than size().
	__device__ const T& operator[](std::size_t i) const
	{
		return m_pElements[i];
	}
	// Unchecked element access; `i` must be less than size().
	__device__ T& operator[](std::size_t i)
	{
		return m_pElements[i];
	}
};

// Type trait: `value` is true exactly when T is a specialization of
// device_vector, and false for every other type.
template <class T>
struct is_device_vector : std::integral_constant<bool, false>
{
};

// Partial specialization matching device_vector<T> for any element type T.
template <class T>
struct is_device_vector<device_vector<T>> : std::integral_constant<bool, true>
{
};

// Reads a device_vector from the device-side input stream.
//
// Expected stream layout: the element count (read as
// std::vector<value_type>::size_type, matching what the host-side operator<<
// for std::vector writes) followed by that many elements.
//
// This overload participates only when T is a device_vector specialization.
//
// is: stream to read from; it is always advanced past the whole serialized
//     vector, even when the result cannot be stored.
// Returns the vector that was read. If its storage could not be allocated the
// returned vector is empty (size() == 0).
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<is_device_vector<T>::value, T>
{
	using value_type = typename T::value_type;
	using size_type = typename std::vector<value_type>::size_type;
	const auto count = read_as<size_type>(is);
	auto v = T(count);
	// device_vector keeps size() == 0 when its allocation fails; writing
	// through operator[] would then dereference a null pointer.
	const bool fAllocated = v.size() == count;
	for (size_type i = 0; i < count; ++i)
	{
		if (fAllocated)
			v[i] = read_as<value_type>(is);
		else
			(void)read_as<value_type>(is);	// still consume the element to keep the stream position consistent
	}
	return v;
}

// Sums the elements of a vector that was serialized into `buf`.
//
// buf:    input buffer holding one vector in the format understood by
//         read_as<device_vector<T>> (element count followed by the elements).
// result: device pointer receiving the sum. T() is stored when the vector is
//         empty or could not be allocated on the device.
//
// Launch configuration: the reduction is performed by block 0 only, because
// __syncthreads() synchronizes a single block; any additional blocks exit
// immediately. Any block size works: each thread strides over the slots that
// have to be combined in a pass.
//
// T must support operator+= and value-initialization T().
template <class T>
__global__ void sum_krnl(cuda_input_buffer buf, T* result)
{
	// One pointer per block, re-initialized on every launch. A function-scope
	// `static` would keep its value from a previous launch of the kernel.
	__shared__ device_vector<T>* pv;
	if (blockIdx.x != 0)
		return;	// uniform for the whole block, so no barrier is skipped by only part of it
	const auto iThread = std::size_t(threadIdx.x);
	const auto cThreads = std::size_t(blockDim.x);
	if (iThread == 0)
	{
		auto is = cuda_input_buffer_stream_d(buf);
		pv = new device_vector<T>(read_as<device_vector<T>>(is));
	}
	__syncthreads();	// publish pv (and the vector contents) to the whole block
	if (pv == nullptr)
	{
		// Allocation of the vector object failed; every thread sees the same pv.
		if (iThread == 0)
			*result = T();
		return;
	}
	auto& v = *pv;
	const auto count = v.size();
	// Tree reduction: in the pass with distance `stride`, every slot whose index
	// is a multiple of 2*stride accumulates the slot `stride` positions to its
	// right. Written slots (multiples of 2*stride) and read slots (odd multiples
	// of stride) never overlap within a pass, so there is no data race.
	// The loop bounds depend only on `count`, so all threads execute the same
	// number of passes and reach every __syncthreads().
	for (std::size_t stride = 1; stride < count; stride *= 2)
	{
		const auto step = stride * 2;
		for (auto iElement = iThread * step; iElement + stride < count; iElement += cThreads * step)
			v[iElement] += v[iElement + stride];
		__syncthreads();
	}
	if (iThread == 0)
	{
		*result = count != 0 ? v[0] : T();
		delete pv;
	}
}

#include <exception>
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>

// Demo driver: sums a vector of ints and a vector of floats on the GPU with
// sum_krnl and prints the averages.
// Returns 0 on success and 1 if a CUDA runtime call fails.
int main()
{
	// Converts a failed CUDA runtime call into an exception that carries the
	// runtime's description of the error.
	auto check = [](cudaError_t err)
	{
		if (err != cudaSuccess)
			throw std::runtime_error(std::string("CUDA exception: ") + cudaGetErrorString(err));
	};
	// Serializes `v`, runs sum_krnl on it and returns the sum of its elements.
	auto sum = [&check](const auto& v)
	{
		using value_type = typename std::decay_t<decltype(v)>::value_type;
		cuda_input_buffer_stream os;
		os << v;
		// NOTE(review): the reset happens after the stream has been filled, as in
		// the original listing; this presumes the stream does not own device
		// memory before get_cuda_buf() is called -- confirm.
		check(cudaDeviceReset());
		value_type* pResult = nullptr;
		check(cudaMalloc(&pResult, sizeof(value_type)));
		sum_krnl<value_type><<<1, 1000>>>(std::move(os).get_cuda_buf(), pResult);
		// A kernel launch returns no status itself: launch errors are reported by
		// cudaGetLastError(), execution errors by the following synchronization.
		check(cudaGetLastError());
		check(cudaDeviceSynchronize());
		value_type result;
		check(cudaMemcpy(&result, pResult, sizeof(value_type), cudaMemcpyDeviceToHost));
		check(cudaFree(pResult));
		return result;
	};
	// Average in floating point: dividing an int sum by size() (an unsigned
	// integer) would truncate the result and mis-handle negative sums.
	auto average = [&sum](const auto& v)
	{
		return static_cast<double>(sum(v)) / static_cast<double>(v.size());
	};
	try
	{
		std::vector<int> v_int = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
		std::cout << "Average of ints: " << average(v_int) << "\n";
		std::minstd_rand rnd;
		std::vector<float> v_flt;
		for (auto i = 0; i < 1000; ++i)
			v_flt.emplace_back(std::generate_canonical<float, 16>(rnd));
		std::cout << "Average of floats: " << average(v_flt) << "\n";
	}
	catch (const std::exception& e)
	{
		// Report the failure instead of letting the exception terminate the program.
		std::cerr << e.what() << "\n";
		return 1;
	}
	return 0;
}