Автоматизация ввода-вывода CUDA/Пример внедрения автоматизации: различия между версиями

Материал из CAMaaS preliminary wiki
Перейти к навигации Перейти к поиску
(Новая страница: «<source lang="cpp"> #include <vector> #include <cuda/vector.cuh> #include <cuda/automation> class SomeClass; #ifdef __CUDACC__ class SomeClassCuda; template <…»)
 
 
Строка 1: Строка 1:
<source lang="cpp">
<source lang="cpp">
#include <vector>
#include <vector>
#include <cuda/vector.cuh>
#include <cuda/automation>


class SomeClass;
#ifdef __CUDACC__
class SomeClassCuda;
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is)->std::enable_if_t<std::is_same<SomeClassCuda, T>::value, T>;
template <class T>
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is)->std::enable_if_t<std::is_same<ComplexObject, T>::value, T>;
cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const std::vector<T>& v)
 
#endif  //__CUDACC__
 
struct ComplexObject
{
{
int first;
is << v.size();
int second;
for (auto val:v)
#ifdef __CUDACC__
is << val;
template <class T>
friend __device__ auto read_as(cuda_input_buffer_stream_d& is)->std::enable_if_t<std::is_same<ComplexObject, T>::value, T>;
friend cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const ComplexObject& obj);
#endif  //__CUDACC__
};
 
#ifdef __CUDACC__
 
__host__ cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const ComplexObject& obj)
{
is << first;
is << second;
return is;
return is;
}
}


template <class T>
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<std::is_same<ComplexObject, T>::value, T>
struct device_vector
{
{
ComplexObject result;
std::size_t m_count = std::size_t();
T* m_pElements = nullptr;
public:
typedef T value_type, *pointer, &reference;
using custom_cuda_device_read = std::true_type;
device_vector() = default;
__device__ explicit device_vector(std::size_t c, const T& init = T())
{
m_pElements = new T[c];
if (m_pElements == nullptr) return;
fill(m_pElements, m_pElements + c, init);
m_count = c;
}
__device__ device_vector(const T* data, std::size_t c)
{
m_pElements = new T[c];
if (m_pElements == nullptr) return;
copy(data, data + c, m_pElements);
m_count = c;
}
__device__ ~device_vector()
{
if (m_pElements != nullptr)
delete [] m_pElements;
}
__device__ device_vector(const device_vector& right) {*this = right;}
__device__ device_vector(device_vector&& right) {*this = std::move(right);}
__device__ device_vector& operator=(device_vector&& right)
{
if (this == &right)
return *this;
if (m_pElements != nullptr)
delete [] m_pElements;
m_pElements = right.m_pElements;
right.m_pElements = nullptr;
m_count = right.m_count;
right.m_count = std::size_t();
return *this;
}
__device__ device_vector& operator=(const device_vector& right)
{
if (this == &right)
return *this;
auto data = new T[right.size()];
memcpy(data, right.data(), c);
return *this = device_vector(data, c);
}
__device__ const T* data() const
{
return m_pElements;
}
__device__ std::size_t size() const
{
return m_count;
}
__device__ const T& operator[](std::size_t i) const
{
return m_pElements[i];
}
__device__ T& operator[](std::size_t i)
{
return m_pElements[i];
}
};


result.first = read_as<int>(is);
template <class T> struct is_device_vector:std::false_type {};
result.first = read_as<int>(is);
template <class T> struct is_device_vector<device_vector<T>>:std::true_type {};
return result;
}


class SomeClassCuda
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<is_device_vector<T>::value, T>
{
{
cu::vector<ComplexObject> container;
using size_type = typename std::vector<typename T::value_type>::size_type;
public:
auto count = read_as<size_type>(is);
 
auto v = T(count);
typedef std::true_type custom_cuda_device_read;
for (size_type i = 0; i < count; ++i)
v[i] = read_as<typename T::value_type>(is);
template <class T>
return v;
friend __device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<std::is_same<SomeClassCuda, T>::value, T>;
}
};


template <class T>
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<std::is_same<SomeClassCuda, T>::value, T>
__global__ void sum_krnl(cuda_input_buffer buf, T* result)
{
{
SomeClassCuda result;
auto iThread = std::size_t(blockIdx.x * blockDim.x + threadIdx.x);
 
static device_vector<T>* pv;
std::size_t containerSize = read_as<std::size_t>(is);
static bool fStop = false;
for (std::size_t i = 0; i < containerSize; ++i)
if (iThread == 0)
result.container.emplace_back(read_as<ComplexObject>(is));
{
auto is = cuda_input_buffer_stream_d(buf);
return result;
pv = new device_vector<T>(read_as<device_vector<T>>(is));
}
__syncthreads();
auto& v = *pv;
auto iElement = iThread * 2;
std::size_t iter = 0;
while (!fStop)
{
auto iNextElement = iElement + (std::size_t(1) << iter++);
if (iNextElement < v.size())
v[iElement] += v[iNextElement];
else if (iThread == 0)
fStop = true;
__syncthreads();
}
__syncthreads();
if (iThread == 0)
{
*result = v[iElement];
delete pv;
}
}
}
#endif //__CUDACC__


class SomeClass
#include <iostream>
{
#include <random>
std::vector<ComplexObject> container;
#ifdef __CUDACC__
friend cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const SomeClass& obj);
#endif //__CUDACC__
};


#ifdef __CUDACC__
int main()
__host__ cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const SomeClass& obj)
{
{
is << obj.container.size();
auto sum = [](const auto& v)
for (std::size_t i = 0; i < obj.container.size(); ++i)
{
is << obj.container[i];
cuda_input_buffer_stream os;
return is;
os << v;
using value_type = typename std::decay_t<decltype(v)>::value_type;
value_type* pResult;
auto err = cudaDeviceReset();
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
err = cudaMalloc(&pResult, sizeof(value_type));
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
value_type result;
sum_krnl<value_type><<<1, 1000>>>(std::move(os).get_cuda_buf(), pResult);
err = cudaMemcpy(&result, pResult, sizeof(value_type), cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
err = cudaFree(pResult);
if (err != cudaSuccess)
throw std::runtime_error("CUDA exception");
return result;
};
std::vector<int> v_int = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
std::cout << "Average of ints: " << sum(v_int) / v_int.size() << "\n";
std::minstd_rand rnd;
std::vector<float> v_flt;
for (auto i = 0; i < 1000; ++i)
v_flt.emplace_back(std::generate_canonical<float, 16>(rnd));
std::cout << "Average of floats: " << sum(v_flt) / v_flt.size() << "\n";
return 0;
}
}
#endif //__CUDACC__
</source>
</source>

Текущая версия на 18:24, 23 июня 2019

#include <vector>

// Serializes a host std::vector into a CUDA input-buffer stream:
// the element count goes first, followed by every element in order,
// so the device-side read_as<device_vector<T>> can mirror the layout.
template <class T>
cuda_input_buffer_stream& operator<<(cuda_input_buffer_stream& is, const std::vector<T>& v)
{
	const auto count = v.size();
	is << count;
	for (typename std::vector<T>::size_type i = 0; i != count; ++i)
		is << v[i];
	return is;
}

// Minimal device-side dynamic array with value semantics; serves as the
// deserialization target for read_as<device_vector<T>> below.
// NOTE(review): `fill` and `copy` are called unqualified — presumably
// device-usable helpers supplied by <cuda/vector.cuh>; confirm they are
// visible in device code (std::fill/std::copy are host-only pre-libcu++).
template <class T>
struct device_vector
{
	std::size_t m_count = std::size_t();
	T* m_pElements = nullptr;
public:
	typedef T value_type, *pointer, &reference;
	// Opt-in tag consumed by the automation framework's read_as machinery.
	using custom_cuda_device_read = std::true_type;
	device_vector() = default;
	// Allocates c elements initialized to `init`. Device-side `new[]`
	// returns nullptr on failure, in which case the vector stays empty.
	__device__ explicit device_vector(std::size_t c, const T& init = T())
	{
		m_pElements = new T[c];
		if (m_pElements == nullptr) return;
		fill(m_pElements, m_pElements + c, init);
		m_count = c;
	}
	// Copies c elements from `data`; stays empty on allocation failure.
	__device__ device_vector(const T* data, std::size_t c)
	{
		m_pElements = new T[c];
		if (m_pElements == nullptr) return;
		copy(data, data + c, m_pElements);
		m_count = c;
	}
	__device__ ~device_vector()
	{
		if (m_pElements != nullptr)
			delete [] m_pElements;
	}
	// Copy/move construction delegate to the corresponding assignments.
	__device__ device_vector(const device_vector& right) {*this = right;}
	__device__ device_vector(device_vector&& right) {*this = std::move(right);}
	__device__ device_vector& operator=(device_vector&& right)
	{
		if (this == &right)
			return *this;
		if (m_pElements != nullptr)
			delete [] m_pElements;
		m_pElements = right.m_pElements;
		right.m_pElements = nullptr;
		m_count = right.m_count;
		right.m_count = std::size_t();
		return *this;
	}
	__device__ device_vector& operator=(const device_vector& right)
	{
		if (this == &right)
			return *this;
		// BUGFIX: the previous version referenced an undeclared variable `c`,
		// passed an element count (not byte count) to memcpy, and leaked the
		// freshly allocated buffer because the (data, count) constructor
		// copies again. Build a copy via that constructor and move-assign it.
		return *this = device_vector(right.data(), right.size());
	}
	__device__ const T* data() const
	{
		return m_pElements;
	}
	__device__ std::size_t size() const
	{
		return m_count;
	}
	__device__ const T& operator[](std::size_t i) const
	{
		return m_pElements[i];
	}
	__device__ T& operator[](std::size_t i)
	{
		return m_pElements[i];
	}
};

// Trait: true exactly when T is an instantiation of device_vector.
// Gates the device_vector overload of read_as via enable_if.
template <class> struct is_device_vector : std::integral_constant<bool, false> {};
template <class U> struct is_device_vector<device_vector<U>> : std::integral_constant<bool, true> {};

// Deserializes a device_vector from the device-side input stream, mirroring
// the host operator<< layout: element count first, then each element read
// back through read_as<value_type>.
template <class T>
__device__ auto read_as(cuda_input_buffer_stream_d& is) -> std::enable_if_t<is_device_vector<T>::value, T>
{
	// std::vector's size_type is used so the count is read back with the
	// exact type the host serialized (v.size()).
	using size_type = typename std::vector<typename T::value_type>::size_type;
	const auto n = read_as<size_type>(is);
	T result(n);
	for (size_type idx = 0; idx != n; ++idx)
		result[idx] = read_as<typename T::value_type>(is);
	return result;
}

// Single-block tree reduction: deserializes a device_vector<T> from `buf`
// and stores the sum of its elements in *result.
// Expected launch: <<<1, N>>> with exactly one block — the kernel relies on
// __syncthreads() and kernel-scope `static` storage, neither of which
// coordinates across blocks.
template <class T>
__global__ void sum_krnl(cuda_input_buffer buf, T* result)
{
	// Flat global thread index (equals threadIdx.x for a single-block grid).
	auto iThread = std::size_t(blockIdx.x * blockDim.x + threadIdx.x);
	// Grid-lifetime state shared by all threads of the (single) block.
	static device_vector<T>* pv;
	static bool fStop = false;
	if (iThread == 0)
	{
		// Thread 0 alone deserializes the whole vector from the buffer.
		auto is = cuda_input_buffer_stream_d(buf);
		pv = new device_vector<T>(read_as<device_vector<T>>(is));
	}
	__syncthreads();	// publish pv to the rest of the block before use
	auto& v = *pv;
	// Each thread owns slot 2*iThread and accumulates neighbours into it at
	// doubling offsets (1, 2, 4, ...); the total is intended to end up in
	// v[0], thread 0's slot.
	auto iElement = iThread * 2;
	std::size_t iter = 0;
	while (!fStop)
	{
		auto iNextElement = iElement + (std::size_t(1) << iter++);
		// NOTE(review): within one iteration a thread may read v[iNextElement]
		// while another thread writes that same slot (no barrier between the
		// read and the write). Correctness appears to depend on lockstep
		// read-before-write ordering, which Volta+ independent thread
		// scheduling does not guarantee — verify with compute-sanitizer
		// racecheck.
		if (iNextElement < v.size())
			v[iElement] += v[iNextElement];
		else if (iThread == 0)
			fStop = true;
		// The barrier also fences fStop, so every thread observes the same
		// value at the next loop test and all threads exit together.
		__syncthreads();
		
	}
	__syncthreads();
	if (iThread == 0)
	{
		// Thread 0's iElement is 0, so this reads the accumulated total,
		// then releases the device-side vector allocated above.
		*result = v[iElement];
		delete pv;
	}
}

#include <iostream>
#include <random>

// Demonstrates the CUDA I/O automation: serializes host vectors into a
// cuda_input_buffer, runs the single-block reduction kernel over them, and
// prints the average of each vector's elements.
int main()
{
	// Sums the elements of a host std::vector on the GPU; throws
	// std::runtime_error on any CUDA failure.
	auto sum = [](const auto& v)
	{
		cuda_input_buffer_stream os;
		os << v;	// count + elements, matching the device-side read_as layout
		using value_type = typename std::decay_t<decltype(v)>::value_type;
		value_type* pResult;
		// NOTE(review): resetting the device on every call destroys all prior
		// context state; tolerable in this self-contained example only.
		auto err = cudaDeviceReset();
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		err = cudaMalloc(&pResult, sizeof(value_type));
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		value_type result;
		// One block of 1000 threads: sum_krnl uses __syncthreads(), which
		// only synchronizes within a single block.
		sum_krnl<value_type><<<1, 1000>>>(std::move(os).get_cuda_buf(), pResult);
		// BUGFIX: a kernel launch reports nothing directly — check the launch
		// configuration via cudaGetLastError(), then synchronize to surface
		// asynchronous execution errors BEFORE copying the result back
		// (previously the sync ran after the memcpy, too late to matter).
		err = cudaGetLastError();
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		err = cudaDeviceSynchronize();
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		err = cudaMemcpy(&result, pResult, sizeof(value_type), cudaMemcpyDeviceToHost);
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		err = cudaFree(pResult);
		if (err != cudaSuccess)
			throw std::runtime_error("CUDA exception");
		return result;
	};
	std::vector<int> v_int = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
	// Integer division: 45 / 10 prints 4 — intentional for this demo.
	std::cout << "Average of ints: " << sum(v_int) / v_int.size() << "\n";
	std::minstd_rand rnd;
	std::vector<float> v_flt;
	for (auto i = 0; i < 1000; ++i)
		v_flt.emplace_back(std::generate_canonical<float, 16>(rnd));
	std::cout << "Average of floats: " << sum(v_flt) / v_flt.size() << "\n";
	return 0;
}