From 32dd41a728ce10113032e20955ba08f8de449857 Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Sun, 12 Sep 2021 14:01:55 +0200 Subject: Start using C++ cuda-api-wrapper instead of raw CUDA --- tangle/LLBM/kernel/executor.h | 37 ---------------------- tangle/LLBM/lattice.h | 73 ++++++++++++++++++++----------------------- tangle/LLBM/materials.h | 2 +- tangle/LLBM/memory.h | 29 +++++++++-------- tangle/LLBM/propagate.h | 32 ++++++++++++------- 5 files changed, 70 insertions(+), 103 deletions(-) (limited to 'tangle/LLBM') diff --git a/tangle/LLBM/kernel/executor.h b/tangle/LLBM/kernel/executor.h index 942918d..cce023b 100644 --- a/tangle/LLBM/kernel/executor.h +++ b/tangle/LLBM/kernel/executor.h @@ -30,30 +30,6 @@ __global__ void call_operator( } } -template -__global__ void call_operator( - LatticeView lattice - , bool* mask - , ARGS... args -) { - const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x; - if (!(gid < lattice.cuboid.volume) || !mask[gid]) { - return; - } - - S f_curr[DESCRIPTOR::q]; - S f_next[DESCRIPTOR::q]; - S* preshifted_f[DESCRIPTOR::q]; - for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { - preshifted_f[iPop] = lattice.pop(iPop, gid); - f_curr[iPop] = *preshifted_f[iPop]; - } - OPERATOR::template apply(DESCRIPTOR(), f_curr, f_next, gid, std::forward(args)...); - for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { - *preshifted_f[iPop] = f_next[iPop]; - } -} - template __global__ void call_functor( LatticeView lattice @@ -110,19 +86,6 @@ __global__ void call_operator_using_list( OPERATOR::template apply(lattice, index, count, std::forward(args)...); } -template -__global__ void call_operator_using_list( - DESCRIPTOR descriptor - , std::size_t count - , ARGS... args -) { - const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x; - if (!(index < count)) { - return; - } - OPERATOR::template apply(descriptor, index, count, std::forward(args)...); -} - template __global__ void call_spatial_functor( LatticeView lattice diff --git a/tangle/LLBM/lattice.h b/tangle/LLBM/lattice.h index 7157c78..d3a1840 100644 --- a/tangle/LLBM/lattice.h +++ b/tangle/LLBM/lattice.h @@ -31,9 +31,10 @@ template void apply(OPERATOR... ops) { const auto block_size = 32; const auto block_count = (_cuboid.volume + block_size - 1) / block_size; - kernel::call_operators<<>>( - _population.view(), ops... - ); + cuda::launch(kernel::call_operators, + cuda::launch_configuration_t(block_count, block_size), + _population.view(), + ops...); } template @@ -45,27 +46,22 @@ template void call_operator(tag::call_by_cell_id, DeviceBuffer& cells, ARGS... args) { const auto block_size = 32; const auto block_count = (cells.size() + block_size - 1) / block_size; - kernel::call_operator<<>>( - _population.view(), cells.device(), cells.size(), std::forward(args)... - ); -} - -template -void call_operator(tag::call_by_cell_id, DeviceBuffer& mask, ARGS... args) { - const auto block_size = 32; - const auto block_count = (_cuboid.volume + block_size - 1) / block_size; - kernel::call_operator<<>>( - _population.view(), mask.device(), std::forward(args)... - ); + cuda::launch(kernel::call_operator, + cuda::launch_configuration_t(block_count, block_size), + _population.view(), + cells.device(), cells.size(), + std::forward(args)...); } template void call_operator(tag::call_by_list_index, std::size_t count, ARGS... args) { const auto block_size = 32; const auto block_count = (count + block_size - 1) / block_size; - kernel::call_operator_using_list<<>>( - _population.view(), count, std::forward(args)... - ); + cuda::launch(kernel::call_operator_using_list, + cuda::launch_configuration_t(block_count, block_size), + _population.view(), + count, + std::forward(args)...); } template @@ -73,22 +69,15 @@ void inspect(ARGS&&... args) { call_functor(typename FUNCTOR::call_tag{}, std::forward(args)...); } -template -void call_functor(tag::call_by_cell_id, DeviceBuffer& cells, ARGS... args) { - const auto block_size = 32; - const auto block_count = (cells.size() + block_size - 1) / block_size; - kernel::call_functor<<>>( - _population.view(), cells.device(), cells.size(), std::forward(args)... - ); -} - template void call_functor(tag::call_by_cell_id, DeviceBuffer& mask, ARGS... args) { const auto block_size = 32; const auto block_count = (_cuboid.volume + block_size - 1) / block_size; - kernel::call_functor<<>>( - _population.view(), mask.device(), std::forward(args)... - ); + cuda::launch(kernel::call_functor, + cuda::launch_configuration_t(block_count, block_size), + _population.view(), + mask.device(), + std::forward(args)...); } template @@ -97,9 +86,11 @@ void call_functor(tag::call_by_spatial_cell_mask, DeviceBuffer& mask, ARGS const dim3 grid((_cuboid.nX + block.x - 1) / block.x, (_cuboid.nY + block.y - 1) / block.y, (_cuboid.nZ + block.z - 1) / block.z); - kernel::call_spatial_functor<<>>( - _population.view(), mask.device(), std::forward(args)... - ); + cuda::launch(kernel::call_spatial_functor, + cuda::launch_configuration_t(grid, block), + _population.view(), + mask.device(), + std::forward(args)...); } template @@ -111,9 +102,11 @@ template void tagged_helper(tag::post_process_by_list_index, std::size_t count, ARGS... args) { const auto block_size = 32; const auto block_count = (count + block_size - 1) / block_size; - kernel::call_operator_using_list<<>>( - DESCRIPTOR(), count, std::forward(args)... - ); + cuda::launch(kernel::call_operator_using_list, + cuda::launch_configuration_t(block_count, block_size), + DESCRIPTOR(), + count, + std::forward(args)...); } template @@ -122,9 +115,11 @@ void tagged_helper(tag::post_process_by_spatial_cell_mask, DeviceBuffer& m const dim3 grid((_cuboid.nX + block.x - 1) / block.x, (_cuboid.nY + block.y - 1) / block.y, (_cuboid.nZ + block.z - 1) / block.z); - kernel::call_spatial_operator<<>>( - _cuboid, mask.device(), std::forward(args)... - ); + cuda::launch(kernel::call_spatial_operator, + cuda::launch_configuration_t(grid, block), + _cuboid, + mask.device(), + std::forward(args)...); } }; diff --git a/tangle/LLBM/materials.h b/tangle/LLBM/materials.h index d782d8d..5734688 100644 --- a/tangle/LLBM/materials.h +++ b/tangle/LLBM/materials.h @@ -24,7 +24,7 @@ public: descriptor::Cuboid cuboid() const { return _cuboid; }; - + int get(std::size_t iCell) const { return _materials[iCell]; } diff --git a/tangle/LLBM/memory.h b/tangle/LLBM/memory.h index 97fec5c..149034b 100644 --- a/tangle/LLBM/memory.h +++ b/tangle/LLBM/memory.h @@ -4,31 +4,30 @@ #include #include +#include + template class DeviceBuffer { protected: const std::size_t _size; - T* _data; + cuda::device_t _device; + cuda::memory::device::unique_ptr _data; public: DeviceBuffer(std::size_t size): - _size(size) { - cudaMalloc(&_data, _size*sizeof(T)); - cudaMemset(_data, 0, _size*sizeof(T)); - } + _size(size), + _device(cuda::device::current::get()), + _data(cuda::memory::device::make_unique(_device, size)) + { } DeviceBuffer(const T* data, std::size_t size): DeviceBuffer(size) { - cudaMemcpy(_data, data, size*sizeof(T), cudaMemcpyHostToDevice); + cuda::memory::copy(_data.get(), data, size*sizeof(T)); } DeviceBuffer(const std::vector& data): DeviceBuffer(data.data(), data.size()) { } - - ~DeviceBuffer() { - cudaFree(_data); - } T* device() { - return _data; + return _data.get(); } std::size_t size() const { @@ -40,7 +39,7 @@ template class SharedVector : public DeviceBuffer { private: std::unique_ptr _host_data; - + public: SharedVector(std::size_t size): DeviceBuffer(size), @@ -57,11 +56,11 @@ public: } void syncHostFromDevice() { - cudaMemcpy(_host_data.get(), this->_data, this->_size*sizeof(T), cudaMemcpyDeviceToHost); + cuda::memory::copy(_host_data.get(), this->_data.get(), this->_size*sizeof(T)); } void syncDeviceFromHost() { - cudaMemcpy(this->_data, _host_data.get(), this->_size*sizeof(T), cudaMemcpyHostToDevice); + cuda::memory::copy(this->_data.get(), _host_data.get(), this->_size*sizeof(T)); } }; @@ -103,7 +102,7 @@ public: DeviceTexture(descriptor::CuboidD<3> c): DeviceTexture(c.nX, c.nY, c.nZ) { } - + ~DeviceTexture() { cudaFreeArray(_array); } diff --git a/tangle/LLBM/propagate.h b/tangle/LLBM/propagate.h index acb1d6c..91b1131 100644 --- a/tangle/LLBM/propagate.h +++ b/tangle/LLBM/propagate.h @@ -5,6 +5,7 @@ #include "kernel/propagate.h" #include +#include template struct LatticeView { @@ -37,6 +38,7 @@ protected: public: CyclicPopulationBuffer(descriptor::Cuboid cuboid); + ~CyclicPopulationBuffer(); LatticeView view() { return LatticeView{ _cuboid, _population.device() }; @@ -46,15 +48,13 @@ public: }; -std::size_t getDevicePageSize(int device_id=-1) { - if (device_id == -1) { - cudaGetDevice(&device_id); - } +std::size_t getDevicePageSize() { + auto device = cuda::device::current::get(); std::size_t granularity = 0; CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device_id; + prop.location.id = device.id(); cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); return granularity; } @@ -69,12 +69,11 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( _population(DESCRIPTOR::q) { - int device_id = -1; - cudaGetDevice(&device_id); + auto device = cuda::device::current::get(); _prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; _prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - _prop.location.id = device_id; + _prop.location.id = device.id(); cuMemAddressReserve(&_ptr, 2 * _volume * DESCRIPTOR::q, 0, 0, 0); for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { @@ -85,7 +84,7 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( } _access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - _access.location.id = 0; + _access.location.id = device.id(); _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1); @@ -95,7 +94,7 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( } for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { - _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S)); + _base[iPop] = this->device() + iPop * 2 * (_volume / sizeof(S)); _population[iPop] = _base[iPop] + iPop * ((_volume / sizeof(S)) / DESCRIPTOR::q); } @@ -103,6 +102,15 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( _population.syncDeviceFromHost(); } +template +CyclicPopulationBuffer::~CyclicPopulationBuffer() { + cuMemUnmap(_ptr, 2 * _volume * DESCRIPTOR::q); + for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { + cuMemRelease(_handle[iPop]); + } + cuMemAddressFree(_ptr, 2 * _volume * DESCRIPTOR::q); +} + template __device__ __forceinline__ S* LatticeView::pop(pop_index_t iPop, std::size_t gid) const { @@ -111,5 +119,7 @@ S* LatticeView::pop(pop_index_t iPop, std::size_t gid) const { template void CyclicPopulationBuffer::stream() { - propagate<<<1,1>>>(view(), _base.device(), _volume / sizeof(S)); + cuda::launch(propagate, + cuda::launch_configuration_t(1,1), + view(), _base.device(), _volume / sizeof(S)); } -- cgit v1.2.3