author     Adrian Kummerlaender   2021-09-12 14:01:55 +0200
committer  Adrian Kummerlaender   2021-09-12 14:01:55 +0200
commit     32dd41a728ce10113032e20955ba08f8de449857
tree       8a5f851a342590d05137b2c900a7d3c4e8c545b6 /tangle/LLBM
parent     8bca21a550e0ef134d51c4c4c007720885d76791
Start using the C++ cuda-api-wrappers library instead of raw CUDA
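The pattern is the same throughout: every raw triple-chevron kernel launch becomes a call to the wrapper's cuda::launch free function, and manually paired cudaMalloc/cudaFree give way to RAII owners. A minimal before/after sketch of the launch side, assuming only that cuda-api-wrappers is available as <cuda/runtime_api.hpp>; the kernel scale and the run helper are illustrative, not part of this commit:

    #include <cuda/runtime_api.hpp>
    #include <cstddef>

    // Hypothetical kernel, for illustration only.
    __global__ void scale(float* data, float factor, std::size_t n) {
      const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
      if (gid < n) {
        data[gid] *= factor;
      }
    }

    void run(float* device_data, std::size_t n) {
      const auto block_size = 32;
      const auto block_count = (n + block_size - 1) / block_size;
      // before: scale<<<block_count,block_size>>>(device_data, 2.f, n);
      cuda::launch(scale,
                   cuda::launch_configuration_t(block_count, block_size),
                   device_data, 2.f, n);
    }

Besides reading as ordinary C++, cuda::launch checks the result of the launch and reports failure by throwing, where the <<<...>>> form stays silent unless cudaGetLastError is polled by hand.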
Diffstat (limited to 'tangle/LLBM')
-rw-r--r--  tangle/LLBM/kernel/executor.h | 37
-rw-r--r--  tangle/LLBM/lattice.h         | 73
-rw-r--r--  tangle/LLBM/materials.h       |  2
-rw-r--r--  tangle/LLBM/memory.h          | 29
-rw-r--r--  tangle/LLBM/propagate.h       | 32
5 files changed, 70 insertions, 103 deletions
diff --git a/tangle/LLBM/kernel/executor.h b/tangle/LLBM/kernel/executor.h
index 942918d..cce023b 100644
--- a/tangle/LLBM/kernel/executor.h
+++ b/tangle/LLBM/kernel/executor.h
@@ -30,30 +30,6 @@ __global__ void call_operator(
   }
 }
 
-template <typename OPERATOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
-__global__ void call_operator(
-    LatticeView<DESCRIPTOR,S> lattice
-  , bool* mask
-  , ARGS... args
-) {
-  const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (!(gid < lattice.cuboid.volume) || !mask[gid]) {
-    return;
-  }
-
-  S f_curr[DESCRIPTOR::q];
-  S f_next[DESCRIPTOR::q];
-  S* preshifted_f[DESCRIPTOR::q];
-  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
-    preshifted_f[iPop] = lattice.pop(iPop, gid);
-    f_curr[iPop] = *preshifted_f[iPop];
-  }
-  OPERATOR::template apply<T,S>(DESCRIPTOR(), f_curr, f_next, gid, std::forward<ARGS>(args)...);
-  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
-    *preshifted_f[iPop] = f_next[iPop];
-  }
-}
-
 template <typename FUNCTOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
 __global__ void call_functor(
     LatticeView<DESCRIPTOR,S> lattice
@@ -110,19 +86,6 @@ __global__ void call_operator_using_list(
   OPERATOR::template apply<T,S>(lattice, index, count, std::forward<ARGS>(args)...);
 }
 
-template <typename OPERATOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
-__global__ void call_operator_using_list(
-    DESCRIPTOR descriptor
-  , std::size_t count
-  , ARGS... args
-) {
-  const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (!(index < count)) {
-    return;
-  }
-  OPERATOR::template apply<T,S>(descriptor, index, count, std::forward<ARGS>(args)...);
-}
-
 template <typename FUNCTOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
 __global__ void call_spatial_functor(
     LatticeView<DESCRIPTOR,S> lattice
diff --git a/tangle/LLBM/lattice.h b/tangle/LLBM/lattice.h
index 7157c78..d3a1840 100644
--- a/tangle/LLBM/lattice.h
+++ b/tangle/LLBM/lattice.h
@@ -31,9 +31,10 @@ template <typename... OPERATOR>
 void apply(OPERATOR... ops) {
   const auto block_size = 32;
   const auto block_count = (_cuboid.volume + block_size - 1) / block_size;
-  kernel::call_operators<DESCRIPTOR,T,S,OPERATOR...><<<block_count,block_size>>>(
-    _population.view(), ops...
-  );
+  cuda::launch(kernel::call_operators<DESCRIPTOR,T,S,OPERATOR...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               _population.view(),
+               ops...);
 }
 
 template <typename OPERATOR, typename... ARGS>
@@ -45,27 +46,22 @@ template <typename OPERATOR, typename... ARGS>
 void call_operator(tag::call_by_cell_id, DeviceBuffer<std::size_t>& cells, ARGS... args) {
   const auto block_size = 32;
   const auto block_count = (cells.size() + block_size - 1) / block_size;
-  kernel::call_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), cells.device(), cells.size(), std::forward<ARGS>(args)...
-  );
-}
-
-template <typename OPERATOR, typename... ARGS>
-void call_operator(tag::call_by_cell_id, DeviceBuffer<bool>& mask, ARGS... args) {
-  const auto block_size = 32;
-  const auto block_count = (_cuboid.volume + block_size - 1) / block_size;
-  kernel::call_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), mask.device(), std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               _population.view(),
+               cells.device(), cells.size(),
+               std::forward<ARGS>(args)...);
 }
 
 template <typename OPERATOR, typename... ARGS>
 void call_operator(tag::call_by_list_index, std::size_t count, ARGS... args) {
   const auto block_size = 32;
   const auto block_count = (count + block_size - 1) / block_size;
-  kernel::call_operator_using_list<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), count, std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_operator_using_list<OPERATOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               _population.view(),
+               count,
+               std::forward<ARGS>(args)...);
 }
 
 template <typename FUNCTOR, typename... ARGS>
@@ -74,21 +70,14 @@ void inspect(ARGS&&... args) {
 }
 
 template <typename FUNCTOR, typename... ARGS>
-void call_functor(tag::call_by_cell_id, DeviceBuffer<std::size_t>& cells, ARGS... args) {
-  const auto block_size = 32;
-  const auto block_count = (cells.size() + block_size - 1) / block_size;
-  kernel::call_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), cells.device(), cells.size(), std::forward<ARGS>(args)...
-  );
-}
-
-template <typename FUNCTOR, typename... ARGS>
 void call_functor(tag::call_by_cell_id, DeviceBuffer<bool>& mask, ARGS... args) {
   const auto block_size = 32;
   const auto block_count = (_cuboid.volume + block_size - 1) / block_size;
-  kernel::call_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), mask.device(), std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               _population.view(),
+               mask.device(),
+               std::forward<ARGS>(args)...);
 }
 
 template <typename FUNCTOR, typename... ARGS>
@@ -97,9 +86,11 @@ void call_functor(tag::call_by_spatial_cell_mask, DeviceBuffer<bool>& mask, ARGS
   const dim3 grid((_cuboid.nX + block.x - 1) / block.x,
                   (_cuboid.nY + block.y - 1) / block.y,
                   (_cuboid.nZ + block.z - 1) / block.z);
-  kernel::call_spatial_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...><<<grid,block>>>(
-    _population.view(), mask.device(), std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_spatial_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(grid, block),
+               _population.view(),
+               mask.device(),
+               std::forward<ARGS>(args)...);
 }
 
 template <typename OPERATOR, typename... ARGS>
@@ -111,9 +102,11 @@ template <typename OPERATOR, typename... ARGS>
 void tagged_helper(tag::post_process_by_list_index, std::size_t count, ARGS... args) {
   const auto block_size = 32;
   const auto block_count = (count + block_size - 1) / block_size;
-  kernel::call_operator_using_list<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    DESCRIPTOR(), count, std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_operator_using_list<OPERATOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               DESCRIPTOR(),
+               count,
+               std::forward<ARGS>(args)...);
 }
 
 template <typename OPERATOR, typename... ARGS>
@@ -122,9 +115,11 @@ void tagged_helper(tag::post_process_by_spatial_cell_mask, DeviceBuffer<bool>& m
   const dim3 grid((_cuboid.nX + block.x - 1) / block.x,
                   (_cuboid.nY + block.y - 1) / block.y,
                   (_cuboid.nZ + block.z - 1) / block.z);
-  kernel::call_spatial_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<grid,block>>>(
-    _cuboid, mask.device(), std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_spatial_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(grid, block),
+               _cuboid,
+               mask.device(),
+               std::forward<ARGS>(args)...);
 }
 
 };
diff --git a/tangle/LLBM/materials.h b/tangle/LLBM/materials.h
index d782d8d..5734688 100644
--- a/tangle/LLBM/materials.h
+++ b/tangle/LLBM/materials.h
@@ -24,7 +24,7 @@ public:
   descriptor::Cuboid<DESCRIPTOR> cuboid() const {
     return _cuboid;
   };
-  
+
   int get(std::size_t iCell) const {
     return _materials[iCell];
   }
diff --git a/tangle/LLBM/memory.h b/tangle/LLBM/memory.h
index 97fec5c..149034b 100644
--- a/tangle/LLBM/memory.h
+++ b/tangle/LLBM/memory.h
@@ -4,31 +4,30 @@
 #include <vector>
 #include <cstring>
 
+#include <cuda/runtime_api.hpp>
+
 template <typename T>
 class DeviceBuffer {
 protected:
   const std::size_t _size;
-  T* _data;
+  cuda::device_t _device;
+  cuda::memory::device::unique_ptr<T[]> _data;
 
 public:
   DeviceBuffer(std::size_t size):
-    _size(size) {
-    cudaMalloc(&_data, _size*sizeof(T));
-    cudaMemset(_data, 0, _size*sizeof(T));
-  }
+    _size(size),
+    _device(cuda::device::current::get()),
+    _data(cuda::memory::device::make_unique<T[]>(_device, size))
+  { }
 
   DeviceBuffer(const T* data, std::size_t size):
     DeviceBuffer(size) {
-    cudaMemcpy(_data, data, size*sizeof(T), cudaMemcpyHostToDevice);
+    cuda::memory::copy(_data.get(), data, size*sizeof(T));
   }
 
   DeviceBuffer(const std::vector<T>& data):
     DeviceBuffer(data.data(), data.size()) { }
-
-  ~DeviceBuffer() {
-    cudaFree(_data);
-  }
 
   T* device() {
-    return _data;
+    return _data.get();
   }
 
   std::size_t size() const {
@@ -40,7 +39,7 @@ template <typename T>
 class SharedVector : public DeviceBuffer<T> {
 private:
   std::unique_ptr<T[]> _host_data;
-  
+
 public:
   SharedVector(std::size_t size):
     DeviceBuffer<T>(size),
@@ -57,11 +56,11 @@ public:
   }
 
   void syncHostFromDevice() {
-    cudaMemcpy(_host_data.get(), this->_data, this->_size*sizeof(T), cudaMemcpyDeviceToHost);
+    cuda::memory::copy(_host_data.get(), this->_data.get(), this->_size*sizeof(T));
   }
 
   void syncDeviceFromHost() {
-    cudaMemcpy(this->_data, _host_data.get(), this->_size*sizeof(T), cudaMemcpyHostToDevice);
+    cuda::memory::copy(this->_data.get(), _host_data.get(), this->_size*sizeof(T));
   }
 };
 
@@ -103,7 +102,7 @@ public:
   DeviceTexture(descriptor::CuboidD<3> c):
     DeviceTexture(c.nX, c.nY, c.nZ) { }
-  
+
   ~DeviceTexture() {
     cudaFreeArray(_array);
   }
diff --git a/tangle/LLBM/propagate.h b/tangle/LLBM/propagate.h
index acb1d6c..91b1131 100644
--- a/tangle/LLBM/propagate.h
+++ b/tangle/LLBM/propagate.h
@@ -5,6 +5,7 @@
 #include "kernel/propagate.h"
 
 #include <cuda.h>
+#include <cuda/runtime_api.hpp>
 
 template <typename DESCRIPTOR, typename S>
 struct LatticeView {
@@ -37,6 +38,7 @@ protected:
 
 public:
   CyclicPopulationBuffer(descriptor::Cuboid<DESCRIPTOR> cuboid);
+  ~CyclicPopulationBuffer();
 
   LatticeView<DESCRIPTOR,S> view() {
     return LatticeView<DESCRIPTOR,S>{ _cuboid, _population.device() };
@@ -46,15 +48,13 @@ public:
 
 };
 
-std::size_t getDevicePageSize(int device_id=-1) {
-  if (device_id == -1) {
-    cudaGetDevice(&device_id);
-  }
+std::size_t getDevicePageSize() {
+  auto device = cuda::device::current::get();
   std::size_t granularity = 0;
   CUmemAllocationProp prop = {};
   prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
   prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  prop.location.id = device_id;
+  prop.location.id = device.id();
   cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
   return granularity;
 }
@@ -69,12 +69,11 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
   _population(DESCRIPTOR::q)
 {
-  int device_id = -1;
-  cudaGetDevice(&device_id);
+  auto device = cuda::device::current::get();
 
   _prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
   _prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  _prop.location.id = device_id;
+  _prop.location.id = device.id();
   cuMemAddressReserve(&_ptr, 2 * _volume * DESCRIPTOR::q, 0, 0, 0);
 
   for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
@@ -85,7 +84,7 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
   }
 
   _access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  _access.location.id = 0;
+  _access.location.id = device.id();
   _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1);
 
@@ -95,7 +94,7 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
   }
 
   for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
-    _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S));
+    _base[iPop] = this->device() + iPop * 2 * (_volume / sizeof(S));
     _population[iPop] = _base[iPop] + iPop * ((_volume / sizeof(S)) / DESCRIPTOR::q);
   }
 
@@ -104,6 +103,15 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
 }
 
 template <typename DESCRIPTOR, typename S>
+CyclicPopulationBuffer<DESCRIPTOR,S>::~CyclicPopulationBuffer() {
+  cuMemUnmap(_ptr, 2 * _volume * DESCRIPTOR::q);
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    cuMemRelease(_handle[iPop]);
+  }
+  cuMemAddressFree(_ptr, 2 * _volume * DESCRIPTOR::q);
+}
+
+template <typename DESCRIPTOR, typename S>
 __device__ __forceinline__
 S* LatticeView<DESCRIPTOR,S>::pop(pop_index_t iPop, std::size_t gid) const {
   return population[iPop] + gid;
@@ -111,5 +119,7 @@ S* LatticeView<DESCRIPTOR,S>::pop(pop_index_t iPop, std::size_t gid) const {
 
 template <typename DESCRIPTOR, typename S>
 void CyclicPopulationBuffer<DESCRIPTOR,S>::stream() {
-  propagate<DESCRIPTOR,S><<<1,1>>>(view(), _base.device(), _volume / sizeof(S));
+  cuda::launch(propagate<DESCRIPTOR,S>,
+               cuda::launch_configuration_t(1,1),
+               view(), _base.device(), _volume / sizeof(S));
 }
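On the memory side, DeviceBuffer now owns its allocation through cuda::memory::device::unique_ptr, which is why the explicit ~DeviceBuffer above could be deleted. A self-contained sketch of the resulting usage, again assuming only cuda-api-wrappers; the roundtrip function is illustrative:

    #include <cuda/runtime_api.hpp>
    #include <cstddef>
    #include <vector>

    // Device allocation that frees itself, mirroring the new DeviceBuffer.
    void roundtrip(const std::vector<float>& host) {
      auto device = cuda::device::current::get();
      auto data = cuda::memory::device::make_unique<float[]>(device, host.size());

      // host -> device; cuda::memory::copy deduces the direction from its pointers
      cuda::memory::copy(data.get(), host.data(), host.size()*sizeof(float));

      std::vector<float> back(host.size());
      // device -> host
      cuda::memory::copy(back.data(), data.get(), back.size()*sizeof(float));
    }  // no cudaFree: the unique_ptr releases the allocation here

One behavioral difference to keep in mind: the old constructor zeroed the buffer with cudaMemset, while make_unique leaves the allocation uninitialized, so any caller that relied on zero-initialized device memory now has to clear it explicitly.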