From 32dd41a728ce10113032e20955ba08f8de449857 Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Sun, 12 Sep 2021 14:01:55 +0200 Subject: Start using C++ cuda-api-wrapper instead of raw CUDA --- tangle/LLBM/kernel/executor.h | 37 ---------------------- tangle/LLBM/lattice.h | 73 ++++++++++++++++++++----------------------- tangle/LLBM/materials.h | 2 +- tangle/LLBM/memory.h | 29 +++++++++-------- tangle/LLBM/propagate.h | 32 ++++++++++++------- tangle/benchmark-ldc.cu | 12 ++++--- tangle/channel-with-sphere.cu | 8 +++-- tangle/ldc-2d.cu | 8 +++-- tangle/ldc-3d.cu | 8 +++-- tangle/magnus.cu | 10 ++++-- tangle/nozzle.cu | 8 +++-- tangle/taylor-couette.cu | 8 +++-- 12 files changed, 115 insertions(+), 120 deletions(-) (limited to 'tangle') diff --git a/tangle/LLBM/kernel/executor.h b/tangle/LLBM/kernel/executor.h index 942918d..cce023b 100644 --- a/tangle/LLBM/kernel/executor.h +++ b/tangle/LLBM/kernel/executor.h @@ -30,30 +30,6 @@ __global__ void call_operator( } } -template -__global__ void call_operator( - LatticeView lattice - , bool* mask - , ARGS... args -) { - const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x; - if (!(gid < lattice.cuboid.volume) || !mask[gid]) { - return; - } - - S f_curr[DESCRIPTOR::q]; - S f_next[DESCRIPTOR::q]; - S* preshifted_f[DESCRIPTOR::q]; - for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { - preshifted_f[iPop] = lattice.pop(iPop, gid); - f_curr[iPop] = *preshifted_f[iPop]; - } - OPERATOR::template apply(DESCRIPTOR(), f_curr, f_next, gid, std::forward(args)...); - for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { - *preshifted_f[iPop] = f_next[iPop]; - } -} - template __global__ void call_functor( LatticeView lattice @@ -110,19 +86,6 @@ __global__ void call_operator_using_list( OPERATOR::template apply(lattice, index, count, std::forward(args)...); } -template -__global__ void call_operator_using_list( - DESCRIPTOR descriptor - , std::size_t count - , ARGS... args -) { - const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x; - if (!(index < count)) { - return; - } - OPERATOR::template apply(descriptor, index, count, std::forward(args)...); -} - template __global__ void call_spatial_functor( LatticeView lattice diff --git a/tangle/LLBM/lattice.h b/tangle/LLBM/lattice.h index 7157c78..d3a1840 100644 --- a/tangle/LLBM/lattice.h +++ b/tangle/LLBM/lattice.h @@ -31,9 +31,10 @@ template void apply(OPERATOR... ops) { const auto block_size = 32; const auto block_count = (_cuboid.volume + block_size - 1) / block_size; - kernel::call_operators<<>>( - _population.view(), ops... - ); + cuda::launch(kernel::call_operators, + cuda::launch_configuration_t(block_count, block_size), + _population.view(), + ops...); } template @@ -45,27 +46,22 @@ template void call_operator(tag::call_by_cell_id, DeviceBuffer& cells, ARGS... args) { const auto block_size = 32; const auto block_count = (cells.size() + block_size - 1) / block_size; - kernel::call_operator<<>>( - _population.view(), cells.device(), cells.size(), std::forward(args)... - ); -} - -template -void call_operator(tag::call_by_cell_id, DeviceBuffer& mask, ARGS... args) { - const auto block_size = 32; - const auto block_count = (_cuboid.volume + block_size - 1) / block_size; - kernel::call_operator<<>>( - _population.view(), mask.device(), std::forward(args)... - ); + cuda::launch(kernel::call_operator, + cuda::launch_configuration_t(block_count, block_size), + _population.view(), + cells.device(), cells.size(), + std::forward(args)...); } template void call_operator(tag::call_by_list_index, std::size_t count, ARGS... args) { const auto block_size = 32; const auto block_count = (count + block_size - 1) / block_size; - kernel::call_operator_using_list<<>>( - _population.view(), count, std::forward(args)... - ); + cuda::launch(kernel::call_operator_using_list, + cuda::launch_configuration_t(block_count, block_size), + _population.view(), + count, + std::forward(args)...); } template @@ -73,22 +69,15 @@ void inspect(ARGS&&... args) { call_functor(typename FUNCTOR::call_tag{}, std::forward(args)...); } -template -void call_functor(tag::call_by_cell_id, DeviceBuffer& cells, ARGS... args) { - const auto block_size = 32; - const auto block_count = (cells.size() + block_size - 1) / block_size; - kernel::call_functor<<>>( - _population.view(), cells.device(), cells.size(), std::forward(args)... - ); -} - template void call_functor(tag::call_by_cell_id, DeviceBuffer& mask, ARGS... args) { const auto block_size = 32; const auto block_count = (_cuboid.volume + block_size - 1) / block_size; - kernel::call_functor<<>>( - _population.view(), mask.device(), std::forward(args)... - ); + cuda::launch(kernel::call_functor, + cuda::launch_configuration_t(block_count, block_size), + _population.view(), + mask.device(), + std::forward(args)...); } template @@ -97,9 +86,11 @@ void call_functor(tag::call_by_spatial_cell_mask, DeviceBuffer& mask, ARGS const dim3 grid((_cuboid.nX + block.x - 1) / block.x, (_cuboid.nY + block.y - 1) / block.y, (_cuboid.nZ + block.z - 1) / block.z); - kernel::call_spatial_functor<<>>( - _population.view(), mask.device(), std::forward(args)... - ); + cuda::launch(kernel::call_spatial_functor, + cuda::launch_configuration_t(grid, block), + _population.view(), + mask.device(), + std::forward(args)...); } template @@ -111,9 +102,11 @@ template void tagged_helper(tag::post_process_by_list_index, std::size_t count, ARGS... args) { const auto block_size = 32; const auto block_count = (count + block_size - 1) / block_size; - kernel::call_operator_using_list<<>>( - DESCRIPTOR(), count, std::forward(args)... - ); + cuda::launch(kernel::call_operator_using_list, + cuda::launch_configuration_t(block_count, block_size), + DESCRIPTOR(), + count, + std::forward(args)...); } template @@ -122,9 +115,11 @@ void tagged_helper(tag::post_process_by_spatial_cell_mask, DeviceBuffer& m const dim3 grid((_cuboid.nX + block.x - 1) / block.x, (_cuboid.nY + block.y - 1) / block.y, (_cuboid.nZ + block.z - 1) / block.z); - kernel::call_spatial_operator<<>>( - _cuboid, mask.device(), std::forward(args)... - ); + cuda::launch(kernel::call_spatial_operator, + cuda::launch_configuration_t(grid, block), + _cuboid, + mask.device(), + std::forward(args)...); } }; diff --git a/tangle/LLBM/materials.h b/tangle/LLBM/materials.h index d782d8d..5734688 100644 --- a/tangle/LLBM/materials.h +++ b/tangle/LLBM/materials.h @@ -24,7 +24,7 @@ public: descriptor::Cuboid cuboid() const { return _cuboid; }; - + int get(std::size_t iCell) const { return _materials[iCell]; } diff --git a/tangle/LLBM/memory.h b/tangle/LLBM/memory.h index 97fec5c..149034b 100644 --- a/tangle/LLBM/memory.h +++ b/tangle/LLBM/memory.h @@ -4,31 +4,30 @@ #include #include +#include + template class DeviceBuffer { protected: const std::size_t _size; - T* _data; + cuda::device_t _device; + cuda::memory::device::unique_ptr _data; public: DeviceBuffer(std::size_t size): - _size(size) { - cudaMalloc(&_data, _size*sizeof(T)); - cudaMemset(_data, 0, _size*sizeof(T)); - } + _size(size), + _device(cuda::device::current::get()), + _data(cuda::memory::device::make_unique(_device, size)) + { } DeviceBuffer(const T* data, std::size_t size): DeviceBuffer(size) { - cudaMemcpy(_data, data, size*sizeof(T), cudaMemcpyHostToDevice); + cuda::memory::copy(_data.get(), data, size*sizeof(T)); } DeviceBuffer(const std::vector& data): DeviceBuffer(data.data(), data.size()) { } - - ~DeviceBuffer() { - cudaFree(_data); - } T* device() { - return _data; + return _data.get(); } std::size_t size() const { @@ -40,7 +39,7 @@ template class SharedVector : public DeviceBuffer { private: std::unique_ptr _host_data; - + public: SharedVector(std::size_t size): DeviceBuffer(size), @@ -57,11 +56,11 @@ public: } void syncHostFromDevice() { - cudaMemcpy(_host_data.get(), this->_data, this->_size*sizeof(T), cudaMemcpyDeviceToHost); + cuda::memory::copy(_host_data.get(), this->_data.get(), this->_size*sizeof(T)); } void syncDeviceFromHost() { - cudaMemcpy(this->_data, _host_data.get(), this->_size*sizeof(T), cudaMemcpyHostToDevice); + cuda::memory::copy(this->_data.get(), _host_data.get(), this->_size*sizeof(T)); } }; @@ -103,7 +102,7 @@ public: DeviceTexture(descriptor::CuboidD<3> c): DeviceTexture(c.nX, c.nY, c.nZ) { } - + ~DeviceTexture() { cudaFreeArray(_array); } diff --git a/tangle/LLBM/propagate.h b/tangle/LLBM/propagate.h index acb1d6c..91b1131 100644 --- a/tangle/LLBM/propagate.h +++ b/tangle/LLBM/propagate.h @@ -5,6 +5,7 @@ #include "kernel/propagate.h" #include +#include template struct LatticeView { @@ -37,6 +38,7 @@ protected: public: CyclicPopulationBuffer(descriptor::Cuboid cuboid); + ~CyclicPopulationBuffer(); LatticeView view() { return LatticeView{ _cuboid, _population.device() }; @@ -46,15 +48,13 @@ public: }; -std::size_t getDevicePageSize(int device_id=-1) { - if (device_id == -1) { - cudaGetDevice(&device_id); - } +std::size_t getDevicePageSize() { + auto device = cuda::device::current::get(); std::size_t granularity = 0; CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device_id; + prop.location.id = device.id(); cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); return granularity; } @@ -69,12 +69,11 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( _population(DESCRIPTOR::q) { - int device_id = -1; - cudaGetDevice(&device_id); + auto device = cuda::device::current::get(); _prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; _prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - _prop.location.id = device_id; + _prop.location.id = device.id(); cuMemAddressReserve(&_ptr, 2 * _volume * DESCRIPTOR::q, 0, 0, 0); for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { @@ -85,7 +84,7 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( } _access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - _access.location.id = 0; + _access.location.id = device.id(); _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1); @@ -95,7 +94,7 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( } for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { - _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S)); + _base[iPop] = this->device() + iPop * 2 * (_volume / sizeof(S)); _population[iPop] = _base[iPop] + iPop * ((_volume / sizeof(S)) / DESCRIPTOR::q); } @@ -103,6 +102,15 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( _population.syncDeviceFromHost(); } +template +CyclicPopulationBuffer::~CyclicPopulationBuffer() { + cuMemUnmap(_ptr, 2 * _volume * DESCRIPTOR::q); + for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { + cuMemRelease(_handle[iPop]); + } + cuMemAddressFree(_ptr, 2 * _volume * DESCRIPTOR::q); +} + template __device__ __forceinline__ S* LatticeView::pop(pop_index_t iPop, std::size_t gid) const { @@ -111,5 +119,7 @@ S* LatticeView::pop(pop_index_t iPop, std::size_t gid) const { template void CyclicPopulationBuffer::stream() { - propagate<<<1,1>>>(view(), _base.device(), _volume / sizeof(S)); + cuda::launch(propagate, + cuda::launch_configuration_t(1,1), + view(), _base.device(), _volume / sizeof(S)); } diff --git a/tangle/benchmark-ldc.cu b/tangle/benchmark-ldc.cu index 1400547..18b30d1 100644 --- a/tangle/benchmark-ldc.cu +++ b/tangle/benchmark-ldc.cu @@ -12,7 +12,7 @@ using T = float; using DESCRIPTOR = descriptor::D3Q19; void simulate(descriptor::Cuboid cuboid, std::size_t nStep) { - cudaSetDevice(0); + auto current = cuda::device::current::get(); Lattice lattice(cuboid); @@ -30,7 +30,7 @@ void simulate(descriptor::Cuboid cuboid, std::size_t nStep) { auto box_mask = materials.mask_of_material(2); auto lid_mask = materials.mask_of_material(3); - cudaDeviceSynchronize(); + cuda::synchronize(current); for (std::size_t iStep=0; iStep < 100; ++iStep) { lattice.apply(Operator(BgkCollideO(), bulk_mask, 0.56), @@ -39,7 +39,7 @@ void simulate(descriptor::Cuboid cuboid, std::size_t nStep) { lattice.stream(); } - cudaDeviceSynchronize(); + cuda::synchronize(current); auto start = timer::now(); @@ -50,7 +50,7 @@ void simulate(descriptor::Cuboid cuboid, std::size_t nStep) { lattice.stream(); } - cudaDeviceSynchronize(); + cuda::synchronize(current); auto mlups = timer::mlups(cuboid.volume, nStep, start); @@ -58,6 +58,10 @@ void simulate(descriptor::Cuboid cuboid, std::size_t nStep) { } int main(int argc, char* argv[]) { + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << std::endl; + return -1; + } if (argc != 3) { std::cerr << "Invalid parameter count" << std::endl; return -1; diff --git a/tangle/channel-with-sphere.cu b/tangle/channel-with-sphere.cu index 29cc7de..280a142 100644 --- a/tangle/channel-with-sphere.cu +++ b/tangle/channel-with-sphere.cu @@ -15,7 +15,11 @@ using T = float; using DESCRIPTOR = descriptor::D3Q19; int main() { -cudaSetDevice(0); +if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << std::endl; + return -1; +} +auto current = cuda::device::current::get(); const descriptor::Cuboid cuboid(448, 64, 64); Lattice lattice(cuboid); @@ -55,7 +59,7 @@ auto inflow_mask = materials.mask_of_material(4); auto outflow_mask = materials.mask_of_material(5); auto edge_mask = materials.mask_of_material(6); -cudaDeviceSynchronize(); +cuda::synchronize(current); VolumetricExample renderer(cuboid); renderer.add(lattice, bulk_mask, obstacle); diff --git a/tangle/ldc-2d.cu b/tangle/ldc-2d.cu index acba98f..5fa36f9 100644 --- a/tangle/ldc-2d.cu +++ b/tangle/ldc-2d.cu @@ -13,7 +13,11 @@ using T = float; using DESCRIPTOR = descriptor::D2Q9; int main() { -cudaSetDevice(0); +if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << std::endl; + return -1; +} +auto current = cuda::device::current::get(); const descriptor::Cuboid cuboid(500, 500); Lattice lattice(cuboid); @@ -52,7 +56,7 @@ while (window.isOpen()) { Operator(BounceBackMovingWallO(), lid_mask, std::min(iStep*1e-3, 1.0)*u_lid, 0.f)); lattice.stream(); if (iStep % 100 == 0) { - cudaDeviceSynchronize(); + cuda::synchronize(current); lattice.inspect(bulk_mask, moments_rho.device(), moments_u.device()); renderSliceViewToTexture<<< dim3(cuboid.nX / 32 + 1, cuboid.nY / 32 + 1), diff --git a/tangle/ldc-3d.cu b/tangle/ldc-3d.cu index e9b42f2..c70b12e 100644 --- a/tangle/ldc-3d.cu +++ b/tangle/ldc-3d.cu @@ -15,7 +15,11 @@ using T = float; using DESCRIPTOR = descriptor::D3Q19; int main() { -cudaSetDevice(0); +if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << std::endl; + return -1; +} +auto current = cuda::device::current::get(); const descriptor::Cuboid cuboid(100, 100, 100); Lattice lattice(cuboid); @@ -34,7 +38,7 @@ auto bulk_mask = materials.mask_of_material(1); auto wall_mask = materials.mask_of_material(2); auto lid_mask = materials.mask_of_material(3); -cudaDeviceSynchronize(); +cuda::synchronize(current); auto none = [] __device__ (float3) -> float { return 1; }; VolumetricExample renderer(cuboid); diff --git a/tangle/magnus.cu b/tangle/magnus.cu index 08a4515..a98bcf3 100644 --- a/tangle/magnus.cu +++ b/tangle/magnus.cu @@ -13,7 +13,11 @@ using T = float; using DESCRIPTOR = descriptor::D2Q9; int main() { -cudaSetDevice(0); +if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << std::endl; + return -1; +} +auto current = cuda::device::current::get(); const descriptor::Cuboid cuboid(1200, 500); Lattice lattice(cuboid); @@ -64,7 +68,7 @@ auto inflow_mask = materials.mask_of_material(3); auto outflow_mask = materials.mask_of_material(4); auto edge_mask = materials.mask_of_material(5); -cudaDeviceSynchronize(); +cuda::synchronize(current); RenderWindow window("Magnus"); cudaSurfaceObject_t colormap; @@ -83,7 +87,7 @@ while (window.isOpen()) { lattice.apply(bouzidi.getCount(), bouzidi.getConfig()); lattice.stream(); if (iStep % 200 == 0) { - cudaDeviceSynchronize(); + cuda::synchronize(current); lattice.inspect(bulk_mask, moments_rho.device(), moments_u.device()); renderSliceViewToTexture<<< dim3(cuboid.nX / 32 + 1, cuboid.nY / 32 + 1), diff --git a/tangle/nozzle.cu b/tangle/nozzle.cu index 03c18f9..5278096 100644 --- a/tangle/nozzle.cu +++ b/tangle/nozzle.cu @@ -15,7 +15,11 @@ using T = float; using DESCRIPTOR = descriptor::D3Q19; int main() { -cudaSetDevice(0); +if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << std::endl; + return -1; +} +auto current = cuda::device::current::get(); const descriptor::Cuboid cuboid(500, 80, 80); Lattice lattice(cuboid); @@ -46,7 +50,7 @@ auto boundary_mask = materials.mask_of_material(2); auto inflow_mask = materials.mask_of_material(3); auto outflow_mask = materials.mask_of_material(4); -cudaDeviceSynchronize(); +cuda::synchronize(current); VolumetricExample renderer(cuboid); renderer.add(lattice, bulk_mask, obstacle); diff --git a/tangle/taylor-couette.cu b/tangle/taylor-couette.cu index 2e69bfb..6b39afa 100644 --- a/tangle/taylor-couette.cu +++ b/tangle/taylor-couette.cu @@ -15,7 +15,11 @@ using T = float; using DESCRIPTOR = descriptor::D3Q19; int main() { -cudaSetDevice(0); +if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << std::endl; + return -1; +} +auto current = cuda::device::current::get(); const descriptor::Cuboid cuboid(500, 96, 96); Lattice lattice(cuboid); @@ -55,7 +59,7 @@ auto bulk_list = materials.list_of_material(1); auto wall_mask = materials.mask_of_material(2); auto wall_list = materials.list_of_material(2); -cudaDeviceSynchronize(); +cuda::synchronize(current); VolumetricExample renderer(cuboid); renderer.add(lattice, bulk_mask, inner_cylinder); -- cgit v1.2.3