From 32dd41a728ce10113032e20955ba08f8de449857 Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Sun, 12 Sep 2021 14:01:55 +0200
Subject: Start using C++ cuda-api-wrapper instead of raw CUDA

---
 tangle/LLBM/kernel/executor.h | 37 ----------------------
 tangle/LLBM/lattice.h         | 73 ++++++++++++++++++++-----------------------
 tangle/LLBM/materials.h       |  2 +-
 tangle/LLBM/memory.h          | 29 +++++++++--------
 tangle/LLBM/propagate.h       | 32 ++++++++++++-------
 5 files changed, 70 insertions(+), 103 deletions(-)

(limited to 'tangle/LLBM')
diff --git a/tangle/LLBM/kernel/executor.h b/tangle/LLBM/kernel/executor.h
index 942918d..cce023b 100644
--- a/tangle/LLBM/kernel/executor.h
+++ b/tangle/LLBM/kernel/executor.h
@@ -30,30 +30,6 @@ __global__ void call_operator(
   }
 }
 
-template <typename OPERATOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
-__global__ void call_operator(
-    LatticeView<DESCRIPTOR,S> lattice
-  , bool* mask
-  , ARGS... args
-) {
-  const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (!(gid < lattice.cuboid.volume) || !mask[gid]) {
-      return;
-  }
-
-  S f_curr[DESCRIPTOR::q];
-  S f_next[DESCRIPTOR::q];
-  S* preshifted_f[DESCRIPTOR::q];
-  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
-    preshifted_f[iPop] = lattice.pop(iPop, gid);
-    f_curr[iPop] = *preshifted_f[iPop];
-  }
-  OPERATOR::template apply<T,S>(DESCRIPTOR(), f_curr, f_next, gid, std::forward<ARGS>(args)...);
-  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
-    *preshifted_f[iPop] = f_next[iPop];
-  }
-}
-
 template <typename FUNCTOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
 __global__ void call_functor(
     LatticeView<DESCRIPTOR,S> lattice
@@ -110,19 +86,6 @@ __global__ void call_operator_using_list(
   OPERATOR::template apply<T,S>(lattice, index, count, std::forward<ARGS>(args)...);
 }
 
-template <typename OPERATOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
-__global__ void call_operator_using_list(
-    DESCRIPTOR descriptor
-  , std::size_t count
-  , ARGS... args
-) {
-  const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (!(index < count)) {
-      return;
-  }
-  OPERATOR::template apply<T,S>(descriptor, index, count, std::forward<ARGS>(args)...);
-}
-
 template <typename FUNCTOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
 __global__ void call_spatial_functor(
     LatticeView<DESCRIPTOR,S> lattice
diff --git a/tangle/LLBM/lattice.h b/tangle/LLBM/lattice.h
index 7157c78..d3a1840 100644
--- a/tangle/LLBM/lattice.h
+++ b/tangle/LLBM/lattice.h
@@ -31,9 +31,10 @@ template <typename... OPERATOR>
 void apply(OPERATOR... ops) {
   const auto block_size = 32;
   const auto block_count = (_cuboid.volume + block_size - 1) / block_size;
-  kernel::call_operators<DESCRIPTOR,T,S,OPERATOR...><<<block_count,block_size>>>(
-    _population.view(), ops...
-  );
+  cuda::launch(kernel::call_operators<DESCRIPTOR,T,S,OPERATOR...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               _population.view(),
+               ops...);
 }
 
 template <typename OPERATOR, typename... ARGS>
@@ -45,27 +46,22 @@ template <typename OPERATOR, typename... ARGS>
 void call_operator(tag::call_by_cell_id, DeviceBuffer<std::size_t>& cells, ARGS... args) {
   const auto block_size = 32;
   const auto block_count = (cells.size() + block_size - 1) / block_size;
-  kernel::call_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), cells.device(), cells.size(), std::forward<ARGS>(args)...
-  );
-}
-
-template <typename OPERATOR, typename... ARGS>
-void call_operator(tag::call_by_cell_id, DeviceBuffer<bool>& mask, ARGS... args) {
-  const auto block_size = 32;
-  const auto block_count = (_cuboid.volume + block_size - 1) / block_size;
-  kernel::call_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), mask.device(), std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               _population.view(),
+               cells.device(), cells.size(),
+               std::forward<ARGS>(args)...);
 }
 
 template <typename OPERATOR, typename... ARGS>
 void call_operator(tag::call_by_list_index, std::size_t count, ARGS... args) {
   const auto block_size = 32;
   const auto block_count = (count + block_size - 1) / block_size;
-  kernel::call_operator_using_list<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), count, std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_operator_using_list<OPERATOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               _population.view(),
+               count,
+               std::forward<ARGS>(args)...);
 }
 
 template <typename FUNCTOR, typename... ARGS>
@@ -73,22 +69,15 @@ void inspect(ARGS&&... args) {
   call_functor<FUNCTOR>(typename FUNCTOR::call_tag{}, std::forward<ARGS&&>(args)...);
 }
 
-template <typename FUNCTOR, typename... ARGS>
-void call_functor(tag::call_by_cell_id, DeviceBuffer<std::size_t>& cells, ARGS... args) {
-  const auto block_size = 32;
-  const auto block_count = (cells.size() + block_size - 1) / block_size;
-  kernel::call_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), cells.device(), cells.size(), std::forward<ARGS>(args)...
-  );
-}
-
 template <typename FUNCTOR, typename... ARGS>
 void call_functor(tag::call_by_cell_id, DeviceBuffer<bool>& mask, ARGS... args) {
   const auto block_size = 32;
   const auto block_count = (_cuboid.volume + block_size - 1) / block_size;
-  kernel::call_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    _population.view(), mask.device(), std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               _population.view(),
+               mask.device(),
+               std::forward<ARGS>(args)...);
 }
 
 template <typename FUNCTOR, typename... ARGS>
@@ -97,9 +86,11 @@ void call_functor(tag::call_by_spatial_cell_mask, DeviceBuffer<bool>& mask, ARGS
   const dim3 grid((_cuboid.nX + block.x - 1) / block.x,
                   (_cuboid.nY + block.y - 1) / block.y,
                   (_cuboid.nZ + block.z - 1) / block.z);
-  kernel::call_spatial_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...><<<grid,block>>>(
-    _population.view(), mask.device(), std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_spatial_functor<FUNCTOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(grid, block),
+               _population.view(),
+               mask.device(),
+               std::forward<ARGS>(args)...);
 }
 
 template <typename OPERATOR, typename... ARGS>
@@ -111,9 +102,11 @@ template <typename OPERATOR, typename... ARGS>
 void tagged_helper(tag::post_process_by_list_index, std::size_t count, ARGS... args) {
   const auto block_size = 32;
   const auto block_count = (count + block_size - 1) / block_size;
-  kernel::call_operator_using_list<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<block_count,block_size>>>(
-    DESCRIPTOR(), count, std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_operator_using_list<OPERATOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(block_count, block_size),
+               DESCRIPTOR(),
+               count,
+               std::forward<ARGS>(args)...);
 }
 
 template <typename OPERATOR, typename... ARGS>
@@ -122,9 +115,11 @@ void tagged_helper(tag::post_process_by_spatial_cell_mask, DeviceBuffer<bool>& m
   const dim3 grid((_cuboid.nX + block.x - 1) / block.x,
                   (_cuboid.nY + block.y - 1) / block.y,
                   (_cuboid.nZ + block.z - 1) / block.z);
-  kernel::call_spatial_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...><<<grid,block>>>(
-    _cuboid, mask.device(), std::forward<ARGS>(args)...
-  );
+  cuda::launch(kernel::call_spatial_operator<OPERATOR,DESCRIPTOR,T,S,ARGS...>,
+               cuda::launch_configuration_t(grid, block),
+               _cuboid,
+               mask.device(),
+               std::forward<ARGS>(args)...);
 }
 
 };
diff --git a/tangle/LLBM/materials.h b/tangle/LLBM/materials.h
index d782d8d..5734688 100644
--- a/tangle/LLBM/materials.h
+++ b/tangle/LLBM/materials.h
@@ -24,7 +24,7 @@ public:
   descriptor::Cuboid<DESCRIPTOR> cuboid() const {
     return _cuboid;
   };
- 
+
   int get(std::size_t iCell) const {
     return _materials[iCell];
   }
diff --git a/tangle/LLBM/memory.h b/tangle/LLBM/memory.h
index 97fec5c..149034b 100644
--- a/tangle/LLBM/memory.h
+++ b/tangle/LLBM/memory.h
@@ -4,31 +4,30 @@
 #include <vector>
 #include <cstring>
 
+#include <cuda/runtime_api.hpp>
+
 template <typename T>
 class DeviceBuffer {
 protected:
   const std::size_t _size;
-  T* _data;
+  cuda::device_t _device;
+  cuda::memory::device::unique_ptr<T[]> _data;
 
 public:
   DeviceBuffer(std::size_t size):
-    _size(size) {
-    cudaMalloc(&_data, _size*sizeof(T));
-    cudaMemset(_data, 0, _size*sizeof(T));
-  }
+    _size(size),
+    _device(cuda::device::current::get()),
+    _data(cuda::memory::device::make_unique<T[]>(_device, size))
+  { }
   DeviceBuffer(const T* data, std::size_t size):
     DeviceBuffer(size) {
-    cudaMemcpy(_data, data, size*sizeof(T), cudaMemcpyHostToDevice);
+    cuda::memory::copy(_data.get(), data, size*sizeof(T));
   }
   DeviceBuffer(const std::vector<T>& data):
     DeviceBuffer(data.data(), data.size()) { }
- 
-  ~DeviceBuffer() {
-    cudaFree(_data);
-  }
 
   T* device() {
-    return _data;
+    return _data.get();
   }
 
   std::size_t size() const {
@@ -40,7 +39,7 @@ template <typename T>
 class SharedVector : public DeviceBuffer<T> {
 private:
   std::unique_ptr<T[]> _host_data;
- 
+
 public:
   SharedVector(std::size_t size):
     DeviceBuffer<T>(size),
@@ -57,11 +56,11 @@ public:
   }
 
   void syncHostFromDevice() {
-    cudaMemcpy(_host_data.get(), this->_data, this->_size*sizeof(T), cudaMemcpyDeviceToHost);
+    cuda::memory::copy(_host_data.get(), this->_data.get(), this->_size*sizeof(T));
   }
 
   void syncDeviceFromHost() {
-    cudaMemcpy(this->_data, _host_data.get(), this->_size*sizeof(T), cudaMemcpyHostToDevice);
+    cuda::memory::copy(this->_data.get(), _host_data.get(), this->_size*sizeof(T));
   }
 
 };
@@ -103,7 +102,7 @@ public:
 
   DeviceTexture(descriptor::CuboidD<3> c):
     DeviceTexture(c.nX, c.nY, c.nZ) { }
-  
+
   ~DeviceTexture() {
     cudaFreeArray(_array);
   }
diff --git a/tangle/LLBM/propagate.h b/tangle/LLBM/propagate.h
index acb1d6c..91b1131 100644
--- a/tangle/LLBM/propagate.h
+++ b/tangle/LLBM/propagate.h
@@ -5,6 +5,7 @@
 #include "kernel/propagate.h"
 
 #include <cuda.h>
+#include <cuda/runtime_api.hpp>
 
 template <typename DESCRIPTOR, typename S>
 struct LatticeView {
@@ -37,6 +38,7 @@ protected:
 
 public:
   CyclicPopulationBuffer(descriptor::Cuboid<DESCRIPTOR> cuboid);
+  ~CyclicPopulationBuffer();
 
   LatticeView<DESCRIPTOR,S> view() {
     return LatticeView<DESCRIPTOR,S>{ _cuboid, _population.device() };
@@ -46,15 +48,13 @@ public:
 
 };
 
-std::size_t getDevicePageSize(int device_id=-1) {
-  if (device_id == -1) {
-    cudaGetDevice(&device_id);
-  }
+std::size_t getDevicePageSize() {
+  auto device = cuda::device::current::get();
   std::size_t granularity = 0;
   CUmemAllocationProp prop = {};
   prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
   prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  prop.location.id = device_id;
+  prop.location.id = device.id();
   cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
   return granularity;
 }
@@ -69,12 +69,11 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
   _population(DESCRIPTOR::q)
 {
 
-  int device_id = -1;
-  cudaGetDevice(&device_id);
+  auto device = cuda::device::current::get();
 
   _prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
   _prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  _prop.location.id = device_id;
+  _prop.location.id = device.id();
   cuMemAddressReserve(&_ptr, 2 * _volume * DESCRIPTOR::q, 0, 0, 0);
 
   for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
@@ -85,7 +84,7 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
   }
 
   _access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  _access.location.id = 0;
+  _access.location.id = device.id();
   _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1);
 
@@ -95,7 +94,7 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
   }
 
   for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
-    _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S));
+    _base[iPop] = this->device() + iPop * 2 * (_volume / sizeof(S));
     _population[iPop] = _base[iPop] + iPop * ((_volume / sizeof(S)) / DESCRIPTOR::q);
   }
 
@@ -103,6 +102,15 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
   _population.syncDeviceFromHost();
 }
 
+template <typename DESCRIPTOR, typename S>
+CyclicPopulationBuffer<DESCRIPTOR,S>::~CyclicPopulationBuffer() {
+  cuMemUnmap(_ptr, 2 * _volume * DESCRIPTOR::q);
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    cuMemRelease(_handle[iPop]);
+  }
+  cuMemAddressFree(_ptr, 2 * _volume * DESCRIPTOR::q);
+}
+
 template <typename DESCRIPTOR, typename S>
 __device__ __forceinline__
 S* LatticeView<DESCRIPTOR,S>::pop(pop_index_t iPop, std::size_t gid) const {
@@ -111,5 +119,7 @@ S* LatticeView<DESCRIPTOR,S>::pop(pop_index_t iPop, std::size_t gid) const {
 
 template <typename DESCRIPTOR, typename S>
 void CyclicPopulationBuffer<DESCRIPTOR,S>::stream() {
-  propagate<DESCRIPTOR,S><<<1,1>>>(view(), _base.device(), _volume / sizeof(S));
+  cuda::launch(propagate<DESCRIPTOR,S>,
+               cuda::launch_configuration_t(1,1),
+               view(), _base.device(), _volume / sizeof(S));
 }
-- 
cgit v1.2.3