summaryrefslogtreecommitdiff
path: root/tangle/LLBM/propagate.h
diff options
context:
space:
mode:
Diffstat (limited to 'tangle/LLBM/propagate.h')
-rw-r--r--tangle/LLBM/propagate.h32
1 files changed, 21 insertions, 11 deletions
diff --git a/tangle/LLBM/propagate.h b/tangle/LLBM/propagate.h
index acb1d6c..91b1131 100644
--- a/tangle/LLBM/propagate.h
+++ b/tangle/LLBM/propagate.h
@@ -5,6 +5,7 @@
#include "kernel/propagate.h"
#include <cuda.h>
+#include <cuda/runtime_api.hpp>
template <typename DESCRIPTOR, typename S>
struct LatticeView {
@@ -37,6 +38,7 @@ protected:
public:
CyclicPopulationBuffer(descriptor::Cuboid<DESCRIPTOR> cuboid);
+ ~CyclicPopulationBuffer();
LatticeView<DESCRIPTOR,S> view() {
return LatticeView<DESCRIPTOR,S>{ _cuboid, _population.device() };
@@ -46,15 +48,13 @@ public:
};
-std::size_t getDevicePageSize(int device_id=-1) {
- if (device_id == -1) {
- cudaGetDevice(&device_id);
- }
+std::size_t getDevicePageSize() {
+ auto device = cuda::device::current::get();
std::size_t granularity = 0;
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
- prop.location.id = device_id;
+ prop.location.id = device.id();
cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
return granularity;
}
@@ -69,12 +69,11 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
_population(DESCRIPTOR::q)
{
- int device_id = -1;
- cudaGetDevice(&device_id);
+ auto device = cuda::device::current::get();
_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
- _prop.location.id = device_id;
+ _prop.location.id = device.id();
cuMemAddressReserve(&_ptr, 2 * _volume * DESCRIPTOR::q, 0, 0, 0);
for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
@@ -85,7 +84,7 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
}
_access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
- _access.location.id = 0;
+ _access.location.id = device.id();
_access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1);
@@ -95,7 +94,7 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
}
for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
- _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S));
+ _base[iPop] = this->device() + iPop * 2 * (_volume / sizeof(S));
_population[iPop] = _base[iPop] + iPop * ((_volume / sizeof(S)) / DESCRIPTOR::q);
}
@@ -104,6 +103,15 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
}
template <typename DESCRIPTOR, typename S>
+CyclicPopulationBuffer<DESCRIPTOR,S>::~CyclicPopulationBuffer() {
+ cuMemUnmap(_ptr, 2 * _volume * DESCRIPTOR::q);
+ for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+ cuMemRelease(_handle[iPop]);
+ }
+ cuMemAddressFree(_ptr, 2 * _volume * DESCRIPTOR::q);
+}
+
+template <typename DESCRIPTOR, typename S>
__device__ __forceinline__
S* LatticeView<DESCRIPTOR,S>::pop(pop_index_t iPop, std::size_t gid) const {
return population[iPop] + gid;
@@ -111,5 +119,7 @@ S* LatticeView<DESCRIPTOR,S>::pop(pop_index_t iPop, std::size_t gid) const {
template <typename DESCRIPTOR, typename S>
void CyclicPopulationBuffer<DESCRIPTOR,S>::stream() {
- propagate<DESCRIPTOR,S><<<1,1>>>(view(), _base.device(), _volume / sizeof(S));
+ cuda::launch(propagate<DESCRIPTOR,S>,
+ cuda::launch_configuration_t(1,1),
+ view(), _base.device(), _volume / sizeof(S));
}