diff options
Diffstat (limited to 'tangle/LLBM/propagate.h')
-rw-r--r-- | tangle/LLBM/propagate.h | 32 |
1 files changed, 21 insertions, 11 deletions
diff --git a/tangle/LLBM/propagate.h b/tangle/LLBM/propagate.h index acb1d6c..91b1131 100644 --- a/tangle/LLBM/propagate.h +++ b/tangle/LLBM/propagate.h @@ -5,6 +5,7 @@ #include "kernel/propagate.h" #include <cuda.h> +#include <cuda/runtime_api.hpp> template <typename DESCRIPTOR, typename S> struct LatticeView { @@ -37,6 +38,7 @@ protected: public: CyclicPopulationBuffer(descriptor::Cuboid<DESCRIPTOR> cuboid); + ~CyclicPopulationBuffer(); LatticeView<DESCRIPTOR,S> view() { return LatticeView<DESCRIPTOR,S>{ _cuboid, _population.device() }; @@ -46,15 +48,13 @@ public: }; -std::size_t getDevicePageSize(int device_id=-1) { - if (device_id == -1) { - cudaGetDevice(&device_id); - } +std::size_t getDevicePageSize() { + auto device = cuda::device::current::get(); std::size_t granularity = 0; CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device_id; + prop.location.id = device.id(); cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); return granularity; } @@ -69,12 +69,11 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer( _population(DESCRIPTOR::q) { - int device_id = -1; - cudaGetDevice(&device_id); + auto device = cuda::device::current::get(); _prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; _prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - _prop.location.id = device_id; + _prop.location.id = device.id(); cuMemAddressReserve(&_ptr, 2 * _volume * DESCRIPTOR::q, 0, 0, 0); for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { @@ -85,7 +84,7 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer( } _access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - _access.location.id = 0; + _access.location.id = device.id(); _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1); @@ -95,7 +94,7 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer( } for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { - _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S)); + _base[iPop] = this->device() + iPop * 2 * (_volume / sizeof(S)); _population[iPop] = _base[iPop] + iPop * ((_volume / sizeof(S)) / DESCRIPTOR::q); } @@ -104,6 +103,15 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer( } template <typename DESCRIPTOR, typename S> +CyclicPopulationBuffer<DESCRIPTOR,S>::~CyclicPopulationBuffer() { + cuMemUnmap(_ptr, 2 * _volume * DESCRIPTOR::q); + for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { + cuMemRelease(_handle[iPop]); + } + cuMemAddressFree(_ptr, 2 * _volume * DESCRIPTOR::q); +} + +template <typename DESCRIPTOR, typename S> __device__ __forceinline__ S* LatticeView<DESCRIPTOR,S>::pop(pop_index_t iPop, std::size_t gid) const { return population[iPop] + gid; @@ -111,5 +119,7 @@ S* LatticeView<DESCRIPTOR,S>::pop(pop_index_t iPop, std::size_t gid) const { template <typename DESCRIPTOR, typename S> void CyclicPopulationBuffer<DESCRIPTOR,S>::stream() { - propagate<DESCRIPTOR,S><<<1,1>>>(view(), _base.device(), _volume / sizeof(S)); + cuda::launch(propagate<DESCRIPTOR,S>, + cuda::launch_configuration_t(1,1), + view(), _base.device(), _volume / sizeof(S)); } |