#pragma once

#include "memory.h"
#include "descriptor.h"
#include "kernel/propagate.h"

#include <cstddef>
#include <cuda.h>

template <typename DESCRIPTOR, typename S>
struct LatticeView {
  const descriptor::Cuboid<DESCRIPTOR> cuboid;
  S** population;

  __device__ __forceinline__
  S* pop(pop_index_t iPop, std::size_t gid) const;
};
template <typename DESCRIPTOR, typename S>
class CyclicPopulationBuffer {
protected:
  const descriptor::Cuboid<DESCRIPTOR> _cuboid;

  const std::size_t _page_size;
  const std::size_t _volume;

  CUmemGenericAllocationHandle _handle[DESCRIPTOR::q];
  CUmemAllocationProp _prop{};
  CUmemAccessDesc _access{};
  CUdeviceptr _ptr;

  SharedVector<S*> _base;
  SharedVector<S*> _population;

  S* device() {
    return reinterpret_cast<S*>(_ptr);
  }

public:
  CyclicPopulationBuffer(descriptor::Cuboid<DESCRIPTOR> cuboid);

  LatticeView<DESCRIPTOR,S> view() {
    return LatticeView<DESCRIPTOR,S>{ _cuboid, _population.device() };
  }

  void stream();
};
// query the minimum allocation granularity ("page size") required by the
// CUDA virtual memory management API for the given device
std::size_t getDevicePageSize(int device_id=-1) {
  if (device_id == -1) {
    cudaGetDevice(&device_id);
  }
  std::size_t granularity = 0;
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device_id;
  cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  return granularity;
}
template <typename DESCRIPTOR, typename S>
CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
    descriptor::Cuboid<DESCRIPTOR> cuboid):
  _cuboid(cuboid),
  _page_size{getDevicePageSize()},
  // per-population buffer size in bytes, rounded up to the allocation granularity
  _volume{((cuboid.volume * sizeof(S) - 1) / _page_size + 1) * _page_size},
  _base(DESCRIPTOR::q),
  _population(DESCRIPTOR::q)
{
  int device_id = -1;
  cudaGetDevice(&device_id);

  _prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  _prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  _prop.location.id = device_id;

  // reserve one contiguous virtual address range covering two consecutive
  // mappings for each of the q populations
  cuMemAddressReserve(&_ptr, 2 * _volume * DESCRIPTOR::q, 0, 0, 0);

  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
    // per-population handle until cuMemMap accepts non-zero offset
    cuMemCreate(&_handle[iPop], _volume, &_prop, 0);
    // map the same physical allocation twice, back to back, so a shifted
    // population pointer may run past the first mapping and still reach
    // the correct (aliased) cells
    cuMemMap(_ptr + iPop * 2 * _volume,           _volume, 0, _handle[iPop], 0);
    cuMemMap(_ptr + iPop * 2 * _volume + _volume, _volume, 0, _handle[iPop], 0);
  }

  _access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  _access.location.id = device_id;
  _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1);

  cuMemsetD8(_ptr, 0, 2 * _volume * DESCRIPTOR::q);

  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
    _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S));
    // stagger the initial offsets so that per-population wrap-arounds do not
    // all fall into the same time step
    _population[iPop] = _base[iPop] + iPop * ((_volume / sizeof(S)) / DESCRIPTOR::q);
  }

  _base.syncDeviceFromHost();
  _population.syncDeviceFromHost();
}
template <typename DESCRIPTOR, typename S>
__device__ __forceinline__
S* LatticeView<DESCRIPTOR,S>::pop(pop_index_t iPop, std::size_t gid) const {
  return population[iPop] + gid;
}
template <typename DESCRIPTOR, typename S>
void CyclicPopulationBuffer<DESCRIPTOR,S>::stream() {
  // a single thread suffices: only the q population pointers are rotated
  propagate<DESCRIPTOR,S><<<1,1>>>(view(), _base.device(), _volume / sizeof(S));
}
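
// Illustrative sketch only (not the contents of kernel/propagate.h): one way a
// pointer-rotation kernel matching the <<<1,1>>> launch above could look. It
// assumes a hypothetical helper descriptor::neighbor_offset(cuboid, iPop) that
// returns the linearized cell offset of discrete velocity iPop; the actual
// kernel shipped with this code may differ.
//
//   template <typename DESCRIPTOR, typename S>
//   __global__ void propagate(LatticeView<DESCRIPTOR,S> lattice, S** base, std::size_t size) {
//     for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
//       // shift the population pointer by one streaming step
//       S* p = lattice.population[iPop] - descriptor::neighbor_offset(lattice.cuboid, iPop);
//       // wrap back into the first of the two mappings once the pointer leaves it;
//       // in between wraps, the second mapping aliases the first, so no per-access
//       // bounds check is needed
//       if (p <  base[iPop])        { p += size; }
//       if (p >= base[iPop] + size) { p -= size; }
//       lattice.population[iPop] = p;
//     }
//   }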