From 8691f6f7306914d8fc9d5afc8a347ebf5ce0a7d7 Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Mon, 23 Aug 2021 22:04:50 +0200 Subject: Replace InitializeO by memset --- lbm.org | 113 ++++--------------------------- tangle/LLBM/kernel/collect_q_criterion.h | 2 +- tangle/LLBM/kernel/initialize.h | 44 ------------ tangle/LLBM/lattice.h | 1 - tangle/LLBM/propagate.h | 8 ++- tangle/benchmark-ldc.cu | 8 --- tangle/channel-with-sphere.cu | 7 -- tangle/ldc-2d.cu | 5 -- tangle/ldc-3d.cu | 4 -- tangle/magnus.cu | 5 -- tangle/nozzle.cu | 5 -- tangle/taylor-couette.cu | 3 - 12 files changed, 21 insertions(+), 184 deletions(-) delete mode 100644 tangle/LLBM/kernel/initialize.h diff --git a/lbm.org b/lbm.org index 07c2963..afe1d23 100644 --- a/lbm.org +++ b/lbm.org @@ -261,7 +261,7 @@ class CodeBlockPrinter(C11CodePrinter): self.custom_assignment = custom_assignment for f in custom_functions: self._kf[f] = f - + def _print_Indexed(self, expr): assert len(expr.indices) == 1 if expr.base.name[0] == 'f': @@ -552,53 +552,6 @@ f_next[7] = e11*(e14 + e2); f_next[8] = e0*(e13 + e16 + e2 + e4); #+end_example -For initialization of lattice data it often makes sense to choose values that are invariant under this -equilibrium computation. Cells initialized in this way will not change during the collision step -and as such can be considered to model a fluid /at rest/. - -#+BEGIN_SRC python :session :results none -def initialize_equilibrium(D): - return [ Assignment(IndexedBase("f_next", D.q)[i], w_i) for i, w_i in enumerate(D.w) ] -#+END_SRC - -#+NAME: initialize-populations -#+BEGIN_SRC python :session :results output -D = descriptor[lattice] -printcode(CodeBlock(*initialize_equilibrium(D))) -#+END_SRC - -#+RESULTS: initialize-populations -: f_next[0] = T{0.0277777777777778}; -: f_next[1] = T{0.111111111111111}; -: f_next[2] = T{0.0277777777777778}; -: f_next[3] = T{0.111111111111111}; -: f_next[4] = T{0.444444444444444}; -: f_next[5] = T{0.111111111111111}; -: f_next[6] = T{0.0277777777777778}; -: f_next[7] = T{0.111111111111111}; -: f_next[8] = T{0.0277777777777778}; - -#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/initialize.h -#pragma once -#include - -struct InitializeO { - -using call_tag = tag::call_by_cell_id; - -template -__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid) { - <> -} - -template -__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid) { - <> -} - -}; -#+END_SRC - ** BGK Collision The BGK collision operators takes a current population $f^{curr}_i$ and /relaxes/ it toward the equilibrium distribution $f^{eq}_i$ with some rate $\tau$. The result of this process is the new population $f^{next}_i$. @@ -1524,21 +1477,29 @@ mapped into this address area. #+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { // per-population handle until cuMemMap accepts non-zero offset - cuMemCreate(&_handle[iPop], _volume, &_prop, 0); + cuMemCreate(&_handle[iPop], _volume, &_prop, 0); cuMemMap(_ptr + iPop * 2 * _volume, _volume, 0, _handle[iPop], 0); cuMemMap(_ptr + iPop * 2 * _volume + _volume, _volume, 0, _handle[iPop], 0); } #+END_SRC Actually reading from and writing to locations within this memory depends on setting -the correct access flags. Once this is done we are ready to zero-initialize the buffer. +the correct access flags #+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h _access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; _access.location.id = 0; _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1); - cuMemsetD8(_ptr, 0, 2 * _volume * DESCRIPTOR::q); +#+END_SRC + +after which we are ready to initialize the buffer with lattice equilibrium values. + +#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h + for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { + float eq = descriptor::weight(iPop); + cuMemsetD32(_ptr + iPop * 2 * _volume, *reinterpret_cast(&eq), 2 * (_volume / sizeof(S))); + } #+END_SRC As the rotation of the cyclic arrays is to be realized by shifting the per-population start pointers @@ -2521,7 +2482,6 @@ various methods for applying operators and functors. #include "operator.h" #include "propagate.h" -#include "kernel/initialize.h" #include "kernel/executor.h" template @@ -3942,7 +3902,7 @@ __device__ static void apply( , std::size_t iX , std::size_t iY , std::size_t iZ - , T* cell_rho + , T* cell_rho , T* cell_u , T* cell_curl_norm , cudaSurfaceObject_t surface @@ -5116,15 +5076,6 @@ auto wall_mask = materials.mask_of_material(2); auto lid_mask = materials.mask_of_material(3); #+END_SRC -All cells are initialized to the equilibrium distribution. - -#+BEGIN_SRC cpp :tangle tangle/ldc-2d.cu -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), wall_mask), - Operator(InitializeO(), lid_mask)); -cudaDeviceSynchronize(); -#+END_SRC - The bulk collisions are going to use a relaxation time of \(0.51\) and the lid enacts a velocity of \(0.05\) lattice units. This maximum velocity can be used to scale the velocity norm for visualization. @@ -5228,18 +5179,13 @@ CellMaterials materials(cuboid, [&cuboid](uint3 p) -> int { }); #+END_SRC -At this point we are ready to generate masks for operator application and to -initialize all cells with their equilibrium. +At this point we are ready to generate masks for operator application. #+BEGIN_SRC cpp :tangle tangle/ldc-3d.cu auto bulk_mask = materials.mask_of_material(1); auto wall_mask = materials.mask_of_material(2); auto lid_mask = materials.mask_of_material(3); -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), wall_mask), - Operator(InitializeO(), lid_mask)); - cudaDeviceSynchronize(); #+END_SRC @@ -5369,15 +5315,7 @@ auto outflow_mask = materials.mask_of_material(4); auto edge_mask = materials.mask_of_material(5); #+END_SRC -The last step prior to starting the simulation loop is to initialize -all cells with their equilibrium values. - #+BEGIN_SRC cpp :tangle tangle/magnus.cu -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), wall_mask), - Operator(InitializeO(), inflow_mask), - Operator(InitializeO(), outflow_mask), - Operator(InitializeO(), edge_mask)); cudaDeviceSynchronize(); #+END_SRC @@ -5538,13 +5476,6 @@ auto inflow_mask = materials.mask_of_material(4); auto outflow_mask = materials.mask_of_material(5); auto edge_mask = materials.mask_of_material(6); -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), wall_mask_z), - Operator(InitializeO(), wall_mask_y), - Operator(InitializeO(), inflow_mask), - Operator(InitializeO(), outflow_mask), - Operator(InitializeO(), edge_mask)); - cudaDeviceSynchronize(); #+END_SRC @@ -5662,9 +5593,6 @@ auto bulk_list = materials.list_of_material(1); auto wall_mask = materials.mask_of_material(2); auto wall_list = materials.list_of_material(2); -lattice.apply(bulk_list); -lattice.apply(wall_list); - cudaDeviceSynchronize(); #+END_SRC @@ -5770,11 +5698,6 @@ auto boundary_mask = materials.mask_of_material(2); auto inflow_mask = materials.mask_of_material(3); auto outflow_mask = materials.mask_of_material(4); -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), boundary_mask), - Operator(InitializeO(), inflow_mask), - Operator(InitializeO(), outflow_mask)); - cudaDeviceSynchronize(); #+END_SRC @@ -5895,14 +5818,6 @@ auto bulk_mask = materials.mask_of_material(1); auto box_mask = materials.mask_of_material(2); auto lid_mask = materials.mask_of_material(3); -auto bulk_cells = materials.list_of_material(1); -auto box_cells = materials.list_of_material(2); -auto lid_cells = materials.list_of_material(3); - -lattice.template apply(bulk_cells); -lattice.template apply(box_cells); -lattice.template apply(lid_cells); - cudaDeviceSynchronize(); #+END_SRC diff --git a/tangle/LLBM/kernel/collect_q_criterion.h b/tangle/LLBM/kernel/collect_q_criterion.h index fa19dc7..6f770e5 100644 --- a/tangle/LLBM/kernel/collect_q_criterion.h +++ b/tangle/LLBM/kernel/collect_q_criterion.h @@ -14,7 +14,7 @@ __device__ static void apply( , std::size_t iX , std::size_t iY , std::size_t iZ - , T* cell_rho + , T* cell_rho , T* cell_u , T* cell_curl_norm , cudaSurfaceObject_t surface diff --git a/tangle/LLBM/kernel/initialize.h b/tangle/LLBM/kernel/initialize.h deleted file mode 100644 index 221b9ad..0000000 --- a/tangle/LLBM/kernel/initialize.h +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once -#include - -struct InitializeO { - -using call_tag = tag::call_by_cell_id; - -template -__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid) { - f_next[0] = T{0.0277777777777778}; - f_next[1] = T{0.111111111111111}; - f_next[2] = T{0.0277777777777778}; - f_next[3] = T{0.111111111111111}; - f_next[4] = T{0.444444444444444}; - f_next[5] = T{0.111111111111111}; - f_next[6] = T{0.0277777777777778}; - f_next[7] = T{0.111111111111111}; - f_next[8] = T{0.0277777777777778}; -} - -template -__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid) { - f_next[0] = T{0.0277777777777778}; - f_next[1] = T{0.0277777777777778}; - f_next[2] = T{0.0555555555555556}; - f_next[3] = T{0.0277777777777778}; - f_next[4] = T{0.0277777777777778}; - f_next[5] = T{0.0277777777777778}; - f_next[6] = T{0.0555555555555556}; - f_next[7] = T{0.0277777777777778}; - f_next[8] = T{0.0555555555555556}; - f_next[9] = T{0.333333333333333}; - f_next[10] = T{0.0555555555555556}; - f_next[11] = T{0.0277777777777778}; - f_next[12] = T{0.0555555555555556}; - f_next[13] = T{0.0277777777777778}; - f_next[14] = T{0.0277777777777778}; - f_next[15] = T{0.0277777777777778}; - f_next[16] = T{0.0555555555555556}; - f_next[17] = T{0.0277777777777778}; - f_next[18] = T{0.0277777777777778}; -} - -}; diff --git a/tangle/LLBM/lattice.h b/tangle/LLBM/lattice.h index 8ba1d66..7157c78 100644 --- a/tangle/LLBM/lattice.h +++ b/tangle/LLBM/lattice.h @@ -5,7 +5,6 @@ #include "operator.h" #include "propagate.h" -#include "kernel/initialize.h" #include "kernel/executor.h" template diff --git a/tangle/LLBM/propagate.h b/tangle/LLBM/propagate.h index d63ccd8..acb1d6c 100644 --- a/tangle/LLBM/propagate.h +++ b/tangle/LLBM/propagate.h @@ -79,7 +79,7 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { // per-population handle until cuMemMap accepts non-zero offset - cuMemCreate(&_handle[iPop], _volume, &_prop, 0); + cuMemCreate(&_handle[iPop], _volume, &_prop, 0); cuMemMap(_ptr + iPop * 2 * _volume, _volume, 0, _handle[iPop], 0); cuMemMap(_ptr + iPop * 2 * _volume + _volume, _volume, 0, _handle[iPop], 0); } @@ -88,7 +88,11 @@ CyclicPopulationBuffer::CyclicPopulationBuffer( _access.location.id = 0; _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1); - cuMemsetD8(_ptr, 0, 2 * _volume * DESCRIPTOR::q); + + for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { + float eq = descriptor::weight(iPop); + cuMemsetD32(_ptr + iPop * 2 * _volume, *reinterpret_cast(&eq), 2 * (_volume / sizeof(S))); + } for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) { _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S)); diff --git a/tangle/benchmark-ldc.cu b/tangle/benchmark-ldc.cu index 2443afe..4de0ae5 100644 --- a/tangle/benchmark-ldc.cu +++ b/tangle/benchmark-ldc.cu @@ -30,14 +30,6 @@ void simulate(descriptor::Cuboid cuboid, std::size_t nStep) { auto box_mask = materials.mask_of_material(2); auto lid_mask = materials.mask_of_material(3); - auto bulk_cells = materials.list_of_material(1); - auto box_cells = materials.list_of_material(2); - auto lid_cells = materials.list_of_material(3); - - lattice.template apply(bulk_cells); - lattice.template apply(box_cells); - lattice.template apply(lid_cells); - cudaDeviceSynchronize(); for (std::size_t iStep=0; iStep < 100; ++iStep) { diff --git a/tangle/channel-with-sphere.cu b/tangle/channel-with-sphere.cu index 401f0c9..29cc7de 100644 --- a/tangle/channel-with-sphere.cu +++ b/tangle/channel-with-sphere.cu @@ -55,13 +55,6 @@ auto inflow_mask = materials.mask_of_material(4); auto outflow_mask = materials.mask_of_material(5); auto edge_mask = materials.mask_of_material(6); -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), wall_mask_z), - Operator(InitializeO(), wall_mask_y), - Operator(InitializeO(), inflow_mask), - Operator(InitializeO(), outflow_mask), - Operator(InitializeO(), edge_mask)); - cudaDeviceSynchronize(); VolumetricExample renderer(cuboid); diff --git a/tangle/ldc-2d.cu b/tangle/ldc-2d.cu index 8989374..acba98f 100644 --- a/tangle/ldc-2d.cu +++ b/tangle/ldc-2d.cu @@ -32,11 +32,6 @@ auto bulk_mask = materials.mask_of_material(1); auto wall_mask = materials.mask_of_material(2); auto lid_mask = materials.mask_of_material(3); -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), wall_mask), - Operator(InitializeO(), lid_mask)); -cudaDeviceSynchronize(); - const float tau = 0.51; const float u_lid = 0.05; diff --git a/tangle/ldc-3d.cu b/tangle/ldc-3d.cu index ece1234..e9b42f2 100644 --- a/tangle/ldc-3d.cu +++ b/tangle/ldc-3d.cu @@ -34,10 +34,6 @@ auto bulk_mask = materials.mask_of_material(1); auto wall_mask = materials.mask_of_material(2); auto lid_mask = materials.mask_of_material(3); -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), wall_mask), - Operator(InitializeO(), lid_mask)); - cudaDeviceSynchronize(); auto none = [] __device__ (float3) -> float { return 1; }; diff --git a/tangle/magnus.cu b/tangle/magnus.cu index 5800cd8..08a4515 100644 --- a/tangle/magnus.cu +++ b/tangle/magnus.cu @@ -64,11 +64,6 @@ auto inflow_mask = materials.mask_of_material(3); auto outflow_mask = materials.mask_of_material(4); auto edge_mask = materials.mask_of_material(5); -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), wall_mask), - Operator(InitializeO(), inflow_mask), - Operator(InitializeO(), outflow_mask), - Operator(InitializeO(), edge_mask)); cudaDeviceSynchronize(); RenderWindow window("Magnus"); diff --git a/tangle/nozzle.cu b/tangle/nozzle.cu index 5a1b1f3..03c18f9 100644 --- a/tangle/nozzle.cu +++ b/tangle/nozzle.cu @@ -46,11 +46,6 @@ auto boundary_mask = materials.mask_of_material(2); auto inflow_mask = materials.mask_of_material(3); auto outflow_mask = materials.mask_of_material(4); -lattice.apply(Operator(InitializeO(), bulk_mask), - Operator(InitializeO(), boundary_mask), - Operator(InitializeO(), inflow_mask), - Operator(InitializeO(), outflow_mask)); - cudaDeviceSynchronize(); VolumetricExample renderer(cuboid); diff --git a/tangle/taylor-couette.cu b/tangle/taylor-couette.cu index 48b0d87..2e69bfb 100644 --- a/tangle/taylor-couette.cu +++ b/tangle/taylor-couette.cu @@ -55,9 +55,6 @@ auto bulk_list = materials.list_of_material(1); auto wall_mask = materials.mask_of_material(2); auto wall_list = materials.list_of_material(2); -lattice.apply(bulk_list); -lattice.apply(wall_list); - cudaDeviceSynchronize(); VolumetricExample renderer(cuboid); -- cgit v1.2.3