From 8691f6f7306914d8fc9d5afc8a347ebf5ce0a7d7 Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Mon, 23 Aug 2021 22:04:50 +0200
Subject: Replace InitializeO by memset

---
 lbm.org                                  | 113 ++++---------------------------
 tangle/LLBM/kernel/collect_q_criterion.h |   2 +-
 tangle/LLBM/kernel/initialize.h          |  44 ------------
 tangle/LLBM/lattice.h                    |   1 -
 tangle/LLBM/propagate.h                  |   8 ++-
 tangle/benchmark-ldc.cu                  |   8 ---
 tangle/channel-with-sphere.cu            |   7 --
 tangle/ldc-2d.cu                         |   5 --
 tangle/ldc-3d.cu                         |   4 --
 tangle/magnus.cu                         |   5 --
 tangle/nozzle.cu                         |   5 --
 tangle/taylor-couette.cu                 |   3 -
 12 files changed, 21 insertions(+), 184 deletions(-)
 delete mode 100644 tangle/LLBM/kernel/initialize.h
diff --git a/lbm.org b/lbm.org
index 07c2963..afe1d23 100644
--- a/lbm.org
+++ b/lbm.org
@@ -261,7 +261,7 @@ class CodeBlockPrinter(C11CodePrinter):
         self.custom_assignment = custom_assignment
         for f in custom_functions:
             self._kf[f] = f
-     
+    
     def _print_Indexed(self, expr):
         assert len(expr.indices) == 1
         if expr.base.name[0] == 'f':
@@ -552,53 +552,6 @@ f_next[7] = e11*(e14 + e2);
 f_next[8] = e0*(e13 + e16 + e2 + e4);
 #+end_example
 
-For initialization of lattice data it often makes sense to choose values that are invariant under this
-equilibrium computation. Cells initialized in this way will not change during the collision step
-and as such can be considered to model a fluid /at rest/.
-
-#+BEGIN_SRC python :session :results none
-def initialize_equilibrium(D):
-    return [ Assignment(IndexedBase("f_next", D.q)[i], w_i) for i, w_i in enumerate(D.w) ]
-#+END_SRC
-
-#+NAME: initialize-populations
-#+BEGIN_SRC python :session :results output
-D = descriptor[lattice]
-printcode(CodeBlock(*initialize_equilibrium(D)))
-#+END_SRC
-
-#+RESULTS: initialize-populations
-: f_next[0] = T{0.0277777777777778};
-: f_next[1] = T{0.111111111111111};
-: f_next[2] = T{0.0277777777777778};
-: f_next[3] = T{0.111111111111111};
-: f_next[4] = T{0.444444444444444};
-: f_next[5] = T{0.111111111111111};
-: f_next[6] = T{0.0277777777777778};
-: f_next[7] = T{0.111111111111111};
-: f_next[8] = T{0.0277777777777778};
-
-#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/initialize.h
-#pragma once
-#include <LLBM/call_tag.h>
-
-struct InitializeO {
-
-using call_tag = tag::call_by_cell_id;
-
-template <typename T, typename S>
-__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid) {
-  <<initialize-populations(lattice="D2Q9")>>
-}
-
-template <typename T, typename S>
-__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid) {
-  <<initialize-populations(lattice="D3Q19")>>
-}
-
-};
-#+END_SRC
-
 ** BGK Collision
 The BGK collision operators takes a current population $f^{curr}_i$ and /relaxes/ it toward the equilibrium distribution
 $f^{eq}_i$ with some rate $\tau$. The result of this process is the new population $f^{next}_i$.
@@ -1524,21 +1477,29 @@ mapped into this address area.
 #+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
   for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
     // per-population handle until cuMemMap accepts non-zero offset
-    cuMemCreate(&_handle[iPop], _volume, &_prop, 0); 
+    cuMemCreate(&_handle[iPop], _volume, &_prop, 0);
     cuMemMap(_ptr + iPop * 2 * _volume,           _volume, 0, _handle[iPop], 0);
     cuMemMap(_ptr + iPop * 2 * _volume + _volume, _volume, 0, _handle[iPop], 0);
   }
 #+END_SRC
 
 Actually reading from and writing to locations within this memory depends on setting
-the correct access flags. Once this is done we are ready to zero-initialize the buffer.
+the correct access flags
 
 #+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
   _access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
   _access.location.id = 0;
   _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1);
-  cuMemsetD8(_ptr, 0, 2 * _volume * DESCRIPTOR::q);
+#+END_SRC
+
+after which we are ready to initialize the buffer with lattice equilibrium values.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    float eq = descriptor::weight<DESCRIPTOR>(iPop);
+    cuMemsetD32(_ptr + iPop * 2 * _volume, *reinterpret_cast<int*>(&eq), 2 * (_volume / sizeof(S)));
+  }
 #+END_SRC
 
 As the rotation of the cyclic arrays is to be realized by shifting the per-population start pointers
@@ -2521,7 +2482,6 @@ various methods for applying operators and functors.
 #include "operator.h"
 
 #include "propagate.h"
-#include "kernel/initialize.h"
 #include "kernel/executor.h"
 
 template <typename DESCRIPTOR, typename T, typename S=T>
@@ -3942,7 +3902,7 @@ __device__ static void apply(
   , std::size_t iX
   , std::size_t iY
   , std::size_t iZ
-  , T* cell_rho 
+  , T* cell_rho
   , T* cell_u
   , T* cell_curl_norm
   , cudaSurfaceObject_t surface
@@ -5116,15 +5076,6 @@ auto wall_mask = materials.mask_of_material(2);
 auto lid_mask  = materials.mask_of_material(3);
 #+END_SRC
 
-All cells are initialized to the equilibrium distribution.
-
-#+BEGIN_SRC cpp :tangle tangle/ldc-2d.cu
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), wall_mask),
-              Operator(InitializeO(), lid_mask));
-cudaDeviceSynchronize();
-#+END_SRC
-
 The bulk collisions are going to use a relaxation time of \(0.51\) and the lid enacts a velocity of \(0.05\) lattice units.
 This maximum velocity can be used to scale the velocity norm for visualization.
 
@@ -5228,18 +5179,13 @@ CellMaterials<DESCRIPTOR> materials(cuboid, [&cuboid](uint3 p) -> int {
 });
 #+END_SRC
 
-At this point we are ready to generate masks for operator application and to
-initialize all cells with their equilibrium.
+At this point we are ready to generate masks for operator application.
 
 #+BEGIN_SRC cpp :tangle tangle/ldc-3d.cu
 auto bulk_mask = materials.mask_of_material(1);
 auto wall_mask = materials.mask_of_material(2);
 auto lid_mask  = materials.mask_of_material(3);
 
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), wall_mask),
-              Operator(InitializeO(), lid_mask));
-
 cudaDeviceSynchronize();
 #+END_SRC
 
@@ -5369,15 +5315,7 @@ auto outflow_mask = materials.mask_of_material(4);
 auto edge_mask = materials.mask_of_material(5);
 #+END_SRC
 
-The last step prior to starting the simulation loop is to initialize
-all cells with their equilibrium values.
-
 #+BEGIN_SRC cpp :tangle tangle/magnus.cu
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), wall_mask),
-              Operator(InitializeO(), inflow_mask),
-              Operator(InitializeO(), outflow_mask),
-              Operator(InitializeO(), edge_mask));
 cudaDeviceSynchronize();
 #+END_SRC
 
@@ -5538,13 +5476,6 @@ auto inflow_mask  = materials.mask_of_material(4);
 auto outflow_mask = materials.mask_of_material(5);
 auto edge_mask    = materials.mask_of_material(6);
 
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), wall_mask_z),
-              Operator(InitializeO(), wall_mask_y),
-              Operator(InitializeO(), inflow_mask),
-              Operator(InitializeO(), outflow_mask),
-              Operator(InitializeO(), edge_mask));
-
 cudaDeviceSynchronize();
 #+END_SRC
 
@@ -5662,9 +5593,6 @@ auto bulk_list = materials.list_of_material(1);
 auto wall_mask = materials.mask_of_material(2);
 auto wall_list = materials.list_of_material(2);
 
-lattice.apply<InitializeO>(bulk_list);
-lattice.apply<InitializeO>(wall_list);
-
 cudaDeviceSynchronize();
 #+END_SRC
 
@@ -5770,11 +5698,6 @@ auto boundary_mask = materials.mask_of_material(2);
 auto inflow_mask   = materials.mask_of_material(3);
 auto outflow_mask  = materials.mask_of_material(4);
 
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), boundary_mask),
-              Operator(InitializeO(), inflow_mask),
-              Operator(InitializeO(), outflow_mask));
-
 cudaDeviceSynchronize();
 #+END_SRC
 
@@ -5895,14 +5818,6 @@ auto bulk_mask = materials.mask_of_material(1);
 auto box_mask  = materials.mask_of_material(2);
 auto lid_mask  = materials.mask_of_material(3);
 
-auto bulk_cells = materials.list_of_material(1);
-auto box_cells  = materials.list_of_material(2);
-auto lid_cells  = materials.list_of_material(3);
-
-lattice.template apply<InitializeO>(bulk_cells);
-lattice.template apply<InitializeO>(box_cells);
-lattice.template apply<InitializeO>(lid_cells);
-
 cudaDeviceSynchronize();
 #+END_SRC
 
diff --git a/tangle/LLBM/kernel/collect_q_criterion.h b/tangle/LLBM/kernel/collect_q_criterion.h
index fa19dc7..6f770e5 100644
--- a/tangle/LLBM/kernel/collect_q_criterion.h
+++ b/tangle/LLBM/kernel/collect_q_criterion.h
@@ -14,7 +14,7 @@ __device__ static void apply(
   , std::size_t iX
   , std::size_t iY
   , std::size_t iZ
-  , T* cell_rho 
+  , T* cell_rho
   , T* cell_u
   , T* cell_curl_norm
   , cudaSurfaceObject_t surface
diff --git a/tangle/LLBM/kernel/initialize.h b/tangle/LLBM/kernel/initialize.h
deleted file mode 100644
index 221b9ad..0000000
--- a/tangle/LLBM/kernel/initialize.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#pragma once
-#include <LLBM/call_tag.h>
-
-struct InitializeO {
-
-using call_tag = tag::call_by_cell_id;
-
-template <typename T, typename S>
-__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid) {
-  f_next[0] = T{0.0277777777777778};
-  f_next[1] = T{0.111111111111111};
-  f_next[2] = T{0.0277777777777778};
-  f_next[3] = T{0.111111111111111};
-  f_next[4] = T{0.444444444444444};
-  f_next[5] = T{0.111111111111111};
-  f_next[6] = T{0.0277777777777778};
-  f_next[7] = T{0.111111111111111};
-  f_next[8] = T{0.0277777777777778};
-}
-
-template <typename T, typename S>
-__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid) {
-  f_next[0] = T{0.0277777777777778};
-  f_next[1] = T{0.0277777777777778};
-  f_next[2] = T{0.0555555555555556};
-  f_next[3] = T{0.0277777777777778};
-  f_next[4] = T{0.0277777777777778};
-  f_next[5] = T{0.0277777777777778};
-  f_next[6] = T{0.0555555555555556};
-  f_next[7] = T{0.0277777777777778};
-  f_next[8] = T{0.0555555555555556};
-  f_next[9] = T{0.333333333333333};
-  f_next[10] = T{0.0555555555555556};
-  f_next[11] = T{0.0277777777777778};
-  f_next[12] = T{0.0555555555555556};
-  f_next[13] = T{0.0277777777777778};
-  f_next[14] = T{0.0277777777777778};
-  f_next[15] = T{0.0277777777777778};
-  f_next[16] = T{0.0555555555555556};
-  f_next[17] = T{0.0277777777777778};
-  f_next[18] = T{0.0277777777777778};
-}
-
-};
diff --git a/tangle/LLBM/lattice.h b/tangle/LLBM/lattice.h
index 8ba1d66..7157c78 100644
--- a/tangle/LLBM/lattice.h
+++ b/tangle/LLBM/lattice.h
@@ -5,7 +5,6 @@
 #include "operator.h"
 
 #include "propagate.h"
-#include "kernel/initialize.h"
 #include "kernel/executor.h"
 
 template <typename DESCRIPTOR, typename T, typename S=T>
diff --git a/tangle/LLBM/propagate.h b/tangle/LLBM/propagate.h
index d63ccd8..acb1d6c 100644
--- a/tangle/LLBM/propagate.h
+++ b/tangle/LLBM/propagate.h
@@ -79,7 +79,7 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
 
   for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
     // per-population handle until cuMemMap accepts non-zero offset
-    cuMemCreate(&_handle[iPop], _volume, &_prop, 0); 
+    cuMemCreate(&_handle[iPop], _volume, &_prop, 0);
     cuMemMap(_ptr + iPop * 2 * _volume,           _volume, 0, _handle[iPop], 0);
     cuMemMap(_ptr + iPop * 2 * _volume + _volume, _volume, 0, _handle[iPop], 0);
   }
@@ -88,7 +88,11 @@ CyclicPopulationBuffer<DESCRIPTOR,S>::CyclicPopulationBuffer(
   _access.location.id = 0;
   _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1);
-  cuMemsetD8(_ptr, 0, 2 * _volume * DESCRIPTOR::q);
+
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    float eq = descriptor::weight<DESCRIPTOR>(iPop);
+    cuMemsetD32(_ptr + iPop * 2 * _volume, *reinterpret_cast<int*>(&eq), 2 * (_volume / sizeof(S)));
+  }
 
   for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
     _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S));
diff --git a/tangle/benchmark-ldc.cu b/tangle/benchmark-ldc.cu
index 2443afe..4de0ae5 100644
--- a/tangle/benchmark-ldc.cu
+++ b/tangle/benchmark-ldc.cu
@@ -30,14 +30,6 @@ void simulate(descriptor::Cuboid<DESCRIPTOR> cuboid, std::size_t nStep) {
   auto box_mask  = materials.mask_of_material(2);
   auto lid_mask  = materials.mask_of_material(3);
   
-  auto bulk_cells = materials.list_of_material(1);
-  auto box_cells  = materials.list_of_material(2);
-  auto lid_cells  = materials.list_of_material(3);
-  
-  lattice.template apply<InitializeO>(bulk_cells);
-  lattice.template apply<InitializeO>(box_cells);
-  lattice.template apply<InitializeO>(lid_cells);
-  
   cudaDeviceSynchronize();
 
   for (std::size_t iStep=0; iStep < 100; ++iStep) {
diff --git a/tangle/channel-with-sphere.cu b/tangle/channel-with-sphere.cu
index 401f0c9..29cc7de 100644
--- a/tangle/channel-with-sphere.cu
+++ b/tangle/channel-with-sphere.cu
@@ -55,13 +55,6 @@ auto inflow_mask  = materials.mask_of_material(4);
 auto outflow_mask = materials.mask_of_material(5);
 auto edge_mask    = materials.mask_of_material(6);
 
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), wall_mask_z),
-              Operator(InitializeO(), wall_mask_y),
-              Operator(InitializeO(), inflow_mask),
-              Operator(InitializeO(), outflow_mask),
-              Operator(InitializeO(), edge_mask));
-
 cudaDeviceSynchronize();
 
 VolumetricExample renderer(cuboid);
diff --git a/tangle/ldc-2d.cu b/tangle/ldc-2d.cu
index 8989374..acba98f 100644
--- a/tangle/ldc-2d.cu
+++ b/tangle/ldc-2d.cu
@@ -32,11 +32,6 @@ auto bulk_mask = materials.mask_of_material(1);
 auto wall_mask = materials.mask_of_material(2);
 auto lid_mask  = materials.mask_of_material(3);
 
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), wall_mask),
-              Operator(InitializeO(), lid_mask));
-cudaDeviceSynchronize();
-
 const float tau = 0.51;
 const float u_lid = 0.05;
 
diff --git a/tangle/ldc-3d.cu b/tangle/ldc-3d.cu
index ece1234..e9b42f2 100644
--- a/tangle/ldc-3d.cu
+++ b/tangle/ldc-3d.cu
@@ -34,10 +34,6 @@ auto bulk_mask = materials.mask_of_material(1);
 auto wall_mask = materials.mask_of_material(2);
 auto lid_mask  = materials.mask_of_material(3);
 
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), wall_mask),
-              Operator(InitializeO(), lid_mask));
-
 cudaDeviceSynchronize();
 
 auto none = [] __device__ (float3) -> float { return 1; };
diff --git a/tangle/magnus.cu b/tangle/magnus.cu
index 5800cd8..08a4515 100644
--- a/tangle/magnus.cu
+++ b/tangle/magnus.cu
@@ -64,11 +64,6 @@ auto inflow_mask  = materials.mask_of_material(3);
 auto outflow_mask = materials.mask_of_material(4);
 auto edge_mask = materials.mask_of_material(5);
 
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), wall_mask),
-              Operator(InitializeO(), inflow_mask),
-              Operator(InitializeO(), outflow_mask),
-              Operator(InitializeO(), edge_mask));
 cudaDeviceSynchronize();
 
 RenderWindow window("Magnus");
diff --git a/tangle/nozzle.cu b/tangle/nozzle.cu
index 5a1b1f3..03c18f9 100644
--- a/tangle/nozzle.cu
+++ b/tangle/nozzle.cu
@@ -46,11 +46,6 @@ auto boundary_mask = materials.mask_of_material(2);
 auto inflow_mask   = materials.mask_of_material(3);
 auto outflow_mask  = materials.mask_of_material(4);
 
-lattice.apply(Operator(InitializeO(), bulk_mask),
-              Operator(InitializeO(), boundary_mask),
-              Operator(InitializeO(), inflow_mask),
-              Operator(InitializeO(), outflow_mask));
-
 cudaDeviceSynchronize();
 
 VolumetricExample renderer(cuboid);
diff --git a/tangle/taylor-couette.cu b/tangle/taylor-couette.cu
index 48b0d87..2e69bfb 100644
--- a/tangle/taylor-couette.cu
+++ b/tangle/taylor-couette.cu
@@ -55,9 +55,6 @@ auto bulk_list = materials.list_of_material(1);
 auto wall_mask = materials.mask_of_material(2);
 auto wall_list = materials.list_of_material(2);
 
-lattice.apply<InitializeO>(bulk_list);
-lattice.apply<InitializeO>(wall_list);
-
 cudaDeviceSynchronize();
 
 VolumetricExample renderer(cuboid);
-- 
cgit v1.2.3