From 4ec94c97879aafef15f7663135745e4ba61e62cf Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Mon, 17 May 2021 00:15:33 +0200
Subject: Extract first public LiterateLB version

---
 tangle/LLBM/kernel/bounce_back.h                |  44 +++++
 tangle/LLBM/kernel/bounce_back_moving_wall.h    |  44 +++++
 tangle/LLBM/kernel/bouzidi.h                    |  39 ++++
 tangle/LLBM/kernel/collect_curl.h               |  44 +++++
 tangle/LLBM/kernel/collect_moments.h            |  45 +++++
 tangle/LLBM/kernel/collect_q_criterion.h        | 104 +++++++++++
 tangle/LLBM/kernel/collect_shear_layer_normal.h | 139 +++++++++++++++
 tangle/LLBM/kernel/collect_streamlines.h        |  48 +++++
 tangle/LLBM/kernel/collect_velocity_norm.h      |  61 +++++++
 tangle/LLBM/kernel/collide.h                    | 131 ++++++++++++++
 tangle/LLBM/kernel/equilibrium_density_wall.h   | 226 ++++++++++++++++++++++++
 tangle/LLBM/kernel/equilibrium_velocity_wall.h  | 222 +++++++++++++++++++++++
 tangle/LLBM/kernel/executor.h                   | 171 ++++++++++++++++++
 tangle/LLBM/kernel/free_slip.h                  |  82 +++++++++
 tangle/LLBM/kernel/initialize.h                 |  44 +++++
 tangle/LLBM/kernel/propagate.h                  |  20 +++
 tangle/LLBM/kernel/smagorinsky_collide.h        | 183 +++++++++++++++++++
 17 files changed, 1647 insertions(+)
 create mode 100644 tangle/LLBM/kernel/bounce_back.h
 create mode 100644 tangle/LLBM/kernel/bounce_back_moving_wall.h
 create mode 100644 tangle/LLBM/kernel/bouzidi.h
 create mode 100644 tangle/LLBM/kernel/collect_curl.h
 create mode 100644 tangle/LLBM/kernel/collect_moments.h
 create mode 100644 tangle/LLBM/kernel/collect_q_criterion.h
 create mode 100644 tangle/LLBM/kernel/collect_shear_layer_normal.h
 create mode 100644 tangle/LLBM/kernel/collect_streamlines.h
 create mode 100644 tangle/LLBM/kernel/collect_velocity_norm.h
 create mode 100644 tangle/LLBM/kernel/collide.h
 create mode 100644 tangle/LLBM/kernel/equilibrium_density_wall.h
 create mode 100644 tangle/LLBM/kernel/equilibrium_velocity_wall.h
 create mode 100644 tangle/LLBM/kernel/executor.h
 create mode 100644 tangle/LLBM/kernel/free_slip.h
 create mode 100644 tangle/LLBM/kernel/initialize.h
 create mode 100644 tangle/LLBM/kernel/propagate.h
 create mode 100644 tangle/LLBM/kernel/smagorinsky_collide.h

(limited to 'tangle/LLBM/kernel')
diff --git a/tangle/LLBM/kernel/bounce_back.h b/tangle/LLBM/kernel/bounce_back.h
new file mode 100644
index 0000000..d7ec4a7
--- /dev/null
+++ b/tangle/LLBM/kernel/bounce_back.h
@@ -0,0 +1,44 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct BounceBackO {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid) {
+  f_next[0] = f_curr[8];
+  f_next[1] = f_curr[7];
+  f_next[2] = f_curr[6];
+  f_next[3] = f_curr[5];
+  f_next[4] = f_curr[4];
+  f_next[5] = f_curr[3];
+  f_next[6] = f_curr[2];
+  f_next[7] = f_curr[1];
+  f_next[8] = f_curr[0];
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid) {
+  f_next[0] = f_curr[18];
+  f_next[1] = f_curr[17];
+  f_next[2] = f_curr[16];
+  f_next[3] = f_curr[15];
+  f_next[4] = f_curr[14];
+  f_next[5] = f_curr[13];
+  f_next[6] = f_curr[12];
+  f_next[7] = f_curr[11];
+  f_next[8] = f_curr[10];
+  f_next[9] = f_curr[9];
+  f_next[10] = f_curr[8];
+  f_next[11] = f_curr[7];
+  f_next[12] = f_curr[6];
+  f_next[13] = f_curr[5];
+  f_next[14] = f_curr[4];
+  f_next[15] = f_curr[3];
+  f_next[16] = f_curr[2];
+  f_next[17] = f_curr[1];
+  f_next[18] = f_curr[0];
+}
+
+};
diff --git a/tangle/LLBM/kernel/bounce_back_moving_wall.h b/tangle/LLBM/kernel/bounce_back_moving_wall.h
new file mode 100644
index 0000000..c4b40f9
--- /dev/null
+++ b/tangle/LLBM/kernel/bounce_back_moving_wall.h
@@ -0,0 +1,44 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct BounceBackMovingWallO {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T u_0, T u_1) {
+  f_next[0] = -T{0.166666666666667}*u_0 - T{0.166666666666667}*u_1 + f_curr[8];
+  f_next[1] = -T{0.666666666666667}*u_0 + f_curr[7];
+  f_next[2] = -T{0.166666666666667}*u_0 + T{0.166666666666667}*u_1 + f_curr[6];
+  f_next[3] = -T{0.666666666666667}*u_1 + f_curr[5];
+  f_next[4] = f_curr[4];
+  f_next[5] = T{0.666666666666667}*u_1 + f_curr[3];
+  f_next[6] = T{0.166666666666667}*u_0 - T{0.166666666666667}*u_1 + f_curr[2];
+  f_next[7] = T{0.666666666666667}*u_0 + f_curr[1];
+  f_next[8] = T{0.166666666666667}*u_0 + T{0.166666666666667}*u_1 + f_curr[0];
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T u_0, T u_1, T u_2) {
+  f_next[0] = T{0.166666666666667}*u_1 + T{0.166666666666667}*u_2 + f_curr[18];
+  f_next[1] = -T{0.166666666666667}*u_0 + T{0.166666666666667}*u_2 + f_curr[17];
+  f_next[2] = T{0.333333333333333}*u_2 + f_curr[16];
+  f_next[3] = T{0.166666666666667}*u_0 + T{0.166666666666667}*u_2 + f_curr[15];
+  f_next[4] = -T{0.166666666666667}*u_1 + T{0.166666666666667}*u_2 + f_curr[14];
+  f_next[5] = -T{0.166666666666667}*u_0 + T{0.166666666666667}*u_1 + f_curr[13];
+  f_next[6] = T{0.333333333333333}*u_1 + f_curr[12];
+  f_next[7] = T{0.166666666666667}*u_0 + T{0.166666666666667}*u_1 + f_curr[11];
+  f_next[8] = -T{0.333333333333333}*u_0 + f_curr[10];
+  f_next[9] = f_curr[9];
+  f_next[10] = T{0.333333333333333}*u_0 + f_curr[8];
+  f_next[11] = -T{0.166666666666667}*u_0 - T{0.166666666666667}*u_1 + f_curr[7];
+  f_next[12] = -T{0.333333333333333}*u_1 + f_curr[6];
+  f_next[13] = T{0.166666666666667}*u_0 - T{0.166666666666667}*u_1 + f_curr[5];
+  f_next[14] = T{0.166666666666667}*u_1 - T{0.166666666666667}*u_2 + f_curr[4];
+  f_next[15] = -T{0.166666666666667}*u_0 - T{0.166666666666667}*u_2 + f_curr[3];
+  f_next[16] = -T{0.333333333333333}*u_2 + f_curr[2];
+  f_next[17] = T{0.166666666666667}*u_0 - T{0.166666666666667}*u_2 + f_curr[1];
+  f_next[18] = -T{0.166666666666667}*u_1 - T{0.166666666666667}*u_2 + f_curr[0];
+}
+
+};
diff --git a/tangle/LLBM/kernel/bouzidi.h b/tangle/LLBM/kernel/bouzidi.h
new file mode 100644
index 0000000..b1abd2c
--- /dev/null
+++ b/tangle/LLBM/kernel/bouzidi.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <LLBM/call_tag.h>
+#include <LLBM/lattice.h>
+
+template <typename S>
+struct BouzidiConfig {
+  std::size_t* boundary; // boundary cell to be interpolated
+  std::size_t* solid;    // adjacent solid cell
+  std::size_t* fluid;    // adjacent fluid cell
+  S* distance;           // precomputed distance factor q
+  S* correction;         // correction for moving walls
+  pop_index_t* missing;  // population to be reconstructed
+};
+
+struct BouzidiO {
+
+using call_tag = tag::call_by_list_index;
+
+template <typename T, typename S, typename DESCRIPTOR>
+__device__ static void apply(
+    LatticeView<DESCRIPTOR,S> lattice
+  , std::size_t index
+  , std::size_t count
+  , BouzidiConfig<S> config
+) {
+  pop_index_t& iPop = config.missing[index];
+  pop_index_t  jPop = descriptor::opposite<DESCRIPTOR>(iPop);
+  pop_index_t  kPop = config.boundary[index] == config.fluid[index] ? iPop : jPop;
+
+  S f_bound_j = *lattice.pop(jPop, config.boundary[index]);
+  S f_fluid_j = *lattice.pop(kPop, config.fluid[index]);
+  S* f_next_i =  lattice.pop(iPop, config.solid[index]);
+
+  *f_next_i = config.distance[index] * f_bound_j
+            + (1. - config.distance[index]) * f_fluid_j
+            + config.correction[index];
+}
+
+};
diff --git a/tangle/LLBM/kernel/collect_curl.h b/tangle/LLBM/kernel/collect_curl.h
new file mode 100644
index 0000000..054b359
--- /dev/null
+++ b/tangle/LLBM/kernel/collect_curl.h
@@ -0,0 +1,44 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct CollectCurlF {
+
+using call_tag = tag::call_by_spatial_cell_mask;
+
+template <typename T, typename S>
+__device__ static void apply(
+    descriptor::D3Q19
+  , S f_curr[19]
+  , descriptor::CuboidD<3> cuboid
+  , std::size_t gid
+  , std::size_t iX
+  , std::size_t iY
+  , std::size_t iZ
+  , S* moments_u
+  , cudaSurfaceObject_t surface
+  , S* curl_norm = nullptr
+) {
+  auto u_x = [moments_u,cuboid,gid] __device__ (int x, int y, int z) -> T {
+    return moments_u[3*(gid + descriptor::offset(cuboid,x,y,z)) + 0];
+  };
+  auto u_y = [moments_u,cuboid,gid] __device__ (int x, int y, int z) -> T {
+    return moments_u[3*(gid + descriptor::offset(cuboid,x,y,z)) + 1];
+  };
+  auto u_z = [moments_u,cuboid,gid] __device__ (int x, int y, int z) -> T {
+    return moments_u[3*(gid + descriptor::offset(cuboid,x,y,z)) + 2];
+  };
+
+  T curl_0 = T{0.500000000000000}*u_y(0, 0, -1) - T{0.500000000000000}*u_y(0, 0, 1) - T{0.500000000000000}*u_z(0, -1, 0) + T{0.500000000000000}*u_z(0, 1, 0);
+  T curl_1 = -T{0.500000000000000}*u_x(0, 0, -1) + T{0.500000000000000}*u_x(0, 0, 1) + T{0.500000000000000}*u_z(-1, 0, 0) - T{0.500000000000000}*u_z(1, 0, 0);
+  T curl_2 = T{0.500000000000000}*u_x(0, -1, 0) - T{0.500000000000000}*u_x(0, 1, 0) - T{0.500000000000000}*u_y(-1, 0, 0) + T{0.500000000000000}*u_y(1, 0, 0);
+  float3 curl = make_float3(curl_0, curl_1, curl_2);
+  float norm = length(curl);
+
+  surf3Dwrite(norm, surface, iX*sizeof(float), iY, iZ);
+
+  if (curl_norm != nullptr) {
+    curl_norm[gid] = norm; 
+  }
+}
+
+};
diff --git a/tangle/LLBM/kernel/collect_moments.h b/tangle/LLBM/kernel/collect_moments.h
new file mode 100644
index 0000000..b2ef09b
--- /dev/null
+++ b/tangle/LLBM/kernel/collect_moments.h
@@ -0,0 +1,45 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct CollectMomentsF {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], std::size_t gid, T* cell_rho, T* cell_u) {
+  T m0 = f_curr[1] + f_curr[2];
+  T m1 = f_curr[3] + f_curr[6];
+  T m2 = m0 + m1 + f_curr[0] + f_curr[4] + f_curr[5] + f_curr[7] + f_curr[8];
+  T m3 = f_curr[0] - f_curr[8];
+  T m4 = T{1} / (m2);
+  T rho = m2;
+  T u_0 = -m4*(m0 + m3 - f_curr[6] - f_curr[7]);
+  T u_1 = -m4*(m1 + m3 - f_curr[2] - f_curr[5]);
+
+  cell_rho[gid] = rho;
+  cell_u[2*gid+0] = u_0;
+  cell_u[2*gid+1] = u_1;
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], std::size_t gid, T* cell_rho, T* cell_u) {
+  T m0 = f_curr[10] + f_curr[13] + f_curr[17];
+  T m1 = f_curr[14] + f_curr[5] + f_curr[6];
+  T m2 = f_curr[1] + f_curr[2] + f_curr[4];
+  T m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9];
+  T m4 = -f_curr[11] + f_curr[7];
+  T m5 = -f_curr[15] + f_curr[3];
+  T m6 = T{1} / (m3);
+  T m7 = f_curr[0] - f_curr[18];
+  T rho = m3;
+  T u_0 = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]);
+  T u_1 = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]);
+  T u_2 = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]);
+
+  cell_rho[gid] = rho;
+  cell_u[3*gid+0] = u_0;
+  cell_u[3*gid+1] = u_1;
+  cell_u[3*gid+2] = u_2;
+}
+
+};
diff --git a/tangle/LLBM/kernel/collect_q_criterion.h b/tangle/LLBM/kernel/collect_q_criterion.h
new file mode 100644
index 0000000..19b7f68
--- /dev/null
+++ b/tangle/LLBM/kernel/collect_q_criterion.h
@@ -0,0 +1,104 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct CollectQCriterionF {
+
+using call_tag = tag::call_by_spatial_cell_mask;
+
+template <typename T, typename S>
+__device__ static void apply(
+    descriptor::D3Q19
+  , S f_curr[19]
+  , descriptor::CuboidD<3> cuboid
+  , std::size_t gid
+  , std::size_t iX
+  , std::size_t iY
+  , std::size_t iZ
+  , T* cell_rho 
+  , T* cell_u
+  , T* cell_curl_norm
+  , cudaSurfaceObject_t surface
+  , T* cell_q = nullptr
+) {
+  const T rho = cell_rho[gid];
+  const T u_0 = cell_u[3*gid + 0];
+  const T u_1 = cell_u[3*gid + 1];
+  const T u_2 = cell_u[3*gid + 2];
+
+  T x0 = T{72.0000000000000}*f_curr[5];
+  T x1 = T{72.0000000000000}*f_curr[13];
+  T x2 = T{6.00000000000000}*u_1;
+  T x3 = -x2;
+  T x4 = -u_0;
+  T x5 = x4 + u_1;
+  T x6 = u_1*u_1;
+  T x7 = T{3.00000000000000}*x6;
+  T x8 = u_0*u_0;
+  T x9 = T{3.00000000000000}*x8;
+  T x10 = x9 + T{-2.00000000000000};
+  T x11 = x10 + x7;
+  T x12 = u_2*u_2;
+  T x13 = T{3.00000000000000}*x12;
+  T x14 = T{6.00000000000000}*u_0;
+  T x15 = x13 + x14;
+  T x16 = rho*(x11 + x15 + x3 - T{9.00000000000000}*x5*x5);
+  T x17 = -x14;
+  T x18 = -u_1;
+  T x19 = x18 + u_0;
+  T x20 = x13 + x2;
+  T x21 = x11 + x20;
+  T x22 = rho*(x17 + x21 - T{9.00000000000000}*x19*x19);
+  T x23 = u_0 + u_1;
+  T x24 = T{9.00000000000000}*(x23*x23);
+  T x25 = -x7;
+  T x26 = T{2.00000000000000} - x9;
+  T x27 = x25 + x26;
+  T x28 = -x13;
+  T x29 = x2 + x28;
+  T x30 = -rho*(x14 + x21 - x24) + rho*(x14 + x24 + x27 + x29) - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7];
+  T x31 = T{72.0000000000000}*f_curr[1];
+  T x32 = T{72.0000000000000}*f_curr[17];
+  T x33 = x4 + u_2;
+  T x34 = T{6.00000000000000}*u_2;
+  T x35 = x11 - x34;
+  T x36 = rho*(x15 + x35 - T{9.00000000000000}*x33*x33);
+  T x37 = -u_2;
+  T x38 = x37 + u_0;
+  T x39 = x11 + x34;
+  T x40 = x13 + x39;
+  T x41 = rho*(x17 + x40 - T{9.00000000000000}*x38*x38);
+  T x42 = u_0 + u_2;
+  T x43 = T{9.00000000000000}*(x42*x42);
+  T x44 = x27 + x34;
+  T x45 = x14 + x28;
+  T x46 = -rho*(x15 + x39 - x43) + rho*(x43 + x44 + x45) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3];
+  T x47 = T{72.0000000000000}*f_curr[4];
+  T x48 = T{72.0000000000000}*f_curr[14];
+  T x49 = x18 + u_2;
+  T x50 = rho*(x20 + x35 - T{9.00000000000000}*x49*x49);
+  T x51 = x37 + u_1;
+  T x52 = rho*(x3 + x40 - T{9.00000000000000}*x51*x51);
+  T x53 = u_1 + u_2;
+  T x54 = T{9.00000000000000}*(x53*x53);
+  T x55 = -rho*(x20 + x39 - x54) + rho*(x29 + x44 + x54) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18];
+  T x56 = T{2.00000000000000}*rho;
+  T x57 = T{6.00000000000000}*x8;
+  T x58 = -x31 - x32 - x36 - x41 + x46;
+  T x59 = -x0 - x1 - x16 - x22 + x30;
+  T x60 = T{6.00000000000000}*x6;
+  T x61 = -x47 - x48 - x50 - x52 + x55;
+  T x62 = T{6.00000000000000}*x12;
+  T strain = T{0.0277777777777778}*sqrt((x0 + x1 + x16 + x22 + x30)*(x0 + x1 + x16 + x22 + x30) + (x31 + x32 + x36 + x41 + x46)*(x31 + x32 + x36 + x41 + x46) + (x47 + x48 + x50 + x52 + x55)*(x47 + x48 + x50 + x52 + x55) + T{0.500000000000000}*((-x56*(x39 - x62) + x56*(x44 + x62) + x58 + x61 - 72*f_curr[16] - 72*f_curr[2])*(-x56*(x39 - x62) + x56*(x44 + x62) + x58 + x61 - 72*f_curr[16] - 72*f_curr[2])) + T{0.500000000000000}*((-x56*(x10 + x20 - x60) + x56*(x26 + x29 + x60) + x59 + x61 - 72*f_curr[12] - 72*f_curr[6])*(-x56*(x10 + x20 - x60) + x56*(x26 + x29 + x60) + x59 + x61 - 72*f_curr[12] - 72*f_curr[6])) + T{0.500000000000000}*((-x56*(x15 - x57 + x7 - 2) + x56*(x25 + x45 + x57 + 2) + x58 + x59 - 72*f_curr[10] - 72*f_curr[8])*(-x56*(x15 - x57 + x7 - 2) + x56*(x25 + x45 + x57 + 2) + x58 + x59 - 72*f_curr[10] - 72*f_curr[8])));
+
+  float vorticity = cell_curl_norm[gid];
+  float q = vorticity*vorticity - strain*strain;
+  q = q > 0 ? q : 0;
+
+  surf3Dwrite(q, surface, iX*sizeof(float), iY, iZ);
+
+  if (cell_q != nullptr) {
+    cell_q[gid] = q;
+  }
+}
+
+};
diff --git a/tangle/LLBM/kernel/collect_shear_layer_normal.h b/tangle/LLBM/kernel/collect_shear_layer_normal.h
new file mode 100644
index 0000000..7bf6eff
--- /dev/null
+++ b/tangle/LLBM/kernel/collect_shear_layer_normal.h
@@ -0,0 +1,139 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct CollectShearLayerNormalsF {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(
+    descriptor::D3Q19
+  , S f_curr[19]
+  , std::size_t gid
+  , T* cell_rho 
+  , T* cell_u
+  , T* cell_shear_normal
+) {
+  T x0 = f_curr[10] + f_curr[13] + f_curr[17];
+  T x1 = f_curr[14] + f_curr[5] + f_curr[6];
+  T x2 = f_curr[1] + f_curr[2] + f_curr[4];
+  T x3 = x0 + x1 + x2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9];
+  T x4 = -f_curr[11] + f_curr[7];
+  T x5 = -f_curr[15] + f_curr[3];
+  T x6 = T{1} / (x3);
+  T x7 = f_curr[0] - f_curr[18];
+  T x14 = T{72.0000000000000}*f_curr[5];
+  T x15 = T{72.0000000000000}*f_curr[13];
+  T x39 = T{72.0000000000000}*f_curr[1];
+  T x40 = T{72.0000000000000}*f_curr[17];
+  T x61 = T{72.0000000000000}*f_curr[4];
+  T x62 = T{72.0000000000000}*f_curr[14];
+  T rho = x3;
+  T x56 = T{2.00000000000000}*rho;
+  T u_0 = x6*(x0 + x4 + x5 - f_curr[1] - f_curr[5] - f_curr[8]);
+  T x9 = u_0*u_0;
+  T x16 = -u_0;
+  T x18 = -T{3.00000000000000}*x9;
+  T x21 = T{6.00000000000000}*u_0;
+  T x22 = -x21;
+  T x55 = T{0.0277777777777778}*u_0;
+  T u_1 = x6*(x1 + x4 + x7 - f_curr[12] - f_curr[13] - f_curr[4]);
+  T x8 = T{0.0277777777777778}*u_1;
+  T x10 = u_1*u_1;
+  T x17 = x16 + u_1;
+  T x19 = T{6.00000000000000}*u_1;
+  T x20 = x18 + x19;
+  T x23 = -T{3.00000000000000}*x10;
+  T x28 = -u_1;
+  T x29 = x28 + u_0;
+  T x30 = x18 - x19;
+  T x33 = u_0 + u_1;
+  T x34 = T{9.00000000000000}*(x33*x33);
+  T u_2 = x6*(x2 + x5 + x7 - f_curr[14] - f_curr[16] - f_curr[17]);
+  T x11 = u_2*u_2;
+  T x12 = x10 + x11 + x9;
+  T x13 = pow(x12, T{-0.500000000000000});
+  T x24 = T{2.00000000000000} - T{3.00000000000000}*x11;
+  T x25 = x23 + x24;
+  T x26 = x22 + x25;
+  T x27 = rho*(x20 + x26 + T{9.00000000000000}*(x17*x17));
+  T x31 = x21 + x25;
+  T x32 = rho*(x30 + x31 + T{9.00000000000000}*(x29*x29));
+  T x35 = rho*(x20 + x31 + x34) + rho*(x26 + x30 + x34) - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7];
+  T x36 = x14 + x15 - x27 - x32 + x35;
+  T x37 = x13*x36;
+  T x38 = T{0.0277777777777778}*u_2;
+  T x41 = x16 + u_2;
+  T x42 = T{6.00000000000000}*u_2;
+  T x43 = x18 + x42;
+  T x44 = rho*(x26 + x43 + T{9.00000000000000}*(x41*x41));
+  T x45 = -u_2;
+  T x46 = x45 + u_0;
+  T x47 = -x42;
+  T x48 = x18 + x47;
+  T x49 = rho*(x31 + x48 + T{9.00000000000000}*(x46*x46));
+  T x50 = u_0 + u_2;
+  T x51 = T{9.00000000000000}*(x50*x50);
+  T x52 = rho*(x26 + x48 + x51) + rho*(x31 + x43 + x51) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3];
+  T x53 = x39 + x40 - x44 - x49 + x52;
+  T x54 = x13*x53;
+  T x57 = x25 + T{6.00000000000000}*x9;
+  T x58 = -x14 - x15 + x27 + x32 + x35;
+  T x59 = -x39 - x40 + x44 + x49 + x52;
+  T x60 = x56*(x21 + x57) + x56*(x22 + x57) + x58 + x59 - T{72.0000000000000}*f_curr[10] - T{72.0000000000000}*f_curr[8];
+  T x63 = x28 + u_2;
+  T x64 = x25 + x30;
+  T x65 = rho*(x42 + x64 + T{9.00000000000000}*(x63*x63));
+  T x66 = x45 + u_1;
+  T x67 = x20 + x25;
+  T x68 = rho*(x47 + x67 + T{9.00000000000000}*(x66*x66));
+  T x69 = u_1 + u_2;
+  T x70 = T{9.00000000000000}*(x69*x69);
+  T x71 = rho*(x42 + x67 + x70) + rho*(x47 + x64 + x70) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18];
+  T x72 = x61 + x62 - x65 - x68 + x71;
+  T x73 = T{6.00000000000000}*x10 + x24;
+  T x74 = -x61 - x62 + x65 + x68 + x71;
+  T x75 = x56*(x20 + x73) + x56*(x30 + x73) + x58 + x74 - T{72.0000000000000}*f_curr[12] - T{72.0000000000000}*f_curr[6];
+  T x76 = T{6.00000000000000}*x11 + x23 + T{2.00000000000000};
+  T x77 = x56*(x43 + x76) + x56*(x48 + x76) + x59 + x74 - T{72.0000000000000}*f_curr[16] - T{72.0000000000000}*f_curr[2];
+  T x78 = ((x36*u_0 + x72*u_2 + x75*u_1)*u_1 + (x36*u_1 + x53*u_2 + x60*u_0)*u_0 + (x53*u_0 + x72*u_1 + x77*u_2)*u_2)/x12;
+  T x79 = x13*x72;
+  T n_0 = -x13*x55*x60 - x37*x8 - x38*x54 + x55*x78;
+  T n_1 = -x13*x75*x8 - x37*x55 - x38*x79 + x78*x8;
+  T n_2 = -x13*x38*x77 + x38*x78 - x54*x55 - x79*x8;
+
+  cell_rho[gid] = rho;
+
+  cell_u[3*gid+0] = u_0;
+  cell_u[3*gid+1] = u_1;
+  cell_u[3*gid+2] = u_2;
+
+  float3 n = normalize(make_float3(n_0, n_1, n_2));
+  cell_shear_normal[3*gid+0] = n.x;
+  cell_shear_normal[3*gid+1] = n.y;
+  cell_shear_normal[3*gid+2] = n.z;
+}
+
+};
+
+struct CollectShearLayerVisibilityF {
+
+using call_tag = tag::post_process_by_spatial_cell_mask;
+
+template <typename T, typename S>
+__device__ static void apply(
+    descriptor::D3Q19
+  , std::size_t gid
+  , std::size_t iX
+  , std::size_t iY
+  , std::size_t iZ
+  , T* shear_normal
+  , float3 view_direction
+  , cudaSurfaceObject_t surface
+) {
+  float3 n = make_float3(shear_normal[3*gid+0], shear_normal[3*gid+1], shear_normal[3*gid+2]);
+  float visibility = dot(n, view_direction);
+  surf3Dwrite(visibility, surface, iX*sizeof(float), iY, iZ);
+}
+
+};
diff --git a/tangle/LLBM/kernel/collect_streamlines.h b/tangle/LLBM/kernel/collect_streamlines.h
new file mode 100644
index 0000000..c787713
--- /dev/null
+++ b/tangle/LLBM/kernel/collect_streamlines.h
@@ -0,0 +1,48 @@
+#pragma once
+
+__device__ void writeDot(cudaSurfaceObject_t texture, int x, int y, uchar4 pixel) {
+  surf2Dwrite(pixel, texture, (x-1)*sizeof(uchar4), (y  ));
+  surf2Dwrite(pixel, texture, (x-1)*sizeof(uchar4), (y+1));
+  surf2Dwrite(pixel, texture, (x  )*sizeof(uchar4), (y  ));
+  surf2Dwrite(pixel, texture, (x  )*sizeof(uchar4), (y+1));
+} 
+
+__device__ float2 readVelocity(cudaSurfaceObject_t u_x, cudaSurfaceObject_t u_y, float2 p) {
+  return make_float2(tex2D<float>(u_x, p.x, p.y), tex2D<float>(u_y, p.x, p.y));
+}
+
+template <typename DESCRIPTOR, typename T>
+__global__ void renderStreamlinesToTexture(
+    descriptor::Cuboid<DESCRIPTOR> cuboid
+  , cudaSurfaceObject_t u_x
+  , cudaSurfaceObject_t u_y
+  , T* origins
+  , unsigned nSteps
+  , float charU
+  , cudaSurfaceObject_t texture
+)  {
+  const int iOrigin = threadIdx.x + blockIdx.x * blockDim.x;
+
+  float h = 1;
+  float2 p = make_float2(origins[2*iOrigin+0], origins[2*iOrigin+1]);
+
+  for (unsigned iStep = 0; iStep < nSteps; ++iStep) {
+    float2 u = make_float2(tex2D<float>(u_x, p.x, p.y), tex2D<float>(u_y, p.x, p.y));
+
+    float2 k1 = readVelocity(u_x, u_y, p) / charU;
+    float2 k2 = readVelocity(u_x, u_y, p + 0.5f*h * k1) / charU;
+    float2 k3 = readVelocity(u_x, u_y, p + 0.5f*h * k2) / charU;
+    float2 k4 = readVelocity(u_x, u_y, p +      h * k3) / charU;
+
+    p += h * (1.f/6.f*k1 + 1.f/3.f*k2 + 1.f/3.f*k3 + 1.f/6.f*k4);
+
+    const int screenX = std::nearbyint(p.x);
+    const int screenY = cuboid.nY-1 - std::nearbyint(p.y);
+
+    if (screenX < 1 || screenY < 1 || screenX > cuboid.nX-2 || screenY > cuboid.nY-2) {
+      break;
+    }
+
+    writeDot(texture, screenX, screenY, {255,255,255,255});
+  }
+}
diff --git a/tangle/LLBM/kernel/collect_velocity_norm.h b/tangle/LLBM/kernel/collect_velocity_norm.h
new file mode 100644
index 0000000..ea099a8
--- /dev/null
+++ b/tangle/LLBM/kernel/collect_velocity_norm.h
@@ -0,0 +1,61 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct CollectVelocityNormF {
+
+using call_tag = tag::post_process_by_spatial_cell_mask;
+
+template <typename T, typename S>
+__device__ static void apply(
+    descriptor::D2Q9
+  , std::size_t gid
+  , std::size_t iX
+  , std::size_t iY
+  , std::size_t iZ
+  , T* u
+  , cudaSurfaceObject_t surface
+) {
+  float norm = length(make_float2(u[2*gid+0], u[2*gid+1]));
+  surf2Dwrite(norm, surface, iX*sizeof(float), iY);
+}
+
+template <typename T, typename S>
+__device__ static void apply(
+    descriptor::D3Q19
+  , std::size_t gid
+  , std::size_t iX
+  , std::size_t iY
+  , std::size_t iZ
+  , T* u
+  , cudaSurfaceObject_t surface
+  , T* u_norm = nullptr
+) {
+  float norm = length(make_float3(u[3*gid+0], u[3*gid+1], u[3*gid+2]));
+  surf3Dwrite(norm, surface, iX*sizeof(float), iY, iZ);
+  if (u_norm != nullptr) {
+    u_norm[gid] = norm;
+  }
+}
+
+};
+
+template <typename SLICE, typename SAMPLE, typename PALETTE>
+__global__ void renderSliceViewToTexture(std::size_t width, std::size_t height, SLICE slice, SAMPLE sample, PALETTE palette, cudaSurfaceObject_t texture) {
+  const int screenX = threadIdx.x + blockIdx.x * blockDim.x;
+  const int screenY = threadIdx.y + blockIdx.y * blockDim.y;
+
+  if (screenX > width-1 || screenY > height-1) {
+    return;
+  }
+  
+  const std::size_t gid = slice(screenX,screenY);
+  float3 color = palette(sample(gid));
+
+  uchar4 pixel {
+    static_cast<unsigned char>(color.x * 255),
+    static_cast<unsigned char>(color.y * 255),
+    static_cast<unsigned char>(color.z * 255),
+    255
+  };
+  surf2Dwrite(pixel, texture, screenX*sizeof(uchar4), screenY);
+}
diff --git a/tangle/LLBM/kernel/collide.h b/tangle/LLBM/kernel/collide.h
new file mode 100644
index 0000000..7ca02bf
--- /dev/null
+++ b/tangle/LLBM/kernel/collide.h
@@ -0,0 +1,131 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct BgkCollideO {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T tau) {
+  T m0 = f_curr[1] + f_curr[2];
+  T m1 = f_curr[3] + f_curr[6];
+  T m2 = m0 + m1 + f_curr[0] + f_curr[4] + f_curr[5] + f_curr[7] + f_curr[8];
+  T m3 = f_curr[0] - f_curr[8];
+  T m4 = T{1} / (m2);
+  T rho = m2;
+  T u_0 = -m4*(m0 + m3 - f_curr[6] - f_curr[7]);
+  T u_1 = -m4*(m1 + m3 - f_curr[2] - f_curr[5]);
+  T x0 = T{1} / (tau);
+  T x1 = T{0.0138888888888889}*x0;
+  T x2 = T{6.00000000000000}*u_1;
+  T x3 = T{6.00000000000000}*u_0;
+  T x4 = u_0 + u_1;
+  T x5 = T{9.00000000000000}*(x4*x4);
+  T x6 = u_1*u_1;
+  T x7 = T{3.00000000000000}*x6;
+  T x8 = u_0*u_0;
+  T x9 = T{3.00000000000000}*x8;
+  T x10 = x9 + T{-2.00000000000000};
+  T x11 = x10 + x7;
+  T x12 = T{0.0555555555555556}*x0;
+  T x13 = -x3;
+  T x14 = T{2.00000000000000} - x7;
+  T x15 = x14 + T{6.00000000000000}*x8;
+  T x16 = -x9;
+  T x17 = x16 + x2;
+  T x18 = u_0 - u_1;
+  T x19 = x14 + T{9.00000000000000}*(x18*x18);
+  T x20 = T{6.00000000000000}*x6;
+  f_next[0] = -x1*(rho*(x11 + x2 + x3 - x5) + T{72.0000000000000}*f_curr[0]) + f_curr[0];
+  f_next[1] = x12*(rho*(x13 + x15) - T{18.0000000000000}*f_curr[1]) + f_curr[1];
+  f_next[2] = x1*(rho*(x13 + x17 + x19) - T{72.0000000000000}*f_curr[2]) + f_curr[2];
+  f_next[3] = -x12*(rho*(x10 + x2 - x20) + T{18.0000000000000}*f_curr[3]) + f_curr[3];
+  f_next[4] = -T{0.111111111111111}*x0*(T{2.00000000000000}*rho*x11 + T{9.00000000000000}*f_curr[4]) + f_curr[4];
+  f_next[5] = x12*(rho*(x17 + x20 + T{2.00000000000000}) - T{18.0000000000000}*f_curr[5]) + f_curr[5];
+  f_next[6] = x1*(rho*(x16 + x19 - x2 + x3) - T{72.0000000000000}*f_curr[6]) + f_curr[6];
+  f_next[7] = x12*(rho*(x15 + x3) - T{18.0000000000000}*f_curr[7]) + f_curr[7];
+  f_next[8] = x1*(rho*(x14 + x17 + x3 + x5) - T{72.0000000000000}*f_curr[8]) + f_curr[8];
+  
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T tau) {
+  T m0 = f_curr[10] + f_curr[13] + f_curr[17];
+  T m1 = f_curr[14] + f_curr[5] + f_curr[6];
+  T m2 = f_curr[1] + f_curr[2] + f_curr[4];
+  T m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9];
+  T m4 = -f_curr[11] + f_curr[7];
+  T m5 = -f_curr[15] + f_curr[3];
+  T m6 = T{1} / (m3);
+  T m7 = f_curr[0] - f_curr[18];
+  T rho = m3;
+  T u_0 = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]);
+  T u_1 = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]);
+  T u_2 = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]);
+  T x0 = T{1} / (tau);
+  T x1 = T{0.0138888888888889}*x0;
+  T x2 = u_1 + u_2;
+  T x3 = T{9.00000000000000}*(x2*x2);
+  T x4 = T{6.00000000000000}*u_2;
+  T x5 = u_1*u_1;
+  T x6 = T{3.00000000000000}*x5;
+  T x7 = -x6;
+  T x8 = u_0*u_0;
+  T x9 = T{3.00000000000000}*x8;
+  T x10 = T{2.00000000000000} - x9;
+  T x11 = x10 + x7;
+  T x12 = x11 + x4;
+  T x13 = u_2*u_2;
+  T x14 = T{3.00000000000000}*x13;
+  T x15 = -x14;
+  T x16 = T{6.00000000000000}*u_1;
+  T x17 = x15 + x16;
+  T x18 = T{6.00000000000000}*u_0;
+  T x19 = -u_0;
+  T x20 = x19 + u_2;
+  T x21 = x14 + x6 + T{-2.00000000000000};
+  T x22 = x21 + x9;
+  T x23 = x22 - x4;
+  T x24 = T{0.0277777777777778}*x0;
+  T x25 = T{6.00000000000000}*x13;
+  T x26 = u_0 + u_2;
+  T x27 = T{9.00000000000000}*(x26*x26);
+  T x28 = x15 + x18;
+  T x29 = -u_1;
+  T x30 = x29 + u_2;
+  T x31 = x19 + u_1;
+  T x32 = -x16;
+  T x33 = x18 + x22;
+  T x34 = T{6.00000000000000}*x5;
+  T x35 = u_0 + u_1;
+  T x36 = T{9.00000000000000}*(x35*x35);
+  T x37 = T{6.00000000000000}*x8;
+  T x38 = x9 + T{-2.00000000000000};
+  T x39 = x29 + u_0;
+  T x40 = -x18 + x22;
+  T x41 = -u_2;
+  T x42 = x41 + u_1;
+  T x43 = x22 + x4;
+  T x44 = x41 + u_0;
+  f_next[0] = x1*(rho*(x12 + x17 + x3) - T{72.0000000000000}*f_curr[0]) + f_curr[0];
+  f_next[1] = -x1*(rho*(x18 + x23 - T{9.00000000000000}*x20*x20) + T{72.0000000000000}*f_curr[1]) + f_curr[1];
+  f_next[2] = x24*(rho*(x12 + x25) - T{36.0000000000000}*f_curr[2]) + f_curr[2];
+  f_next[3] = x1*(rho*(x12 + x27 + x28) - T{72.0000000000000}*f_curr[3]) + f_curr[3];
+  f_next[4] = -x1*(rho*(x16 + x23 - T{9.00000000000000}*x30*x30) + T{72.0000000000000}*f_curr[4]) + f_curr[4];
+  f_next[5] = -x1*(rho*(x32 + x33 - T{9.00000000000000}*x31*x31) + T{72.0000000000000}*f_curr[5]) + f_curr[5];
+  f_next[6] = x24*(rho*(x10 + x17 + x34) - T{36.0000000000000}*f_curr[6]) + f_curr[6];
+  f_next[7] = x1*(rho*(x11 + x17 + x18 + x36) - T{72.0000000000000}*f_curr[7]) + f_curr[7];
+  f_next[8] = -x24*(rho*(x18 + x21 - x37) + T{36.0000000000000}*f_curr[8]) + f_curr[8];
+  f_next[9] = -T{0.166666666666667}*x0*(rho*x22 + T{6.00000000000000}*f_curr[9]) + f_curr[9];
+  f_next[10] = x24*(rho*(x28 + x37 + x7 + T{2.00000000000000}) - T{36.0000000000000}*f_curr[10]) + f_curr[10];
+  f_next[11] = -x1*(rho*(x16 + x33 - x36) + T{72.0000000000000}*f_curr[11]) + f_curr[11];
+  f_next[12] = -x24*(rho*(x14 + x16 - x34 + x38) + T{36.0000000000000}*f_curr[12]) + f_curr[12];
+  f_next[13] = -x1*(rho*(x16 + x40 - T{9.00000000000000}*x39*x39) + T{72.0000000000000}*f_curr[13]) + f_curr[13];
+  f_next[14] = -x1*(rho*(x32 + x43 - T{9.00000000000000}*x42*x42) + T{72.0000000000000}*f_curr[14]) + f_curr[14];
+  f_next[15] = -x1*(rho*(-x27 + x33 + x4) + T{72.0000000000000}*f_curr[15]) + f_curr[15];
+  f_next[16] = -x24*(rho*(-x25 + x38 + x4 + x6) + T{36.0000000000000}*f_curr[16]) + f_curr[16];
+  f_next[17] = -x1*(rho*(x4 + x40 - T{9.00000000000000}*x44*x44) + T{72.0000000000000}*f_curr[17]) + f_curr[17];
+  f_next[18] = -x1*(rho*(x16 - x3 + x43) + T{72.0000000000000}*f_curr[18]) + f_curr[18];
+}
+
+};
diff --git a/tangle/LLBM/kernel/equilibrium_density_wall.h b/tangle/LLBM/kernel/equilibrium_density_wall.h
new file mode 100644
index 0000000..4f757f9
--- /dev/null
+++ b/tangle/LLBM/kernel/equilibrium_density_wall.h
@@ -0,0 +1,226 @@
+#pragma once
+#include <LLBM/call_tag.h>
+#include <LLBM/wall.h>
+#include <LLBM/descriptor.h>
+
+struct EquilibriumDensityWallO {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T rho_w, WallNormal<1,0>) {
+  T u = (rho_w - T{2.00000000000000}*f_curr[0] - T{2.00000000000000}*f_curr[1] - T{2.00000000000000}*f_curr[2] - f_curr[3] - f_curr[4] - f_curr[5])/rho_w;
+  T rho = rho_w;
+  T u_0 = u;
+  T u_1 = 0.;
+  T e0 = T{0.0277777777777778}*rho;
+  T e1 = T{3.00000000000000}*u_1;
+  T e2 = T{3.00000000000000}*u_0;
+  T e3 = u_0 + u_1;
+  T e4 = T{4.50000000000000}*(e3*e3);
+  T e5 = u_1*u_1;
+  T e6 = T{1.50000000000000}*e5;
+  T e7 = u_0*u_0;
+  T e8 = T{1.50000000000000}*e7;
+  T e9 = e8 + T{-1.00000000000000};
+  T e10 = e6 + e9;
+  T e11 = T{0.111111111111111}*rho;
+  T e12 = -e2;
+  T e13 = T{1.00000000000000} - e6;
+  T e14 = e13 + T{3.00000000000000}*e7;
+  T e15 = -e8;
+  T e16 = e1 + e15;
+  T e17 = u_0 - u_1;
+  T e18 = e13 + T{4.50000000000000}*(e17*e17);
+  T e19 = T{3.00000000000000}*e5;
+  f_next[0] = -e0*(e1 + e10 + e2 - e4);
+  f_next[1] = e11*(e12 + e14);
+  f_next[2] = e0*(e12 + e16 + e18);
+  f_next[3] = -e11*(e1 - e19 + e9);
+  f_next[4] = -T{0.444444444444444}*e10*rho;
+  f_next[5] = e11*(e16 + e19 + T{1.00000000000000});
+  f_next[6] = e0*(-e1 + e15 + e18 + e2);
+  f_next[7] = e11*(e14 + e2);
+  f_next[8] = e0*(e13 + e16 + e2 + e4);
+  
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T rho_w, WallNormal<-1,0>) {
+  T u = (-rho_w + f_curr[3] + f_curr[4] + f_curr[5] + T{2.00000000000000}*f_curr[6] + T{2.00000000000000}*f_curr[7] + T{2.00000000000000}*f_curr[8])/rho_w;
+  T rho = rho_w;
+  T u_0 = u;
+  T u_1 = 0.;
+  T e0 = T{0.0277777777777778}*rho;
+  T e1 = T{3.00000000000000}*u_1;
+  T e2 = T{3.00000000000000}*u_0;
+  T e3 = u_0 + u_1;
+  T e4 = T{4.50000000000000}*(e3*e3);
+  T e5 = u_1*u_1;
+  T e6 = T{1.50000000000000}*e5;
+  T e7 = u_0*u_0;
+  T e8 = T{1.50000000000000}*e7;
+  T e9 = e8 + T{-1.00000000000000};
+  T e10 = e6 + e9;
+  T e11 = T{0.111111111111111}*rho;
+  T e12 = -e2;
+  T e13 = T{1.00000000000000} - e6;
+  T e14 = e13 + T{3.00000000000000}*e7;
+  T e15 = -e8;
+  T e16 = e1 + e15;
+  T e17 = u_0 - u_1;
+  T e18 = e13 + T{4.50000000000000}*(e17*e17);
+  T e19 = T{3.00000000000000}*e5;
+  f_next[0] = -e0*(e1 + e10 + e2 - e4);
+  f_next[1] = e11*(e12 + e14);
+  f_next[2] = e0*(e12 + e16 + e18);
+  f_next[3] = -e11*(e1 - e19 + e9);
+  f_next[4] = -T{0.444444444444444}*e10*rho;
+  f_next[5] = e11*(e16 + e19 + T{1.00000000000000});
+  f_next[6] = e0*(-e1 + e15 + e18 + e2);
+  f_next[7] = e11*(e14 + e2);
+  f_next[8] = e0*(e13 + e16 + e2 + e4);
+  
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T rho_w, WallNormal<1,0,0>) {
+  T u = (rho_w - f_curr[0] - T{2.00000000000000}*f_curr[11] - f_curr[12] - f_curr[14] - T{2.00000000000000}*f_curr[15] - f_curr[16] - f_curr[18] - T{2.00000000000000}*f_curr[1] - f_curr[2] - f_curr[4] - T{2.00000000000000}*f_curr[5] - f_curr[6] - T{2.00000000000000}*f_curr[8] - f_curr[9])/rho_w;
+  T rho = rho_w;
+  T u_0 = u;
+  T u_1 = 0.;
+  T u_2 = 0.;
+  T e0 = T{0.0277777777777778}*rho;
+  T e1 = u_1 + u_2;
+  T e2 = T{4.50000000000000}*(e1*e1);
+  T e3 = T{3.00000000000000}*u_2;
+  T e4 = u_1*u_1;
+  T e5 = T{1.50000000000000}*e4;
+  T e6 = -e5;
+  T e7 = u_0*u_0;
+  T e8 = T{1.50000000000000}*e7;
+  T e9 = T{1.00000000000000} - e8;
+  T e10 = e6 + e9;
+  T e11 = e10 + e3;
+  T e12 = T{3.00000000000000}*u_1;
+  T e13 = u_2*u_2;
+  T e14 = T{1.50000000000000}*e13;
+  T e15 = -e14;
+  T e16 = e12 + e15;
+  T e17 = T{3.00000000000000}*u_0;
+  T e18 = -u_2;
+  T e19 = e18 + u_0;
+  T e20 = -T{4.50000000000000}*e19*e19;
+  T e21 = e14 + e5 + T{-1.00000000000000};
+  T e22 = e21 + e8;
+  T e23 = e22 - e3;
+  T e24 = T{0.0555555555555556}*rho;
+  T e25 = T{3.00000000000000}*e13;
+  T e26 = u_0 + u_2;
+  T e27 = T{4.50000000000000}*(e26*e26);
+  T e28 = e15 + e17;
+  T e29 = e18 + u_1;
+  T e30 = -T{4.50000000000000}*e29*e29;
+  T e31 = -e12;
+  T e32 = u_0 - u_1;
+  T e33 = -T{4.50000000000000}*e32*e32;
+  T e34 = e17 + e22;
+  T e35 = T{3.00000000000000}*e4;
+  T e36 = u_0 + u_1;
+  T e37 = T{4.50000000000000}*(e36*e36);
+  T e38 = T{3.00000000000000}*e7;
+  T e39 = e8 + T{-1.00000000000000};
+  T e40 = -e17 + e22;
+  T e41 = e22 + e3;
+  f_next[0] = e0*(e11 + e16 + e2);
+  f_next[1] = -e0*(e17 + e20 + e23);
+  f_next[2] = e24*(e11 + e25);
+  f_next[3] = e0*(e11 + e27 + e28);
+  f_next[4] = -e0*(e12 + e23 + e30);
+  f_next[5] = -e0*(e31 + e33 + e34);
+  f_next[6] = e24*(e16 + e35 + e9);
+  f_next[7] = e0*(e10 + e16 + e17 + e37);
+  f_next[8] = -e24*(e17 + e21 - e38);
+  f_next[9] = -T{0.333333333333333}*e22*rho;
+  f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+  f_next[11] = -e0*(e12 + e34 - e37);
+  f_next[12] = -e24*(e12 + e14 - e35 + e39);
+  f_next[13] = -e0*(e12 + e33 + e40);
+  f_next[14] = -e0*(e30 + e31 + e41);
+  f_next[15] = -e0*(-e27 + e3 + e34);
+  f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+  f_next[17] = -e0*(e20 + e3 + e40);
+  f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T rho_w, WallNormal<-1,0,0>) {
+  T u = (-rho_w + f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/rho_w;
+  T rho = rho_w;
+  T u_0 = u;
+  T u_1 = 0.;
+  T u_2 = 0.;
+  T e0 = T{0.0277777777777778}*rho;
+  T e1 = u_1 + u_2;
+  T e2 = T{4.50000000000000}*(e1*e1);
+  T e3 = T{3.00000000000000}*u_2;
+  T e4 = u_1*u_1;
+  T e5 = T{1.50000000000000}*e4;
+  T e6 = -e5;
+  T e7 = u_0*u_0;
+  T e8 = T{1.50000000000000}*e7;
+  T e9 = T{1.00000000000000} - e8;
+  T e10 = e6 + e9;
+  T e11 = e10 + e3;
+  T e12 = T{3.00000000000000}*u_1;
+  T e13 = u_2*u_2;
+  T e14 = T{1.50000000000000}*e13;
+  T e15 = -e14;
+  T e16 = e12 + e15;
+  T e17 = T{3.00000000000000}*u_0;
+  T e18 = -u_2;
+  T e19 = e18 + u_0;
+  T e20 = -T{4.50000000000000}*e19*e19;
+  T e21 = e14 + e5 + T{-1.00000000000000};
+  T e22 = e21 + e8;
+  T e23 = e22 - e3;
+  T e24 = T{0.0555555555555556}*rho;
+  T e25 = T{3.00000000000000}*e13;
+  T e26 = u_0 + u_2;
+  T e27 = T{4.50000000000000}*(e26*e26);
+  T e28 = e15 + e17;
+  T e29 = e18 + u_1;
+  T e30 = -T{4.50000000000000}*e29*e29;
+  T e31 = -e12;
+  T e32 = u_0 - u_1;
+  T e33 = -T{4.50000000000000}*e32*e32;
+  T e34 = e17 + e22;
+  T e35 = T{3.00000000000000}*e4;
+  T e36 = u_0 + u_1;
+  T e37 = T{4.50000000000000}*(e36*e36);
+  T e38 = T{3.00000000000000}*e7;
+  T e39 = e8 + T{-1.00000000000000};
+  T e40 = -e17 + e22;
+  T e41 = e22 + e3;
+  f_next[0] = e0*(e11 + e16 + e2);
+  f_next[1] = -e0*(e17 + e20 + e23);
+  f_next[2] = e24*(e11 + e25);
+  f_next[3] = e0*(e11 + e27 + e28);
+  f_next[4] = -e0*(e12 + e23 + e30);
+  f_next[5] = -e0*(e31 + e33 + e34);
+  f_next[6] = e24*(e16 + e35 + e9);
+  f_next[7] = e0*(e10 + e16 + e17 + e37);
+  f_next[8] = -e24*(e17 + e21 - e38);
+  f_next[9] = -T{0.333333333333333}*e22*rho;
+  f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+  f_next[11] = -e0*(e12 + e34 - e37);
+  f_next[12] = -e24*(e12 + e14 - e35 + e39);
+  f_next[13] = -e0*(e12 + e33 + e40);
+  f_next[14] = -e0*(e30 + e31 + e41);
+  f_next[15] = -e0*(-e27 + e3 + e34);
+  f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+  f_next[17] = -e0*(e20 + e3 + e40);
+  f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+};
diff --git a/tangle/LLBM/kernel/equilibrium_velocity_wall.h b/tangle/LLBM/kernel/equilibrium_velocity_wall.h
new file mode 100644
index 0000000..23354dc
--- /dev/null
+++ b/tangle/LLBM/kernel/equilibrium_velocity_wall.h
@@ -0,0 +1,222 @@
+#pragma once
+#include <LLBM/call_tag.h>
+#include <LLBM/wall.h>
+#include <LLBM/descriptor.h>
+
+struct EquilibriumVelocityWallO {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T u_w, WallNormal<1,0>) {
+  T rho = -(T{2.00000000000000}*f_curr[0] + T{2.00000000000000}*f_curr[1] + T{2.00000000000000}*f_curr[2] + f_curr[3] + f_curr[4] + f_curr[5])/(u_w + T{-1.00000000000000});
+  T u_0 = u_w;
+  T u_1 = 0.;
+  T e0 = T{0.0277777777777778}*rho;
+  T e1 = T{3.00000000000000}*u_1;
+  T e2 = T{3.00000000000000}*u_0;
+  T e3 = u_0 + u_1;
+  T e4 = T{4.50000000000000}*(e3*e3);
+  T e5 = u_1*u_1;
+  T e6 = T{1.50000000000000}*e5;
+  T e7 = u_0*u_0;
+  T e8 = T{1.50000000000000}*e7;
+  T e9 = e8 + T{-1.00000000000000};
+  T e10 = e6 + e9;
+  T e11 = T{0.111111111111111}*rho;
+  T e12 = -e2;
+  T e13 = T{1.00000000000000} - e6;
+  T e14 = e13 + T{3.00000000000000}*e7;
+  T e15 = -e8;
+  T e16 = e1 + e15;
+  T e17 = u_0 - u_1;
+  T e18 = e13 + T{4.50000000000000}*(e17*e17);
+  T e19 = T{3.00000000000000}*e5;
+  f_next[0] = -e0*(e1 + e10 + e2 - e4);
+  f_next[1] = e11*(e12 + e14);
+  f_next[2] = e0*(e12 + e16 + e18);
+  f_next[3] = -e11*(e1 - e19 + e9);
+  f_next[4] = -T{0.444444444444444}*e10*rho;
+  f_next[5] = e11*(e16 + e19 + T{1.00000000000000});
+  f_next[6] = e0*(-e1 + e15 + e18 + e2);
+  f_next[7] = e11*(e14 + e2);
+  f_next[8] = e0*(e13 + e16 + e2 + e4);
+  
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T u_w, WallNormal<-1,0>) {
+  T rho = (f_curr[3] + f_curr[4] + f_curr[5] + T{2.00000000000000}*f_curr[6] + T{2.00000000000000}*f_curr[7] + T{2.00000000000000}*f_curr[8])/(u_w + T{1.00000000000000});
+  T u_0 = u_w;
+  T u_1 = 0;
+  T e0 = T{0.0277777777777778}*rho;
+  T e1 = T{3.00000000000000}*u_1;
+  T e2 = T{3.00000000000000}*u_0;
+  T e3 = u_0 + u_1;
+  T e4 = T{4.50000000000000}*(e3*e3);
+  T e5 = u_1*u_1;
+  T e6 = T{1.50000000000000}*e5;
+  T e7 = u_0*u_0;
+  T e8 = T{1.50000000000000}*e7;
+  T e9 = e8 + T{-1.00000000000000};
+  T e10 = e6 + e9;
+  T e11 = T{0.111111111111111}*rho;
+  T e12 = -e2;
+  T e13 = T{1.00000000000000} - e6;
+  T e14 = e13 + T{3.00000000000000}*e7;
+  T e15 = -e8;
+  T e16 = e1 + e15;
+  T e17 = u_0 - u_1;
+  T e18 = e13 + T{4.50000000000000}*(e17*e17);
+  T e19 = T{3.00000000000000}*e5;
+  f_next[0] = -e0*(e1 + e10 + e2 - e4);
+  f_next[1] = e11*(e12 + e14);
+  f_next[2] = e0*(e12 + e16 + e18);
+  f_next[3] = -e11*(e1 - e19 + e9);
+  f_next[4] = -T{0.444444444444444}*e10*rho;
+  f_next[5] = e11*(e16 + e19 + T{1.00000000000000});
+  f_next[6] = e0*(-e1 + e15 + e18 + e2);
+  f_next[7] = e11*(e14 + e2);
+  f_next[8] = e0*(e13 + e16 + e2 + e4);
+  
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T u_w, WallNormal<1,0,0>) {
+  T rho = -(f_curr[0] + T{2.00000000000000}*f_curr[11] + f_curr[12] + f_curr[14] + T{2.00000000000000}*f_curr[15] + f_curr[16] + f_curr[18] + T{2.00000000000000}*f_curr[1] + f_curr[2] + f_curr[4] + T{2.00000000000000}*f_curr[5] + f_curr[6] + T{2.00000000000000}*f_curr[8] + f_curr[9])/(u_w + T{-1.00000000000000});
+  T u_0 = u_w;
+  T u_1 = 0;
+  T u_2 = 0;
+  T e0 = T{0.0277777777777778}*rho;
+  T e1 = u_1 + u_2;
+  T e2 = T{4.50000000000000}*(e1*e1);
+  T e3 = T{3.00000000000000}*u_2;
+  T e4 = u_1*u_1;
+  T e5 = T{1.50000000000000}*e4;
+  T e6 = -e5;
+  T e7 = u_0*u_0;
+  T e8 = T{1.50000000000000}*e7;
+  T e9 = T{1.00000000000000} - e8;
+  T e10 = e6 + e9;
+  T e11 = e10 + e3;
+  T e12 = T{3.00000000000000}*u_1;
+  T e13 = u_2*u_2;
+  T e14 = T{1.50000000000000}*e13;
+  T e15 = -e14;
+  T e16 = e12 + e15;
+  T e17 = T{3.00000000000000}*u_0;
+  T e18 = -u_2;
+  T e19 = e18 + u_0;
+  T e20 = -T{4.50000000000000}*e19*e19;
+  T e21 = e14 + e5 + T{-1.00000000000000};
+  T e22 = e21 + e8;
+  T e23 = e22 - e3;
+  T e24 = T{0.0555555555555556}*rho;
+  T e25 = T{3.00000000000000}*e13;
+  T e26 = u_0 + u_2;
+  T e27 = T{4.50000000000000}*(e26*e26);
+  T e28 = e15 + e17;
+  T e29 = e18 + u_1;
+  T e30 = -T{4.50000000000000}*e29*e29;
+  T e31 = -e12;
+  T e32 = u_0 - u_1;
+  T e33 = -T{4.50000000000000}*e32*e32;
+  T e34 = e17 + e22;
+  T e35 = T{3.00000000000000}*e4;
+  T e36 = u_0 + u_1;
+  T e37 = T{4.50000000000000}*(e36*e36);
+  T e38 = T{3.00000000000000}*e7;
+  T e39 = e8 + T{-1.00000000000000};
+  T e40 = -e17 + e22;
+  T e41 = e22 + e3;
+  f_next[0] = e0*(e11 + e16 + e2);
+  f_next[1] = -e0*(e17 + e20 + e23);
+  f_next[2] = e24*(e11 + e25);
+  f_next[3] = e0*(e11 + e27 + e28);
+  f_next[4] = -e0*(e12 + e23 + e30);
+  f_next[5] = -e0*(e31 + e33 + e34);
+  f_next[6] = e24*(e16 + e35 + e9);
+  f_next[7] = e0*(e10 + e16 + e17 + e37);
+  f_next[8] = -e24*(e17 + e21 - e38);
+  f_next[9] = -T{0.333333333333333}*e22*rho;
+  f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+  f_next[11] = -e0*(e12 + e34 - e37);
+  f_next[12] = -e24*(e12 + e14 - e35 + e39);
+  f_next[13] = -e0*(e12 + e33 + e40);
+  f_next[14] = -e0*(e30 + e31 + e41);
+  f_next[15] = -e0*(-e27 + e3 + e34);
+  f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+  f_next[17] = -e0*(e20 + e3 + e40);
+  f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T u_w, WallNormal<-1,0,0>) {
+  T rho = (f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/(u_w + T{1.00000000000000});
+  T u_0 = u_w;
+  T u_1 = 0;
+  T u_2 = 0;
+  T e0 = T{0.0277777777777778}*rho;
+  T e1 = u_1 + u_2;
+  T e2 = T{4.50000000000000}*(e1*e1);
+  T e3 = T{3.00000000000000}*u_2;
+  T e4 = u_1*u_1;
+  T e5 = T{1.50000000000000}*e4;
+  T e6 = -e5;
+  T e7 = u_0*u_0;
+  T e8 = T{1.50000000000000}*e7;
+  T e9 = T{1.00000000000000} - e8;
+  T e10 = e6 + e9;
+  T e11 = e10 + e3;
+  T e12 = T{3.00000000000000}*u_1;
+  T e13 = u_2*u_2;
+  T e14 = T{1.50000000000000}*e13;
+  T e15 = -e14;
+  T e16 = e12 + e15;
+  T e17 = T{3.00000000000000}*u_0;
+  T e18 = -u_2;
+  T e19 = e18 + u_0;
+  T e20 = -T{4.50000000000000}*e19*e19;
+  T e21 = e14 + e5 + T{-1.00000000000000};
+  T e22 = e21 + e8;
+  T e23 = e22 - e3;
+  T e24 = T{0.0555555555555556}*rho;
+  T e25 = T{3.00000000000000}*e13;
+  T e26 = u_0 + u_2;
+  T e27 = T{4.50000000000000}*(e26*e26);
+  T e28 = e15 + e17;
+  T e29 = e18 + u_1;
+  T e30 = -T{4.50000000000000}*e29*e29;
+  T e31 = -e12;
+  T e32 = u_0 - u_1;
+  T e33 = -T{4.50000000000000}*e32*e32;
+  T e34 = e17 + e22;
+  T e35 = T{3.00000000000000}*e4;
+  T e36 = u_0 + u_1;
+  T e37 = T{4.50000000000000}*(e36*e36);
+  T e38 = T{3.00000000000000}*e7;
+  T e39 = e8 + T{-1.00000000000000};
+  T e40 = -e17 + e22;
+  T e41 = e22 + e3;
+  f_next[0] = e0*(e11 + e16 + e2);
+  f_next[1] = -e0*(e17 + e20 + e23);
+  f_next[2] = e24*(e11 + e25);
+  f_next[3] = e0*(e11 + e27 + e28);
+  f_next[4] = -e0*(e12 + e23 + e30);
+  f_next[5] = -e0*(e31 + e33 + e34);
+  f_next[6] = e24*(e16 + e35 + e9);
+  f_next[7] = e0*(e10 + e16 + e17 + e37);
+  f_next[8] = -e24*(e17 + e21 - e38);
+  f_next[9] = -T{0.333333333333333}*e22*rho;
+  f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+  f_next[11] = -e0*(e12 + e34 - e37);
+  f_next[12] = -e24*(e12 + e14 - e35 + e39);
+  f_next[13] = -e0*(e12 + e33 + e40);
+  f_next[14] = -e0*(e30 + e31 + e41);
+  f_next[15] = -e0*(-e27 + e3 + e34);
+  f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+  f_next[17] = -e0*(e20 + e3 + e40);
+  f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+};
diff --git a/tangle/LLBM/kernel/executor.h b/tangle/LLBM/kernel/executor.h
new file mode 100644
index 0000000..942918d
--- /dev/null
+++ b/tangle/LLBM/kernel/executor.h
@@ -0,0 +1,171 @@
+#pragma once
+
+#include <LLBM/operator.h>
+
+namespace kernel {
+
+template <typename OPERATOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
+__global__ void call_operator(
+    LatticeView<DESCRIPTOR,S> lattice
+  , std::size_t* cells
+  , std::size_t  cell_count
+  , ARGS... args
+) {
+  const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (!(index < cell_count)) {
+      return;
+  }
+  const std::size_t gid = cells[index];
+
+  S f_curr[DESCRIPTOR::q];
+  S f_next[DESCRIPTOR::q];
+  S* preshifted_f[DESCRIPTOR::q];
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    preshifted_f[iPop] = lattice.pop(iPop, gid);
+    f_curr[iPop] = *preshifted_f[iPop];
+  }
+  OPERATOR::template apply<T,S>(DESCRIPTOR(), f_curr, f_next, gid, std::forward<ARGS>(args)...);
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    *preshifted_f[iPop] = f_next[iPop];
+  }
+}
+
+template <typename OPERATOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
+__global__ void call_operator(
+    LatticeView<DESCRIPTOR,S> lattice
+  , bool* mask
+  , ARGS... args
+) {
+  const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (!(gid < lattice.cuboid.volume) || !mask[gid]) {
+      return;
+  }
+
+  S f_curr[DESCRIPTOR::q];
+  S f_next[DESCRIPTOR::q];
+  S* preshifted_f[DESCRIPTOR::q];
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    preshifted_f[iPop] = lattice.pop(iPop, gid);
+    f_curr[iPop] = *preshifted_f[iPop];
+  }
+  OPERATOR::template apply<T,S>(DESCRIPTOR(), f_curr, f_next, gid, std::forward<ARGS>(args)...);
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    *preshifted_f[iPop] = f_next[iPop];
+  }
+}
+
+template <typename FUNCTOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
+__global__ void call_functor(
+    LatticeView<DESCRIPTOR,S> lattice
+  , bool* mask
+  , ARGS... args
+) {
+  const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (!(gid < lattice.cuboid.volume) || !mask[gid]) {
+      return;
+  }
+
+  S f_curr[DESCRIPTOR::q];
+  S* preshifted_f[DESCRIPTOR::q];
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    preshifted_f[iPop] = lattice.pop(iPop, gid);
+    f_curr[iPop] = *preshifted_f[iPop];
+  }
+  FUNCTOR::template apply<T,S>(DESCRIPTOR(), f_curr, gid, std::forward<ARGS>(args)...);
+}
+
+template <typename DESCRIPTOR, typename T, typename S, typename... OPERATOR>
+__global__ void call_operators(
+    LatticeView<DESCRIPTOR,S> lattice
+  , OPERATOR... ops
+) {
+  const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (!(gid < lattice.cuboid.volume)) {
+      return;
+  }
+
+  S f_curr[DESCRIPTOR::q];
+  S f_next[DESCRIPTOR::q];
+  S* preshifted_f[DESCRIPTOR::q];
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    preshifted_f[iPop] = lattice.pop(iPop, gid);
+    f_curr[iPop] = *preshifted_f[iPop];
+  }
+  (ops.template apply<DESCRIPTOR,T,S>(DESCRIPTOR(), f_curr, f_next, gid) || ... || false);
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    *preshifted_f[iPop] = f_next[iPop];
+  }
+}
+
+template <typename OPERATOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
+__global__ void call_operator_using_list(
+    LatticeView<DESCRIPTOR,S> lattice
+  , std::size_t count
+  , ARGS... args
+) {
+  const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (!(index < count)) {
+      return;
+  }
+  OPERATOR::template apply<T,S>(lattice, index, count, std::forward<ARGS>(args)...);
+}
+
+template <typename OPERATOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
+__global__ void call_operator_using_list(
+    DESCRIPTOR descriptor
+  , std::size_t count
+  , ARGS... args
+) {
+  const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (!(index < count)) {
+      return;
+  }
+  OPERATOR::template apply<T,S>(descriptor, index, count, std::forward<ARGS>(args)...);
+}
+
+template <typename FUNCTOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
+__global__ void call_spatial_functor(
+    LatticeView<DESCRIPTOR,S> lattice
+  , bool* mask
+  , ARGS... args
+) {
+  const std::size_t iX = blockIdx.x * blockDim.x + threadIdx.x;
+  const std::size_t iY = blockIdx.y * blockDim.y + threadIdx.y;
+  const std::size_t iZ = blockIdx.z * blockDim.z + threadIdx.z;
+  if (!(iX < lattice.cuboid.nX && iY < lattice.cuboid.nY && iZ < lattice.cuboid.nZ)) {
+      return;
+  }
+  const std::size_t gid = descriptor::gid(lattice.cuboid,iX,iY,iZ);
+  if (!mask[gid]) {
+      return;
+  }
+
+  S f_curr[DESCRIPTOR::q];
+  S* preshifted_f[DESCRIPTOR::q];
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    preshifted_f[iPop] = lattice.pop(iPop, gid);
+    f_curr[iPop] = *preshifted_f[iPop];
+  }
+  FUNCTOR::template apply<T,S>(DESCRIPTOR(), f_curr, lattice.cuboid, gid, iX, iY, iZ, std::forward<ARGS>(args)...);
+}
+
+template <typename OPERATOR, typename DESCRIPTOR, typename T, typename S, typename... ARGS>
+__global__ void call_spatial_operator(
+    descriptor::Cuboid<DESCRIPTOR> cuboid
+  , bool* mask
+  , ARGS... args
+) {
+  const std::size_t iX = blockIdx.x * blockDim.x + threadIdx.x;
+  const std::size_t iY = blockIdx.y * blockDim.y + threadIdx.y;
+  const std::size_t iZ = blockIdx.z * blockDim.z + threadIdx.z;
+  if (!(iX < cuboid.nX && iY < cuboid.nY && iZ < cuboid.nZ)) {
+      return;
+  }
+  const std::size_t gid = descriptor::gid(cuboid,iX,iY,iZ);
+  if (!mask[gid]) {
+      return;
+  }
+  OPERATOR::template apply<T,S>(DESCRIPTOR(), gid, iX, iY, iZ, std::forward<ARGS>(args)...);
+}
+
+}
diff --git a/tangle/LLBM/kernel/free_slip.h b/tangle/LLBM/kernel/free_slip.h
new file mode 100644
index 0000000..25d7344
--- /dev/null
+++ b/tangle/LLBM/kernel/free_slip.h
@@ -0,0 +1,82 @@
+#pragma once
+#include <LLBM/call_tag.h>
+#include <LLBM/wall.h>
+#include <LLBM/descriptor.h>
+
+struct BounceBackFreeSlipO {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, WallNormal<1,0>) {
+  f_next[0] = f_curr[6];
+  f_next[1] = f_curr[7];
+  f_next[2] = f_curr[8];
+  f_next[3] = f_curr[3];
+  f_next[4] = f_curr[4];
+  f_next[5] = f_curr[5];
+  f_next[6] = f_curr[0];
+  f_next[7] = f_curr[1];
+  f_next[8] = f_curr[2];
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, WallNormal<0,1>) {
+  f_next[0] = f_curr[2];
+  f_next[1] = f_curr[1];
+  f_next[2] = f_curr[0];
+  f_next[3] = f_curr[5];
+  f_next[4] = f_curr[4];
+  f_next[5] = f_curr[3];
+  f_next[6] = f_curr[8];
+  f_next[7] = f_curr[7];
+  f_next[8] = f_curr[6];
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, WallNormal<0,1,0>) {
+  f_next[0] = f_curr[4];
+  f_next[1] = f_curr[1];
+  f_next[2] = f_curr[2];
+  f_next[3] = f_curr[3];
+  f_next[4] = f_curr[0];
+  f_next[5] = f_curr[11];
+  f_next[6] = f_curr[12];
+  f_next[7] = f_curr[13];
+  f_next[8] = f_curr[8];
+  f_next[9] = f_curr[9];
+  f_next[10] = f_curr[10];
+  f_next[11] = f_curr[5];
+  f_next[12] = f_curr[6];
+  f_next[13] = f_curr[7];
+  f_next[14] = f_curr[18];
+  f_next[15] = f_curr[15];
+  f_next[16] = f_curr[16];
+  f_next[17] = f_curr[17];
+  f_next[18] = f_curr[14];
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, WallNormal<0,0,1>) {
+  f_next[0] = f_curr[14];
+  f_next[1] = f_curr[15];
+  f_next[2] = f_curr[16];
+  f_next[3] = f_curr[17];
+  f_next[4] = f_curr[18];
+  f_next[5] = f_curr[5];
+  f_next[6] = f_curr[6];
+  f_next[7] = f_curr[7];
+  f_next[8] = f_curr[8];
+  f_next[9] = f_curr[9];
+  f_next[10] = f_curr[10];
+  f_next[11] = f_curr[11];
+  f_next[12] = f_curr[12];
+  f_next[13] = f_curr[13];
+  f_next[14] = f_curr[0];
+  f_next[15] = f_curr[1];
+  f_next[16] = f_curr[2];
+  f_next[17] = f_curr[3];
+  f_next[18] = f_curr[4];
+}
+
+};
diff --git a/tangle/LLBM/kernel/initialize.h b/tangle/LLBM/kernel/initialize.h
new file mode 100644
index 0000000..221b9ad
--- /dev/null
+++ b/tangle/LLBM/kernel/initialize.h
@@ -0,0 +1,44 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct InitializeO {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid) {
+  f_next[0] = T{0.0277777777777778};
+  f_next[1] = T{0.111111111111111};
+  f_next[2] = T{0.0277777777777778};
+  f_next[3] = T{0.111111111111111};
+  f_next[4] = T{0.444444444444444};
+  f_next[5] = T{0.111111111111111};
+  f_next[6] = T{0.0277777777777778};
+  f_next[7] = T{0.111111111111111};
+  f_next[8] = T{0.0277777777777778};
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid) {
+  f_next[0] = T{0.0277777777777778};
+  f_next[1] = T{0.0277777777777778};
+  f_next[2] = T{0.0555555555555556};
+  f_next[3] = T{0.0277777777777778};
+  f_next[4] = T{0.0277777777777778};
+  f_next[5] = T{0.0277777777777778};
+  f_next[6] = T{0.0555555555555556};
+  f_next[7] = T{0.0277777777777778};
+  f_next[8] = T{0.0555555555555556};
+  f_next[9] = T{0.333333333333333};
+  f_next[10] = T{0.0555555555555556};
+  f_next[11] = T{0.0277777777777778};
+  f_next[12] = T{0.0555555555555556};
+  f_next[13] = T{0.0277777777777778};
+  f_next[14] = T{0.0277777777777778};
+  f_next[15] = T{0.0277777777777778};
+  f_next[16] = T{0.0555555555555556};
+  f_next[17] = T{0.0277777777777778};
+  f_next[18] = T{0.0277777777777778};
+}
+
+};
diff --git a/tangle/LLBM/kernel/propagate.h b/tangle/LLBM/kernel/propagate.h
new file mode 100644
index 0000000..08f78ea
--- /dev/null
+++ b/tangle/LLBM/kernel/propagate.h
@@ -0,0 +1,20 @@
+#pragma once
+
+template <typename DESCRIPTOR, typename S>
+class LatticeView;
+
+template <typename DESCRIPTOR, typename S>
+__global__ void propagate(LatticeView<DESCRIPTOR,S> lattice, S** base, std::size_t size) {
+
+  for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+    std::ptrdiff_t shift = -descriptor::offset<DESCRIPTOR>(lattice.cuboid, iPop);
+
+    lattice.population[iPop] += shift;
+
+    if (lattice.population[iPop] < base[iPop]) {
+      lattice.population[iPop] += size;
+    } else if (lattice.population[iPop] + size > base[iPop] + 2*size) {
+      lattice.population[iPop] -= size;
+    }
+  }
+}
diff --git a/tangle/LLBM/kernel/smagorinsky_collide.h b/tangle/LLBM/kernel/smagorinsky_collide.h
new file mode 100644
index 0000000..3489479
--- /dev/null
+++ b/tangle/LLBM/kernel/smagorinsky_collide.h
@@ -0,0 +1,183 @@
+#pragma once
+#include <LLBM/call_tag.h>
+
+struct SmagorinskyBgkCollideO {
+
+using call_tag = tag::call_by_cell_id;
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T tau, T smagorinsky) {
+  T x0 = f_curr[1] + f_curr[2];
+  T x1 = f_curr[3] + f_curr[6];
+  T x2 = x0 + x1 + f_curr[0] + f_curr[4] + f_curr[5] + f_curr[7] + f_curr[8];
+  T x3 = f_curr[0] - f_curr[8];
+  T x4 = T{1} / (x2);
+  T x5 = T{72.0000000000000}*f_curr[2];
+  T x6 = T{72.0000000000000}*f_curr[6];
+  T rho = x2;
+  T x31 = T{4.00000000000000}*rho;
+  T x40 = T{2.00000000000000}*rho;
+  T u_0 = -x4*(x0 + x3 - f_curr[6] - f_curr[7]);
+  T x7 = T{6.00000000000000}*u_0;
+  T x8 = -x7;
+  T x15 = u_0*u_0;
+  T x16 = T{3.00000000000000}*x15;
+  T x17 = -x16;
+  T x27 = x16 + T{-2.00000000000000};
+  T u_1 = -x4*(x1 + x3 - f_curr[2] - f_curr[5]);
+  T x9 = u_0 - u_1;
+  T x10 = T{9.00000000000000}*(x9*x9);
+  T x11 = u_1*u_1;
+  T x12 = T{3.00000000000000}*x11;
+  T x13 = T{2.00000000000000} - x12;
+  T x14 = T{6.00000000000000}*u_1;
+  T x18 = x14 + x17;
+  T x19 = x13 + x18;
+  T x20 = x10 + x19 + x8;
+  T x21 = rho*x20;
+  T x22 = x10 + x13 - x14 + x17 + x7;
+  T x23 = rho*x22;
+  T x24 = u_0 + u_1;
+  T x25 = T{9.00000000000000}*(x24*x24);
+  T x26 = x19 + x25 + x7;
+  T x28 = x12 + x27;
+  T x29 = x14 - x25 + x28 + x7;
+  T x30 = rho*x26 - rho*x29 - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[8];
+  T x32 = x13 + T{6.00000000000000}*x15;
+  T x33 = x32 + x8;
+  T x34 = x32 + x7;
+  T x35 = x21 + x23 + x30 - x5 - x6;
+  T x36 = T{6.00000000000000}*x11;
+  T x37 = x14 + x27 - x36;
+  T x38 = x18 + x36 + T{2.00000000000000};
+  T x39 = T{1} / (tau + sqrt(T{0.707106781186548}*(smagorinsky*smagorinsky)*sqrt((-x21 - x23 + x30 + x5 + x6)*(-x21 - x23 + x30 + x5 + x6) + T{0.500000000000000}*((x31*x33 + x31*x34 + x35 - 72*f_curr[1] - 72*f_curr[7])*(x31*x33 + x31*x34 + x35 - 72*f_curr[1] - 72*f_curr[7])) + T{0.500000000000000}*((-x31*x37 + x31*x38 + x35 - 72*f_curr[3] - 72*f_curr[5])*(-x31*x37 + x31*x38 + x35 - 72*f_curr[3] - 72*f_curr[5]))) + tau*tau));
+  f_next[0] = -T{0.0138888888888889}*x39*(x29*x40 + T{144.000000000000}*f_curr[0]) + f_curr[0];
+  f_next[1] = T{0.0555555555555556}*x39*(x33*x40 - T{36.0000000000000}*f_curr[1]) + f_curr[1];
+  f_next[2] = T{0.0138888888888889}*x39*(x20*x40 - T{144.000000000000}*f_curr[2]) + f_curr[2];
+  f_next[3] = -T{0.0555555555555556}*x39*(x37*x40 + T{36.0000000000000}*f_curr[3]) + f_curr[3];
+  f_next[4] = -T{0.111111111111111}*x39*(T{4.00000000000000}*rho*x28 + T{18.0000000000000}*f_curr[4]) + f_curr[4];
+  f_next[5] = T{0.0555555555555556}*x39*(x38*x40 - T{36.0000000000000}*f_curr[5]) + f_curr[5];
+  f_next[6] = T{0.0138888888888889}*x39*(x22*x40 - T{144.000000000000}*f_curr[6]) + f_curr[6];
+  f_next[7] = T{0.0555555555555556}*x39*(x34*x40 - T{36.0000000000000}*f_curr[7]) + f_curr[7];
+  f_next[8] = T{0.0138888888888889}*x39*(x26*x40 - T{144.000000000000}*f_curr[8]) + f_curr[8];
+  
+}
+
+template <typename T, typename S>
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T tau, T smagorinsky) {
+  T x0 = f_curr[10] + f_curr[13] + f_curr[17];
+  T x1 = f_curr[14] + f_curr[5] + f_curr[6];
+  T x2 = f_curr[1] + f_curr[2] + f_curr[4];
+  T x3 = x0 + x1 + x2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9];
+  T x4 = -f_curr[11] + f_curr[7];
+  T x5 = -f_curr[15] + f_curr[3];
+  T x6 = T{1} / (x3);
+  T x7 = f_curr[0] - f_curr[18];
+  T x8 = T{72.0000000000000}*f_curr[5];
+  T x9 = T{72.0000000000000}*f_curr[13];
+  T x42 = T{72.0000000000000}*f_curr[1];
+  T x43 = T{72.0000000000000}*f_curr[17];
+  T x61 = T{72.0000000000000}*f_curr[4];
+  T x62 = T{72.0000000000000}*f_curr[14];
+  T rho = x3;
+  T x74 = T{2.00000000000000}*rho;
+  T x89 = T{2.00000000000000}*rho;
+  T u_0 = x6*(x0 + x4 + x5 - f_curr[1] - f_curr[5] - f_curr[8]);
+  T x12 = -u_0;
+  T x14 = T{6.00000000000000}*u_0;
+  T x17 = u_0*u_0;
+  T x18 = T{3.00000000000000}*x17;
+  T x26 = -x14;
+  T x35 = T{2.00000000000000} - x18;
+  T x75 = T{6.00000000000000}*x17;
+  T u_1 = x6*(x1 + x4 + x7 - f_curr[12] - f_curr[13] - f_curr[4]);
+  T x10 = T{6.00000000000000}*u_1;
+  T x11 = -x10;
+  T x13 = x12 + u_1;
+  T x19 = u_1*u_1;
+  T x20 = T{3.00000000000000}*x19;
+  T x21 = x18 + x20 + T{-2.00000000000000};
+  T x27 = -u_1;
+  T x28 = x27 + u_0;
+  T x32 = u_0 + u_1;
+  T x33 = T{9.00000000000000}*(x32*x32);
+  T x34 = -x20;
+  T x36 = x34 + x35;
+  T x81 = T{6.00000000000000}*x19;
+  T u_2 = x6*(x2 + x5 + x7 - f_curr[14] - f_curr[16] - f_curr[17]);
+  T x15 = u_2*u_2;
+  T x16 = T{3.00000000000000}*x15;
+  T x22 = x16 + x21;
+  T x23 = x14 + x22;
+  T x24 = x11 + x23 - T{9.00000000000000}*x13*x13;
+  T x25 = rho*x24;
+  T x29 = x10 + x22;
+  T x30 = x26 + x29 - T{9.00000000000000}*x28*x28;
+  T x31 = rho*x30;
+  T x37 = -x16;
+  T x38 = x10 + x37;
+  T x39 = x14 + x33 + x36 + x38;
+  T x40 = x14 + x29 - x33;
+  T x41 = rho*x39 - rho*x40 - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7];
+  T x44 = T{6.00000000000000}*u_2;
+  T x45 = -x44;
+  T x46 = x12 + u_2;
+  T x47 = x23 + x45 - T{9.00000000000000}*x46*x46;
+  T x48 = rho*x47;
+  T x49 = -u_2;
+  T x50 = x49 + u_0;
+  T x51 = x22 + x44;
+  T x52 = x26 + x51 - T{9.00000000000000}*x50*x50;
+  T x53 = rho*x52;
+  T x54 = u_0 + u_2;
+  T x55 = T{9.00000000000000}*(x54*x54);
+  T x56 = x36 + x44;
+  T x57 = x14 + x37;
+  T x58 = x55 + x56 + x57;
+  T x59 = x14 + x51 - x55;
+  T x60 = rho*x58 - rho*x59 - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3];
+  T x63 = x27 + u_2;
+  T x64 = x29 + x45 - T{9.00000000000000}*x63*x63;
+  T x65 = rho*x64;
+  T x66 = x49 + u_1;
+  T x67 = x11 + x51 - T{9.00000000000000}*x66*x66;
+  T x68 = rho*x67;
+  T x69 = u_1 + u_2;
+  T x70 = T{9.00000000000000}*(x69*x69);
+  T x71 = x38 + x56 + x70;
+  T x72 = x29 + x44 - x70;
+  T x73 = rho*x71 - rho*x72 - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18];
+  T x76 = x16 + T{-2.00000000000000};
+  T x77 = x14 + x20 - x75 + x76;
+  T x78 = x34 + x57 + x75 + T{2.00000000000000};
+  T x79 = -x42 - x43 - x48 - x53 + x60;
+  T x80 = -x25 - x31 + x41 - x8 - x9;
+  T x82 = x10 + x18 + x76 - x81;
+  T x83 = x35 + x38 + x81;
+  T x84 = -x61 - x62 - x65 - x68 + x73;
+  T x85 = T{6.00000000000000}*x15;
+  T x86 = x21 + x44 - x85;
+  T x87 = x56 + x85;
+  T x88 = T{1} / (tau + sqrt(T{0.707106781186548}*(smagorinsky*smagorinsky)*sqrt((x25 + x31 + x41 + x8 + x9)*(x25 + x31 + x41 + x8 + x9) + (x42 + x43 + x48 + x53 + x60)*(x42 + x43 + x48 + x53 + x60) + (x61 + x62 + x65 + x68 + x73)*(x61 + x62 + x65 + x68 + x73) + T{0.500000000000000}*((-x74*x77 + x74*x78 + x79 + x80 - 72*f_curr[10] - 72*f_curr[8])*(-x74*x77 + x74*x78 + x79 + x80 - 72*f_curr[10] - 72*f_curr[8])) + T{0.500000000000000}*((-x74*x82 + x74*x83 + x80 + x84 - 72*f_curr[12] - 72*f_curr[6])*(-x74*x82 + x74*x83 + x80 + x84 - 72*f_curr[12] - 72*f_curr[6])) + T{0.500000000000000}*((-x74*x86 + x74*x87 + x79 + x84 - 72*f_curr[16] - 72*f_curr[2])*(-x74*x86 + x74*x87 + x79 + x84 - 72*f_curr[16] - 72*f_curr[2]))) + tau*tau));
+  f_next[0] = T{0.0138888888888889}*x88*(x71*x89 - T{144.000000000000}*f_curr[0]) + f_curr[0];
+  f_next[1] = -T{0.0138888888888889}*x88*(x47*x89 + T{144.000000000000}*f_curr[1]) + f_curr[1];
+  f_next[2] = T{0.0277777777777778}*x88*(x87*x89 - T{72.0000000000000}*f_curr[2]) + f_curr[2];
+  f_next[3] = T{0.0138888888888889}*x88*(x58*x89 - T{144.000000000