From 4ec94c97879aafef15f7663135745e4ba61e62cf Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Mon, 17 May 2021 00:15:33 +0200
Subject: Extract first public LiterateLB version
---
lbm.org | 5817 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 5817 insertions(+)
create mode 100644 lbm.org
(limited to 'lbm.org')
diff --git a/lbm.org b/lbm.org
new file mode 100644
index 0000000..6df9a50
--- /dev/null
+++ b/lbm.org
@@ -0,0 +1,5817 @@
+#+TITLE: A literate Lattice Boltzmann Code
+#+SUBTITLE: [[https://kummerlaender.eu][Adrian Kummerländer]]
+#+STARTUP: latexpreview
+#+HTML_DOCTYPE: html5
+#+OPTIONS: toc:nil html-postamble:nil html5-fancy:t html-style:nil
+#+PROPERTY: header-args :exports both :mkdirp yes :noweb no-export :eval no-export
+#+PROPERTY: header-args:python+ :var lattice="D2Q9"
+#+PROPERTY: header-args:cpp+ :main no :eval no
+#+HTML_MATHJAX: path:"https://static.kummerlaender.eu/mathjax/MathJax.js?config=TeX-AMS_HTML"
+
+#+HTML_HEAD:
+
+#+BEGIN_ABSTRACT
+This file describes a full Lattice Boltzmann code featuring both 2D and 3D lattices, a workable selection of boundary conditions, Smagorinsky
+turbulence modelling, expression-level code optimization and even a full ray marcher for just-in-time volumetric visualization in addition
+to a set of interesting examples. All of this runs on GPUs using CUDA near the maximum possible performance on that platform.
+
+*This document is a [[*Open tasks][work in progress]].*
+#+BEGIN_EXPORT html
+
+#+END_EXPORT
+#+END_ABSTRACT
+#+TOC: headlines 2
+
+* Preamble :noexport:
+#+BEGIN_SRC python :session :results none
+from sympy import *
+from sympy.codegen.ast import CodeBlock, Assignment
+from mako.template import Template
+import itertools
+
+def printlatexpr(*exprs):
+    """Print SymPy Assignment expressions as a LaTeX align* environment."""
+    print("$$\\begin{align*}")
+    for expr in exprs:
+        print(f"{latex(expr.lhs)} &:= {latex(expr.rhs)} \\\\")
+
+    print("\\end{align*}$$")
+
+def multirange(*ranges):
+    """Cartesian product of range(r) for each given bound r."""
+    return itertools.product(*[ range(r) for r in ranges ])
+
+def multisum(f, *ranges):
+    """Sum f over the full multi-index space spanned by `ranges`."""
+    return sum([ f(*idx) for idx in multirange(*ranges) ])
+
+# Registry mapping lattice names (e.g. "D2Q9") to Descriptor instances.
+descriptor = { }
+#+END_SRC
+
+#+NAME: eval-using-descriptor
+#+BEGIN_SRC python :session :results output :var src=""
+# Render the Mako template `src` against the currently selected lattice
+# descriptor; silently does nothing if that lattice has not been loaded yet.
+if lattice in descriptor:
+    D = descriptor[lattice]
+    print(Template(src).render(**locals()))
+#+END_SRC
+
+#+RESULTS: eval-using-descriptor
+
+* Introduction
+Approaches to modelling fluid dynamics can be coarsely grouped into three categories: Microscopic, Mesoscopic and Macroscopic models.
+Microscopic models describe the behavior of the individual /particles/ of which the macroscopic fluid movement is an emergent property.
+On the other side of the spectrum, macroscopic models consider only the large scale properties of a fluid such as its velocity and density
+fields. One could say that microscopic particle models are closest to what actually happens in nature and macroscopic models in the form
+of the /Navier Stokes equations (NSE)/ represent an ideal vision of a fluid that only holds for /large enough/ space scales.
+As is the case for most things, what we call a /fluid/ is by itself already an abstraction.
+
+Mesoscopic models sit between these two extremes and consider neither individual particles nor purely macroscopic properties but rather
+the probability of some amount of particles moving with some velocity into a certain direction at various points in time and space. The
+mathematical field of /kinetic theory/ provides a foundation for linking all of these three models.
+
+** Kinetic theory
+One can use kinetic theory to get from a microscopic or /molecular/ system governed by e.g. Newton's laws of motion to macroscopic
+descriptions of certain properties of such a system. For example one can reach the diffusion equation for /billiard systems/ of particles
+colliding with solid obstacles or to the Navier Stokes equations for systems of pairwise colliding particles.
+
+In very rough terms one starts out by considering the microscopic system and taking its limit for infinitely many particles of
+vanishingly small size. This yields a /kinetic equation/ that describes the behaviour of the particle distribution function
+under some collision operator for any point in space and time. When we consider pairwise colliding particles the result
+turns out to be the well known /Boltzmann transport equation/:
+
+$$(\partial_t + v \cdot \nabla_x) f(t,x,v) = \Omega (f).$$
+
+This PDE describes the behavior of the probability $f(t,x,v)$ that particles move into direction $v$ at location $x$ and time $t$ for
+some collision operator $\Omega$. Discretizing this equation on a lattice for a finite set of discrete velocities is called the /Lattice Boltzmann Method (LBM)/.
+
+The practical usefulness of LBM hinges on the ability of reaching the desired /target equations/
+such as NSE or the heat equation as a limit of the kinetic equation.
+
+** Lattice Boltzmann Method
+We discretize the Boltzmann equation in spatial, velocity and temporal space on a cartesian lattice using discrete velocities
+$\xi_i$ that point to the neighbors of each cell.
+
+$$(\partial_t + v \cdot \nabla_x)f = \Omega(f) \approx f_i(x + \xi_i, t + 1) - f_i(x,t) = \Omega_i(x,t)$$
+
+A common collision operator $\Omega$ for Navier-Stokes approximation is given by Bhatnagar, Gross and Krook (BGK).
+
+$$\Omega(f) := -\frac{f - f^\text{eq}}{\tau}$$
+
+This BGK operator /relaxes/ the local population $f$ towards some equilibrium distribution $f^\text{eq}$ with rate $\tau$. Inserting this operator
+into the discrete Boltzmann equation yields the discrete LBM BGK equation.
+
+$$f_i(x + \xi_i, t + 1) = f_i(x,t) - \frac{1}{\tau} (f_i(x,t) - f_i^\text{eq}(x,t))$$
+
+This explicit form exposes the separation of the LBM algorithm into a local collision step followed by a streaming step.
+The collision step relaxes the population of each cell towards its local equilibrium and the streaming step propagates the resulting
+updated population values to the respective neighboring cells.
+
+Macroscopic moments such as density and velocity can be calculated directly from the distribution functions.
+
+$$\rho := \sum_{i=0}^{q-1} f_i(x,t) \text{ and } u := \frac{1}{\rho} \sum_{i=0}^{q-1} \xi_i f_i(x,t)$$
+
+The equilibrium distribution $f_i^\text{eq}$ for these macroscopic properties is given by
+
+$$f_i^\text{eq}(x,t) := \omega_i \rho \left( 1 + \frac{u \cdot \xi_i}{c_s^2} + \frac{(u \cdot \xi_i)^2}{2c_s^4} - \frac{u \cdot u}{2c_s^2} \right)$$
+
+using lattice-dependent weights $\omega_i$ and speed of sound $c_s$. Note that this is just one possible discretization of the Boltzmann
+equation -- nothing stops us from using different sets of velocities, relaxation times, collision operators and so on.
+Changing these things is how different physics are modeled in LBM. e.g. what we will do in the section on
+[[*Smagorinsky BGK collision][Smagorinsky turbulence modelling]] is to locally change the relaxation time.
+
+To summarize what we are going to do for the simplest bulk fluid case: First calculate the current density and velocity
+moments of a cell, then compute the matching equilibrium distributions and finally perform the local BGK collision to
+update the cell populations.
+The last step before we start again is to propagate the post-collision values to the corresponding neighbor cells.
+
+Special care has to be taken for the boundary conditions at the lattice frontier and around any obstacle geometries.
+Such boundary conditions are one of the major topics of LBM research with a very rich toolbox of specialized collision
+steps for modeling e.g. inflow, outflow, solid or moving walls of various kinds.
+
+* Lattice
+The cartesian grids used for the spatial discretization are commonly described as =DXQY= lattices where =X= is the number of spatial dimensions
+and =Y= is the number of discrete velocities $\xi_i$. e.g. a =D2Q9= lattice is two dimensional and stores nine population values per cell. Each population has
+an associated weight $\omega_i$ that in a sense controls its impact on the collision step. Additionally we also require the lattice speed of sound $c_s$ which is
+the speed of information propagation within the lattice.
+
+#+BEGIN_SRC python :session :results none
+from fractions import Fraction
+
+class Descriptor:
+    """Symbolic DdQq lattice descriptor: velocities, weights and speed of sound."""
+    def __init__(self, name, data):
+        self.name = name
+        # Column 0 of each table row holds the direction tuple as a string,
+        # e.g. "(-1, 0)"; eval turns it into a tuple for the SymPy Matrix.
+        self.c = [ Matrix(eval(row[0])) for row in data ]
+        # Column 1 holds the weight as a fraction string, e.g. "1/36";
+        # round-trip through fractions.Fraction for an exact SymPy Rational.
+        self.w = [ Rational(f.numerator, f.denominator)
+                   for f in [ Fraction(row[1]) for row in data ] ]
+
+        self.d = self.c[0].shape[0]      # spatial dimension
+        self.q = len(self.c)             # number of discrete velocities
+        self.c_s = sqrt(Rational(1,3))   # lattice speed of sound
+#+END_SRC
+
+All of these constants have to be accessible for symbolic computations which is why we store them within a so called descriptor class.
+For convenience we write out the directions and weights as plain tables that are then read into the Python structure.
+
+#+NAME: load-descriptor
+#+BEGIN_SRC python :session :results output :var data=D2Q9
+# Instantiate and register the descriptor built from the org table `data`.
+descriptor[lattice] = Descriptor(lattice, data)
+# Sanity check: the weights of a consistent lattice must sum to one.
+print(f"Loaded D{descriptor[lattice].d}Q{descriptor[lattice].q} lattice with a weight sum of {sum(descriptor[lattice].w)}.")
+#+END_SRC
+
+#+RESULTS: load-descriptor
+: Loaded D2Q9 lattice with a weight sum of 1.
+
+#+CALL: load-descriptor(lattice="D3Q19", data=D3Q19)
+
+#+RESULTS:
+: Loaded D3Q19 lattice with a weight sum of 1.
+
+** D2Q9
+#+NAME: D2Q9
+| Direction | Weight |
+|-----------+--------|
+| (-1,-1) | 1/36 |
+| (-1, 0) | 1/9 |
+| (-1, 1) | 1/36 |
+| ( 0,-1) | 1/9 |
+| ( 0, 0) | 4/9 |
+| ( 0, 1) | 1/9 |
+| ( 1,-1) | 1/36 |
+| ( 1, 0) | 1/9 |
+| ( 1, 1) | 1/36 |
+
+** D3Q19
+#+NAME: D3Q19
+| Direction | Weight |
+|------------+--------|
+| ( 0, 1, 1) | 1/36 |
+| (-1, 0, 1) | 1/36 |
+| ( 0, 0, 1) | 1/18 |
+| ( 1, 0, 1) | 1/36 |
+| ( 0,-1, 1) | 1/36 |
+| (-1, 1, 0) | 1/36 |
+| ( 0, 1, 0) | 1/18 |
+| ( 1, 1, 0) | 1/36 |
+| (-1, 0, 0) | 1/18 |
+| ( 0, 0, 0) | 1/3 |
+| ( 1, 0, 0) | 1/18 |
+| (-1,-1, 0) | 1/36 |
+| ( 0,-1, 0) | 1/18 |
+| ( 1,-1, 0) | 1/36 |
+| ( 0, 1,-1) | 1/36 |
+| (-1, 0,-1) | 1/36 |
+| ( 0, 0,-1) | 1/18 |
+| ( 1, 0,-1) | 1/36 |
+| ( 0,-1,-1) | 1/36 |
+
+* Collision steps
+While the streaming step of the LBM algorithm only propagates the populations between cells in a uniform fashion,
+the collision step determines the actual values those populations take. This means that the physical behaviour
+modelled by a given LBM algorithm is determined primarily by the collision step.
+
+In this section we are going to generate the code for bulk collisions. i.e. the collisions that model fluid and other transport
+phenomena apart from domain boundaries. Those will be handled at a later point by special purpose collision steps called
+/boundary conditions/.
+
+** Code printing
+Before we can get started on constructing expression trees and generating code from them we need to
+setup some basics so SymPy actually generates something we can compile in our environment.
+
+In order to get more fine grained control we need to overload an appropriate SymPy C code printer class.
+This allows us to e.g. easily print indexed expressions that access the population array in the correct way
+or to explicitly type float constants according to a compile-time template type.
+
+#+BEGIN_SRC python :session :results none
+from sympy.printing.ccode import C11CodePrinter
+
+class CodeBlockPrinter(C11CodePrinter):
+    """C code printer tailored to the generated LBM kernels.
+
+    Indexed bases whose name starts with 'f' (population arrays) are printed
+    as real array accesses, all other indexed symbols as suffixed scalars;
+    float literals are wrapped in the compile-time template type T.
+    """
+    def __init__(self, custom_assignment, custom_functions):
+        super(CodeBlockPrinter, self).__init__()
+        self._default_settings['contract'] = False
+        self.custom_assignment = custom_assignment
+        # Register user-provided runtime functions so SymPy emits plain
+        # calls instead of "not supported" comments.
+        for f in custom_functions:
+            self._kf[f] = f
+
+    def _print_Indexed(self, expr):
+        assert len(expr.indices) == 1
+        if expr.base.name[0] == 'f':
+            # Population bases are C arrays: f_curr[3]
+            return f"{expr.base.name}[{expr.indices[0]}]"
+        else:
+            # Everything else becomes a flat scalar name: u_0, u_1, ...
+            return f"{expr.base.name}_{expr.indices[0]}"
+
+    def _print_Float(self, flt):
+        # Explicitly type float constants via the template type T.
+        return "T{%s}" % str(flt.evalf())
+
+    def _print_Pow(self, expr):
+        if expr.exp == -1:
+            # Print reciprocals as a division instead of pow(x, -1).
+            return "T{1} / (%s)" % self.doprint(expr.base)
+        else:
+            return super()._print_Pow(expr)
+
+    def _print_Assignment(self, expr):
+        # Assignments into population arrays write in place; any other
+        # assignment target is declared as a new scalar of type T.
+        if self.custom_assignment and expr.lhs.is_Indexed and expr.lhs.base.name[0] == 'f':
+            return f"{self.doprint(expr.lhs)} = {self.doprint(expr.rhs.evalf())};"
+        else:
+            return f"T {self.doprint(expr.lhs)} = {self.doprint(expr.rhs.evalf())};"
+#+END_SRC
+
+For convenience the instantiation of this class is hidden in a =printcode= function that we can use everywhere.
+
+#+BEGIN_SRC python :session :results none
+# NOTE(review): the mutable default [] is never mutated here, so it is safe,
+# but an immutable () would be the more idiomatic default.
+def printcode(expr, custom_assignment=True, custom_functions=[]):
+    """Print `expr` as C code using a freshly instantiated CodeBlockPrinter."""
+    print(CodeBlockPrinter(custom_assignment, custom_functions).doprint(expr))
+#+END_SRC
+
+The additional assignment parameter allows us to control whether the targets of assignment expressions should be
+instantiated as a new scalar variable in addition while the functions parameter allows us to make custom runtime
+functions known to SymPy. If we don't do this for a function =f= that our assignment expression contains a reference
+to, the printer will generate unwanted comments to reflect this.
+
+*** Custom expression transformations
+We do not want to use the =pow= function for squares in the generated code. This can be achieved by providing
+a custom =ReplaceOptim= structure during the CSE optimization step that conditionally resolves =Pow= expressions.
+
+#+BEGIN_SRC python :session :results none
+from sympy.codegen.rewriting import ReplaceOptim
+
+# Rewrite x**2 into x*x during CSE so the generated C code avoids pow().
+expand_pos_square = ReplaceOptim(
+    lambda e: e.is_Pow and e.exp.is_integer and e.exp == 2,
+    lambda p: UnevaluatedExpr(Mul(p.base, p.base, evaluate = False))
+)
+
+# Apply the square expansion both as pre- and post-processing step around
+# SymPy's default CSE optimizations.
+custom_opti = cse_main.basic_optimizations + [
+    (expand_pos_square, expand_pos_square)
+]
+#+END_SRC
+
+#+BEGIN_SRC python :session :results none
+def cse(block, symbol_prefix = 'x'):
+    """Common subexpression elimination using our pow-free optimization set."""
+    return block.cse(symbols = numbered_symbols(prefix=symbol_prefix), optimizations = custom_opti, order = 'none')
+#+END_SRC
+
+** Moments
+#+BEGIN_SRC python :session :results none
+# Placeholder symbols: i, j index velocity directions, d and q stand for the
+# lattice dimensions, xi is the abstract discrete velocity set.
+i, j = symbols('i, j')
+d, q = symbols('d, q')
+xi = IndexedBase('xi')
+#+END_SRC
+
+To start we define placeholders for the spatial and discrete velocity dimensions as well as the velocity set $\xi$ of
+some lattice. As the moments are constructed using the lattice populations $f$ we also require placeholders
+for those in addition to the moments $\rho$ and $u$ themselves.
+
+#+BEGIN_SRC python :session :results none
+f = IndexedBase('f')       # cell populations
+rho = Symbol('rho')        # density moment
+u = IndexedBase('u', d)    # velocity moment (d components)
+u_i = Symbol('u_i')        # a single velocity component
+#+END_SRC
+
+We are now ready to formulate the density moment which is simply the sum of a cell's populations.
+
+#+BEGIN_SRC python :session :results output replace :wrap latex
+def rho_from_f(f, q):
+    """Density moment: the sum of all q populations of a cell."""
+    return Assignment(rho, Sum(f[j], (j, 0, q-1)))
+
+printlatexpr(rho_from_f(f, q))
+#+END_SRC
+
+#+RESULTS:
+#+begin_latex
+$$\begin{align*}
+\rho &:= \sum_{j=0}^{q - 1} {f}_{j} \\
+\end{align*}$$
+#+end_latex
+
+Next we build the velocity moment function. The i-th component of the velocty $u$ is the sum of all
+relevant populations divided by the density. In this context /relevant/ populations are all values $f_j$
+for which the i-th component of velocity $\xi_j$ is non-zero.
+
+#+BEGIN_SRC python :session :results output replace :wrap latex
+def u_i_from_f(f, q, xi):
+    """i-th velocity component: momentum sum divided by the density sum."""
+    return Assignment(u_i, Sum(xi[j,i] * f[j], (j, 0, q-1)) / Sum(f[j], (j, 0, q-1)))
+
+printlatexpr(u_i_from_f(f, q, xi))
+#+END_SRC
+
+#+RESULTS:
+#+begin_latex
+$$\begin{align*}
+u_{i} &:= \frac{\sum_{j=0}^{q - 1} {f}_{j} {\xi}_{j,i}}{\sum_{j=0}^{q - 1} {f}_{j}} \\
+\end{align*}$$
+#+end_latex
+
+Both to illustrate what we are actually going to compute for a given lattice cell and as the
+next step towards code generation we now want to /realize/ our abstract moment expressions
+for a concrete lattice.
+
+#+BEGIN_SRC python :session :results output :wrap latex
+def realize_rho(D):
+    """Expand the abstract density sum for the concrete lattice D."""
+    return rho_from_f(f, D.q).doit()
+
+D = descriptor[lattice]
+printlatexpr(realize_rho(D))
+#+END_SRC
+
+#+RESULTS:
+#+begin_latex
+$$\begin{align*}
+\rho &:= {f}_{0} + {f}_{1} + {f}_{2} + {f}_{3} + {f}_{4} + {f}_{5} + {f}_{6} + {f}_{7} + {f}_{8} \\
+\end{align*}$$
+#+end_latex
+
+#+BEGIN_SRC python :session :results none
+def from_indices(elems, car, *cdr):
+    """Recursively resolve a nested container element via a sequence of indices."""
+    return elems[car] if len(cdr) == 0 else from_indices(elems[car], *cdr)
+
+def realize_indexed(expr, idx, values):
+    """Replace every occurrence of IndexedBase `idx` in `expr` by the concrete `values`."""
+    return expr.replace(lambda expr: expr.is_Indexed and expr.base.name == idx.name,
+                        lambda elem: from_indices(values, *elem.indices))
+#+END_SRC
+
+#+BEGIN_SRC python :session :results output :wrap latex
+def realize_u_i(D, j):
+    """Expand the j-th velocity component for the concrete lattice D."""
+    return realize_indexed(u_i_from_f(f, D.q, xi).doit().subs([(u_i, u[j]), (i, j)]), xi, D.c)
+
+D = descriptor[lattice]
+printlatexpr(realize_u_i(D, 0))
+#+END_SRC
+
+#+RESULTS:
+#+begin_latex
+$$\begin{align*}
+{u}_{0} &:= \frac{- {f}_{0} - {f}_{1} - {f}_{2} + {f}_{6} + {f}_{7} + {f}_{8}}{{f}_{0} + {f}_{1} + {f}_{2} + {f}_{3} + {f}_{4} + {f}_{5} + {f}_{6} + {f}_{7} + {f}_{8}} \\
+\end{align*}$$
+#+end_latex
+
+At this point we have everything needed to generate an optimized code snippet that
+can be used to compute density and velocity values given a set of variables containing
+a single cell's populations. As a convention we are going to prefix these /current/ population
+variables with =f_curr=.
+
+#+BEGIN_SRC python :session :results none
+def moments_code_block(D, populations):
+    """CSE-optimized code block computing rho and u from `populations`."""
+    f_rho = realize_indexed(realize_rho(D), f, populations)
+    f_u = [ realize_indexed(realize_u_i(D, i), f, populations) for i in range(D.d) ]
+    # Prefix the CSE temporaries with 'm' so they cannot clash with the
+    # temporaries of other generated blocks in the same kernel.
+    return CodeBlock(f_rho, *f_u).cse(symbols = numbered_symbols(prefix='m'), optimizations = custom_opti)
+#+END_SRC
+
+#+NAME: moments-from-f_curr
+#+BEGIN_SRC python :session :results output :cache yes
+printcode(moments_code_block(descriptor[lattice], IndexedBase('f_curr')))
+#+END_SRC
+
+#+RESULTS[11633b87f250d3c0a8e39c7091e10d2ef75b19dc]: moments-from-f_curr
+: T m0 = f_curr[1] + f_curr[2];
+: T m1 = f_curr[3] + f_curr[6];
+: T m2 = m0 + m1 + f_curr[0] + f_curr[4] + f_curr[5] + f_curr[7] + f_curr[8];
+: T m3 = f_curr[0] - f_curr[8];
+: T m4 = T{1} / (m2);
+: T rho = m2;
+: T u_0 = -m4*(m0 + m3 - f_curr[6] - f_curr[7]);
+: T u_1 = -m4*(m1 + m3 - f_curr[2] - f_curr[5]);
+
+Both the fluid dynamics implemented by collision steps and boundary conditions as well as functions
+for computing moments to e.g. visualize are implemented as GPU kernel functions. In CUDA this means
+plain functions marked by the =__global__= specifier. As most kernel share common aspects in their call
+requirements and parameter sets we wrap them in =__device__= functions of sensibly named structures.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/collect_moments.h
+#pragma once
+// NOTE(review): the include target and the template parameter lists appear
+// to have been lost during extraction (presumably "#include <LLBM/...>" and
+// "template <typename T, typename S>"); the "<>" markers are org noweb
+// references to the generated code blocks — verify against the tangled file.
+#include
+
+// Kernel functor collecting the density and velocity moments of each cell
+// into the cell_rho / cell_u output arrays.
+struct CollectMomentsF {
+
+using call_tag = tag::call_by_cell_id;
+
+template
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], std::size_t gid, T* cell_rho, T* cell_u) {
+  <>
+
+  cell_rho[gid] = rho;
+  cell_u[2*gid+0] = u_0;
+  cell_u[2*gid+1] = u_1;
+}
+
+template
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], std::size_t gid, T* cell_rho, T* cell_u) {
+  <>
+
+  cell_rho[gid] = rho;
+  cell_u[3*gid+0] = u_0;
+  cell_u[3*gid+1] = u_1;
+  cell_u[3*gid+2] = u_2;
+}
+
+};
+#+END_SRC
+
+** Equilibrium
+#+BEGIN_SRC python :session :results none
+f_eq = IndexedBase('f_eq', q)
+#+END_SRC
+
+$$f_i^\text{eq}(x,t) := \omega_i \rho \left( 1 + \frac{u \cdot \xi_i}{c_s^2} + \frac{(u \cdot \xi_i)^2}{2c_s^4} - \frac{u \cdot u}{2c_s^2} \right)$$
+
+Calculating the equilibrium distribution of some population $f_i$ requires the evaluation of inner products
+between vectors. As there doesn't seem to be a nice way of writing an abstract SymPy expression that both
+generates nice LaTeX and can be realized on a concrete lattice we skip the fully abstract step and jump
+right into the latter part.
+
+#+BEGIN_SRC python :session :results output :wrap latex
+def realize_f_eq_i(D, i):
+    """Second-order BGK equilibrium for direction i on the concrete lattice D."""
+    v = Matrix([ u[j] for j in range(D.d) ])
+    return Assignment(f_eq[i], D.w[i] * rho * ( 1
+        + D.c[i].dot(v) / D.c_s**2
+        + D.c[i].dot(v)**2 / (2*D.c_s**4)
+        - v.dot(v) / (2*D.c_s**2) ))
+
+D = descriptor[lattice]
+printlatexpr(realize_f_eq_i(D, 0))
+#+END_SRC
+
+#+RESULTS:
+#+begin_latex
+$$\begin{align*}
+{f_{eq}}_{0} &:= \frac{\rho \left(\frac{9 \left(- {u}_{0} - {u}_{1}\right)^{2}}{2} - \frac{3 {u}_{0}^{2}}{2} - 3 {u}_{0} - \frac{3 {u}_{1}^{2}}{2} - 3 {u}_{1} + 1\right)}{36} \\
+\end{align*}$$
+#+end_latex
+
+#+BEGIN_SRC python :session :results none
+def equilibrium_code_block(D):
+    """Assign the equilibria for scalars rho, u_i to the f_next array.
+
+    NOTE(review): relies on the IndexedBase f_next, which is defined in the
+    later BGK section, already existing in the session — verify the intended
+    evaluation order of the source blocks.
+    """
+    f_moment_eq = [ Assignment(f_next[i], realize_f_eq_i(D, i).rhs) for i in range(D.q) ]
+    return cse(CodeBlock(*f_moment_eq), symbol_prefix = 'e')
+#+END_SRC
+
+#+NAME: equilibrium-from-moments
+#+BEGIN_SRC python :session :results output :cache yes
+printcode(equilibrium_code_block(descriptor[lattice]))
+#+END_SRC
+
+#+RESULTS[7843a5ae8cf9ac0b63787ab4080ba2488a6cc0a7]: equilibrium-from-moments
+#+begin_example
+T e0 = T{0.0277777777777778}*rho;
+T e1 = T{3.00000000000000}*u_1;
+T e2 = T{3.00000000000000}*u_0;
+T e3 = u_0 + u_1;
+T e4 = T{4.50000000000000}*(e3*e3);
+T e5 = u_1*u_1;
+T e6 = T{1.50000000000000}*e5;
+T e7 = u_0*u_0;
+T e8 = T{1.50000000000000}*e7;
+T e9 = e8 + T{-1.00000000000000};
+T e10 = e6 + e9;
+T e11 = T{0.111111111111111}*rho;
+T e12 = -e2;
+T e13 = T{1.00000000000000} - e6;
+T e14 = e13 + T{3.00000000000000}*e7;
+T e15 = -e8;
+T e16 = e1 + e15;
+T e17 = u_0 - u_1;
+T e18 = e13 + T{4.50000000000000}*(e17*e17);
+T e19 = T{3.00000000000000}*e5;
+f_next[0] = -e0*(e1 + e10 + e2 - e4);
+f_next[1] = e11*(e12 + e14);
+f_next[2] = e0*(e12 + e16 + e18);
+f_next[3] = -e11*(e1 - e19 + e9);
+f_next[4] = -T{0.444444444444444}*e10*rho;
+f_next[5] = e11*(e16 + e19 + T{1.00000000000000});
+f_next[6] = e0*(-e1 + e15 + e18 + e2);
+f_next[7] = e11*(e14 + e2);
+f_next[8] = e0*(e13 + e16 + e2 + e4);
+#+end_example
+
+For initialization of lattice data it often makes sense to choose values that are invariant under this
+equilibrium computation. Cells initialized in this way will not change during the collision step
+and as such can be considered to model a fluid /at rest/.
+
+#+BEGIN_SRC python :session :results none
+def initialize_equilibrium(D):
+    """Assignments setting each population to its lattice weight, i.e. the
+    equilibrium for rho = 1 and u = 0 (a fluid at rest)."""
+    return [ Assignment(IndexedBase("f_next", D.q)[i], w_i) for i, w_i in enumerate(D.w) ]
+#+END_SRC
+
+#+NAME: initialize-populations
+#+BEGIN_SRC python :session :results output
+D = descriptor[lattice]
+printcode(CodeBlock(*initialize_equilibrium(D)))
+#+END_SRC
+
+#+RESULTS: initialize-populations
+: f_next[0] = T{0.0277777777777778};
+: f_next[1] = T{0.111111111111111};
+: f_next[2] = T{0.0277777777777778};
+: f_next[3] = T{0.111111111111111};
+: f_next[4] = T{0.444444444444444};
+: f_next[5] = T{0.111111111111111};
+: f_next[6] = T{0.0277777777777778};
+: f_next[7] = T{0.111111111111111};
+: f_next[8] = T{0.0277777777777778};
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/initialize.h
+#pragma once
+// NOTE(review): include target and template parameter lists appear lost in
+// extraction; "<>" is a noweb reference to the initialize-populations block.
+#include
+
+// Kernel functor writing the rest-state equilibrium (the lattice weights)
+// into f_next of every cell.
+struct InitializeO {
+
+using call_tag = tag::call_by_cell_id;
+
+template
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid) {
+  <>
+}
+
+template
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid) {
+  <>
+}
+
+};
+#+END_SRC
+
+** BGK collision
+The BGK collision operators takes a current population $f^{curr}_i$ and /relaxes/ it toward the equilibrium distribution
+$f^{eq}_i$ with some rate $\tau$. The result of this process is the new population $f^{next}_i$.
+
+#+BEGIN_SRC python :session :results none
+tau = Symbol('tau')                 # relaxation time
+f_curr = IndexedBase('f_curr', q)   # pre-collision populations
+f_next = IndexedBase('f_next', q)   # post-collision populations
+f_curr_i, f_next_i, f_eq_i = symbols('f^curr_i, f^next_i, f^eq_i')  # per-direction display symbols
+#+END_SRC
+
+#+BEGIN_SRC python :session :results output :wrap latex
+def bgk_collision(f_curr, f_next, f_eq, tau):
+    """Single-direction BGK step: relax f_curr towards f_eq with rate 1/tau."""
+    return Assignment(f_next, f_curr + 1/tau * (f_eq - f_curr))
+
+printlatexpr(bgk_collision(f_curr_i, f_next_i, f_eq_i, tau))
+#+END_SRC
+
+#+RESULTS:
+#+begin_latex
+$$\begin{align*}
+f^{next}_{i} &:= f^{curr}_{i} + \frac{- f^{curr}_{i} + f^{eq}_{i}}{\tau} \\
+\end{align*}$$
+#+end_latex
+
+As we might want to use different moment values than the ones constructed from the current population
+as the foundation for the equilibrium distribution the generated code will assume variables =rho= and =u_i=
+to exist. Building the expression tree for code generation is now as simple as instantiating the BGK
+operator for all $q$ directions and substituting the equilibrium distribution.
+
+#+BEGIN_SRC python :session :results none
+def bgk_collision_code_block(D):
+    """Full-lattice BGK collision with substituted equilibria, CSE applied.
+
+    The generated code assumes scalars rho and u_i to already be in scope.
+    """
+    f_eq_def = [ realize_f_eq_i(D, i).rhs for i in range(D.q) ]
+    f_post_collide = [ bgk_collision(f_curr[i], f_next[i], f_eq_def[i], tau) for i in range(D.q) ]
+    return CodeBlock(*f_post_collide).cse(optimizations = custom_opti)
+#+END_SRC
+
+#+NAME: bgk-collide-to-f_next
+#+BEGIN_SRC python :session :results output :cache yes
+printcode(bgk_collision_code_block(descriptor[lattice]), custom_assignment=True)
+#+END_SRC
+
+#+RESULTS[b9035596468cbdcf22f3e63f7fa43d74a6261849]: bgk-collide-to-f_next
+#+begin_example
+T x0 = T{1} / (tau);
+T x1 = T{0.0138888888888889}*x0;
+T x2 = T{6.00000000000000}*u_1;
+T x3 = T{6.00000000000000}*u_0;
+T x4 = u_0 + u_1;
+T x5 = T{9.00000000000000}*(x4*x4);
+T x6 = u_1*u_1;
+T x7 = T{3.00000000000000}*x6;
+T x8 = u_0*u_0;
+T x9 = T{3.00000000000000}*x8;
+T x10 = x9 + T{-2.00000000000000};
+T x11 = x10 + x7;
+T x12 = T{0.0555555555555556}*x0;
+T x13 = -x3;
+T x14 = T{2.00000000000000} - x7;
+T x15 = x14 + T{6.00000000000000}*x8;
+T x16 = -x9;
+T x17 = x16 + x2;
+T x18 = u_0 - u_1;
+T x19 = x14 + T{9.00000000000000}*(x18*x18);
+T x20 = T{6.00000000000000}*x6;
+f_next[0] = -x1*(rho*(x11 + x2 + x3 - x5) + T{72.0000000000000}*f_curr[0]) + f_curr[0];
+f_next[1] = x12*(rho*(x13 + x15) - T{18.0000000000000}*f_curr[1]) + f_curr[1];
+f_next[2] = x1*(rho*(x13 + x17 + x19) - T{72.0000000000000}*f_curr[2]) + f_curr[2];
+f_next[3] = -x12*(rho*(x10 + x2 - x20) + T{18.0000000000000}*f_curr[3]) + f_curr[3];
+f_next[4] = -T{0.111111111111111}*x0*(T{2.00000000000000}*rho*x11 + T{9.00000000000000}*f_curr[4]) + f_curr[4];
+f_next[5] = x12*(rho*(x17 + x20 + T{2.00000000000000}) - T{18.0000000000000}*f_curr[5]) + f_curr[5];
+f_next[6] = x1*(rho*(x16 + x19 - x2 + x3) - T{72.0000000000000}*f_curr[6]) + f_curr[6];
+f_next[7] = x12*(rho*(x15 + x3) - T{18.0000000000000}*f_curr[7]) + f_curr[7];
+f_next[8] = x1*(rho*(x14 + x17 + x3 + x5) - T{72.0000000000000}*f_curr[8]) + f_curr[8];
+#+end_example
+
+In order to call this collision kernel on the GPU we wrap it in =apply= functions of an appropriately
+named operator structure.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/collide.h
+#pragma once
+// NOTE(review): include target and template parameter lists appear lost in
+// extraction; the "<>" markers are noweb references to the moment and BGK
+// collision code blocks.
+#include
+
+// Kernel functor performing a plain BGK collision: compute moments from
+// f_curr, then relax towards equilibrium into f_next.
+struct BgkCollideO {
+
+using call_tag = tag::call_by_cell_id;
+
+template
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T tau) {
+  <>
+  <>
+}
+
+template
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T tau) {
+  <>
+  <>
+}
+
+};
+#+END_SRC
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/bulk.h
+#include "kernel/collide.h"
+#+END_SRC
+
+** Smagorinsky BGK collision
+Simulation of turbulent flow using plain BGK collisions is possible -- after all turbulence is captured by the Navier-Stokes
+equations that in turn are the target equations of our BGK-LB method -- but requires very highly resolved grids. The reason
+for this is that turbulence is characterized by the formation of eddies at both very big and very small scales. Most of the
+energy is contained in the large scale features but dissipates into heat at the finest scales. To capture turbulent flow we either
+have to resolve the grid all the way to these finest scales or implement some kind of model for the characteristics of these
+scales in a coarser grid. Computation on such a coarser grid is then also called a large eddy simulation (LES).
+
+One comparably simple model for representing the smaller eddies in such a LES is the Smagorinsky subgrid-scale model.
+This model yields an expression for computing the /effective relaxation rate/ $\tau_\text{eff}$ on a per-cell basis given the global relaxation
+time $\tau$ and a Smagorinsky constant. As the relaxation time in BGK LBM is a function of the viscosity this translates into
+computing the effective viscosity using a local strain-rate tensor reconstruction based on the non-equilibrium part of each
+cell's populations.
+
+#+BEGIN_SRC python :session :results none
+tau, smagorinsky = symbols('tau, smagorinsky')
+#+END_SRC
+
+The non-equilibrium part is simply the difference between the actual population stored in a cell and the respective
+equilibrium population that we relax towards. Using these local non-equilibrium parts to reconstruct the strain-rate
+tensor $\Pi_{i,j}^\text{neq}$ is quite convenient as we otherwise would have to employ e.g. a finite difference method just for this.
+
+#+BEGIN_SRC python :session :results none
+def pi_neq(D, f, f_eq):
+    """Non-equilibrium momentum flux tensor reconstructed from populations."""
+    pi_neq = zeros(D.d, D.d)
+    # Pi^neq_ij = sum_k xi_k,i * xi_k,j * (f_k - f^eq_k)
+    for i, j, k in multirange(D.d, D.d, D.q):
+        pi_neq[i,j] += D.c[k][i] * D.c[k][j] * (f[k] - f_eq[k])
+
+    return pi_neq
+#+END_SRC
+
+To compute the effective relaxation rate we need the norm of this strain-rate tensor.
+
+#+BEGIN_SRC python :session :results none
+def pi_neq_norm(D, f, f_eq):
+    """Norm sqrt(2 * sum_ij Pi_ij^2) of the non-equilibrium tensor."""
+    pi = pi_neq(D, f, f_eq)
+    return sqrt(2*multisum(lambda i, j: pi[i,j]**2, D.d, D.d))
+#+END_SRC
+
+#+BEGIN_SRC python :session :results none
+def effective_tau(D, f, f_eq, tau, smagorinsky):
+    """Per-cell effective relaxation time of the Smagorinsky subgrid model."""
+    pi_norm = pi_neq_norm(D, f, f_eq)
+    return tau + 0.5*(sqrt(tau**2 + 2*sqrt(2)*smagorinsky**2 * pi_norm / D.c_s**4) - tau)
+#+END_SRC
+
+Finally the resulting per-cell relaxation time is simply plugged into the existing BGK collision operator to yield the
+complete Smagorinsky BGK collision step.
+
+#+BEGIN_SRC python :session :results none
+def smagorinsky_bgk_collision_code_block(D, tau, smagorinsky):
+    """Moments, equilibria and BGK collision with per-cell effective tau, CSE applied."""
+    f_rho = realize_indexed(realize_rho(D), f, f_curr)
+    f_u = [ realize_indexed(realize_u_i(D, i), f, f_curr) for i in range(D.d) ]
+    f_eq = [ realize_f_eq_i(D, i).rhs for i in range(D.q) ]
+    # The effective relaxation time replaces the constant tau of plain BGK.
+    eff_tau = effective_tau(D, f_curr, f_eq, tau, smagorinsky)
+    f_post_collide = [ bgk_collision(f_curr[i], f_next[i], f_eq[i], eff_tau) for i in range(D.q) ]
+    return CodeBlock(f_rho, *f_u, *f_post_collide).cse(optimizations = custom_opti)
+#+END_SRC
+
+This way the BGK collisions are numerically stabilized for low resolutions and high Reynolds numbers.
+
+#+NAME: smagorinsky-bgk-collide-to-f_next
+#+BEGIN_SRC python :session :results output :cache yes
+D = descriptor[lattice]
+printcode(smagorinsky_bgk_collision_code_block(D, tau, smagorinsky))
+#+END_SRC
+
+#+RESULTS[b6cd612f0c97e3da6908e302f304b8a50516eb05]: smagorinsky-bgk-collide-to-f_next
+#+begin_example
+T x0 = f_curr[1] + f_curr[2];
+T x1 = f_curr[3] + f_curr[6];
+T x2 = x0 + x1 + f_curr[0] + f_curr[4] + f_curr[5] + f_curr[7] + f_curr[8];
+T x3 = f_curr[0] - f_curr[8];
+T x4 = T{1} / (x2);
+T x5 = T{72.0000000000000}*f_curr[2];
+T x6 = T{72.0000000000000}*f_curr[6];
+T rho = x2;
+T x31 = T{4.00000000000000}*rho;
+T x40 = T{2.00000000000000}*rho;
+T u_0 = -x4*(x0 + x3 - f_curr[6] - f_curr[7]);
+T x7 = T{6.00000000000000}*u_0;
+T x8 = -x7;
+T x15 = u_0*u_0;
+T x16 = T{3.00000000000000}*x15;
+T x17 = -x16;
+T x27 = x16 + T{-2.00000000000000};
+T u_1 = -x4*(x1 + x3 - f_curr[2] - f_curr[5]);
+T x9 = u_0 - u_1;
+T x10 = T{9.00000000000000}*(x9*x9);
+T x11 = u_1*u_1;
+T x12 = T{3.00000000000000}*x11;
+T x13 = T{2.00000000000000} - x12;
+T x14 = T{6.00000000000000}*u_1;
+T x18 = x14 + x17;
+T x19 = x13 + x18;
+T x20 = x10 + x19 + x8;
+T x21 = rho*x20;
+T x22 = x10 + x13 - x14 + x17 + x7;
+T x23 = rho*x22;
+T x24 = u_0 + u_1;
+T x25 = T{9.00000000000000}*(x24*x24);
+T x26 = x19 + x25 + x7;
+T x28 = x12 + x27;
+T x29 = x14 - x25 + x28 + x7;
+T x30 = rho*x26 - rho*x29 - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[8];
+T x32 = x13 + T{6.00000000000000}*x15;
+T x33 = x32 + x8;
+T x34 = x32 + x7;
+T x35 = x21 + x23 + x30 - x5 - x6;
+T x36 = T{6.00000000000000}*x11;
+T x37 = x14 + x27 - x36;
+T x38 = x18 + x36 + T{2.00000000000000};
+T x39 = T{1} / (tau + sqrt(T{0.707106781186548}*(smagorinsky*smagorinsky)*sqrt((-x21 - x23 + x30 + x5 + x6)*(-x21 - x23 + x30 + x5 + x6) + T{0.500000000000000}*((x31*x33 + x31*x34 + x35 - 72*f_curr[1] - 72*f_curr[7])*(x31*x33 + x31*x34 + x35 - 72*f_curr[1] - 72*f_curr[7])) + T{0.500000000000000}*((-x31*x37 + x31*x38 + x35 - 72*f_curr[3] - 72*f_curr[5])*(-x31*x37 + x31*x38 + x35 - 72*f_curr[3] - 72*f_curr[5]))) + tau*tau));
+f_next[0] = -T{0.0138888888888889}*x39*(x29*x40 + T{144.000000000000}*f_curr[0]) + f_curr[0];
+f_next[1] = T{0.0555555555555556}*x39*(x33*x40 - T{36.0000000000000}*f_curr[1]) + f_curr[1];
+f_next[2] = T{0.0138888888888889}*x39*(x20*x40 - T{144.000000000000}*f_curr[2]) + f_curr[2];
+f_next[3] = -T{0.0555555555555556}*x39*(x37*x40 + T{36.0000000000000}*f_curr[3]) + f_curr[3];
+f_next[4] = -T{0.111111111111111}*x39*(T{4.00000000000000}*rho*x28 + T{18.0000000000000}*f_curr[4]) + f_curr[4];
+f_next[5] = T{0.0555555555555556}*x39*(x38*x40 - T{36.0000000000000}*f_curr[5]) + f_curr[5];
+f_next[6] = T{0.0138888888888889}*x39*(x22*x40 - T{144.000000000000}*f_curr[6]) + f_curr[6];
+f_next[7] = T{0.0555555555555556}*x39*(x34*x40 - T{36.0000000000000}*f_curr[7]) + f_curr[7];
+f_next[8] = T{0.0138888888888889}*x39*(x26*x40 - T{144.000000000000}*f_curr[8]) + f_curr[8];
+#+end_example
+
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/smagorinsky_collide.h
+#pragma once
+#include
+
+struct SmagorinskyBgkCollideO {
+
+using call_tag = tag::call_by_cell_id;
+
+template
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T tau, T smagorinsky) {
+ <>
+}
+
+template
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T tau, T smagorinsky) {
+ <>
+}
+
+};
+#+END_SRC
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/bulk.h
+#include "kernel/smagorinsky_collide.h"
+#+END_SRC
+
+* Boundary conditions
+Real-world simulations are limited by the available computational resources. This means that we can not allocate
+an infinitely large lattice and consequently we at a minimum need to prescribe some conditions for the outer boundary
+of our finite lattice -- even in cases where we only want to simulate a fluid without any obstacles.
+In practice we commonly want to do both: Prescribe some inflow and outflow conditions as well as various boundaries
+that represent some obstacle geometry. This way we could for example create a virtual wind tunnel where fluid enters
+the domain on one side, is kept in line by smooth free slip walls, encounters some obstacle whose aerodynamic
+properties we want to investigate and exits the simulation lattice on the other side.
+** Bounce back
+To fit bounce back's reputation as the simplest LBM boundary condition we do not require any
+fancy expression trickery to generate its code. This boundary condition simply reflects back
+all populations the way they came from. As such it models a solid wall with no tangential velocity
+at the boundary.
+
+#+BEGIN_SRC python :session :results none
def bounce_back(D, populations):
    """Full-way bounce back: reflect every population back along its arrival direction.

    Post-boundary population i receives the population that travelled along
    the opposite discrete velocity -c_i, modelling a resting solid wall.
    """
    assignments = []
    for i, c_i in enumerate(D.c):
        opposite = D.c.index(-c_i)
        assignments.append(Assignment(f_next[i], populations[opposite]))
    return assignments
+#+END_SRC
+
+#+NAME: bounce-back-full-way
+#+BEGIN_SRC python :session :results output
+D = descriptor[lattice]
+printcode(CodeBlock(*bounce_back(D, IndexedBase('f_curr', D.q))))
+#+END_SRC
+
+#+RESULTS: bounce-back-full-way
+: f_next[0] = f_curr[8];
+: f_next[1] = f_curr[7];
+: f_next[2] = f_curr[6];
+: f_next[3] = f_curr[5];
+: f_next[4] = f_curr[4];
+: f_next[5] = f_curr[3];
+: f_next[6] = f_curr[2];
+: f_next[7] = f_curr[1];
+: f_next[8] = f_curr[0];
+
+If this is used to model the walls of a simple pipe setup we will observe the well known Poiseuille velocity profile.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/bounce_back.h
+#pragma once
+#include
+
+struct BounceBackO {
+
+using call_tag = tag::call_by_cell_id;
+
+template
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid) {
+ <>
+}
+
+template
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid) {
+ <>
+}
+
+};
+#+END_SRC
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/boundary.h
+#include "kernel/bounce_back.h"
+#+END_SRC
+
+** Moving wall bounce back
+Modeling solid unmoving obstacles using e.g. bounce back is nice enough but in practice
+we commonly want to induce some kind of movement in our fluids. While this can be done
+by e.g. prescribing an inflow velocity using [[*Prescribed equilibrium][prescribed equilibrium boundaries]], bounce
+back can be modified to represent not just a solid wall but also some momentum exerted
+on the fluid.
+
+#+BEGIN_SRC python :session :results none
def moving_wall_correction(D, i):
    """Momentum term added to bounce back for a wall moving with symbolic velocity u.

    Builds 2 w_j / c_s^2 * -(c_i . u) where j is the index of the velocity
    opposite to c_i and u_0..u_{d-1} are plain wall-velocity symbols.
    """
    u_wall = Matrix(symarray('u', D.d))
    opposite = D.c.index(-D.c[i])
    return 2 * D.w[opposite] / D.c_s**2 * -D.c[i].dot(u_wall)
+#+END_SRC
+
+The simplest way of incorporating such movement into bounce back is to add the velocity
+components tangential to the respective population's direction.
+
+#+BEGIN_SRC python :session :results none
def moving_wall_bounce_back(D, populations):
    """Bounce back augmented by a per-population moving-wall momentum correction."""
    corrected = []
    for i, reflected in enumerate(bounce_back(D, populations)):
        correction = moving_wall_correction(D, i)
        corrected.append(Assignment(reflected.lhs, reflected.rhs - correction))
    return corrected
+#+END_SRC
+
+#+NAME: bounce-back-full-way-moving-wall
+#+BEGIN_SRC python :session :results output
+D = descriptor[lattice]
+printcode(CodeBlock(*moving_wall_bounce_back(D, IndexedBase('f_curr', D.q))))
+#+END_SRC
+
+#+RESULTS: bounce-back-full-way-moving-wall
+: f_next[0] = -T{0.166666666666667}*u_0 - T{0.166666666666667}*u_1 + f_curr[8];
+: f_next[1] = -T{0.666666666666667}*u_0 + f_curr[7];
+: f_next[2] = -T{0.166666666666667}*u_0 + T{0.166666666666667}*u_1 + f_curr[6];
+: f_next[3] = -T{0.666666666666667}*u_1 + f_curr[5];
+: f_next[4] = f_curr[4];
+: f_next[5] = T{0.666666666666667}*u_1 + f_curr[3];
+: f_next[6] = T{0.166666666666667}*u_0 - T{0.166666666666667}*u_1 + f_curr[2];
+: f_next[7] = T{0.666666666666667}*u_0 + f_curr[1];
+: f_next[8] = T{0.166666666666667}*u_0 + T{0.166666666666667}*u_1 + f_curr[0];
+
+Strictly speaking this should only be used to model tangentially moving walls (such as in the
+[[*Lid driven cavity][lid driven cavity]] example). More complex situations are possible but require boundary
+conditions to e.g. track the position of obstacles during the simulation.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/bounce_back_moving_wall.h
+#pragma once
+#include
+
+struct BounceBackMovingWallO {
+
+using call_tag = tag::call_by_cell_id;
+
+template
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T u_0, T u_1) {
+ <>
+}
+
+template
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T u_0, T u_1, T u_2) {
+ <>
+}
+
+};
+#+END_SRC
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/boundary.h
+#include "kernel/bounce_back_moving_wall.h"
+#+END_SRC
+
+** Free slip boundary
+This is another special case of the bounce back boundaries where populations are reflected
+specularly with respect to a given normal vector instead of simply bouncing them back
+the way they came from.
+
+#+BEGIN_SRC python :session :results none
def bounce_back_free_slip(D, populations, n):
    """Specular reflection of all populations with respect to wall normal n."""
    def reflected_index(c_i):
        # c' = c_i - 2 (c_i . n) n, looked up in the descriptor's velocity set
        return D.c.index(2*n.dot(-c_i)*n + c_i)
    return [ Assignment(f_next[i], populations[reflected_index(c_i)])
             for i, c_i in enumerate(D.c) ]
+#+END_SRC
+
+#+NAME: bounce-back-full-way-specular-reflection
+#+BEGIN_SRC python :session :results output :var normal='(0 1)
+D = descriptor[lattice]
+printcode(CodeBlock(*bounce_back_free_slip(D, IndexedBase('f_curr', D.q), Matrix(normal))))
+#+END_SRC
+
+#+RESULTS: bounce-back-full-way-specular-reflection
+: f_next[0] = f_curr[2];
+: f_next[1] = f_curr[1];
+: f_next[2] = f_curr[0];
+: f_next[3] = f_curr[5];
+: f_next[4] = f_curr[4];
+: f_next[5] = f_curr[3];
+: f_next[6] = f_curr[8];
+: f_next[7] = f_curr[7];
+: f_next[8] = f_curr[6];
+
+Such a boundary condition is able to represent non-zero tangential /free slip/ velocities.
+The mapping between pre- and post-collision velocities is of course specific to each
+wall normal. We use tag dispatching to allow the user to select which kind of wall
+each boundary condition call represents.
+
+#+BEGIN_SRC cpp :eval no :main no :tangle tangle/LLBM/wall.h
+#pragma once
+
+template
+struct WallNormal { };
+#+END_SRC
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/free_slip.h
+#pragma once
+#include
+#include
+#include
+
+struct BounceBackFreeSlipO {
+
+using call_tag = tag::call_by_cell_id;
+
+template
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, WallNormal<1,0>) {
+ <>
+}
+
+template
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, WallNormal<0,1,0>) {
+ <>
+}
+
+};
+#+END_SRC
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/boundary.h
+#include "kernel/free_slip.h"
+#+END_SRC
+
+** Interpolated bounce back
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/bouzidi.h
+#pragma once
+#include
+#include
+
+<>
+#+END_SRC
+
+Given a precomputed distance factor =q= we can compute an interpolated bounce back
+boundary.
+
+$$\begin{align*}
+f_i(x_f,t+\delta t) &= 2q f_j(x_f,t) + (1-2q) f_j(x_{f} + \delta x \xi_i,t) && q \leq \frac{1}{2} \\
+f_i(x_f,t+\delta t) &= \frac{1}{2q}f_j(x_f,t) + \left(1 - \frac{1}{2q}\right) f_i(x_f,t) && q > \frac{1}{2}
+\end{align*}$$
+
+Note that the case distinction can be unified into a single case by precomputing
+distance and wall velocity correction factors.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/bouzidi.h
+struct BouzidiO {
+
+using call_tag = tag::call_by_list_index;
+
+template
+__device__ static void apply(
+ LatticeView lattice
+ , std::size_t index
+ , std::size_t count
+ , BouzidiConfig config
+) {
+ pop_index_t& iPop = config.missing[index];
+ pop_index_t jPop = descriptor::opposite(iPop);
+ pop_index_t kPop = config.boundary[index] == config.fluid[index] ? iPop : jPop;
+
+ S f_bound_j = *lattice.pop(jPop, config.boundary[index]);
+ S f_fluid_j = *lattice.pop(kPop, config.fluid[index]);
+ S* f_next_i = lattice.pop(iPop, config.solid[index]);
+
+ *f_next_i = config.distance[index] * f_bound_j
+ + (1. - config.distance[index]) * f_fluid_j
+ + config.correction[index];
+}
+
+};
+#+END_SRC
+
+The cells locations $x_f$, $x_f + \xi_i$ in addition to distance factors $q$, velocity corrections and the
+missing population index to be reconstructed are stored in a =BouzidiConfig=
+structure. This simplifies passing of all relevant data to the GPU kernel.
+
+#+NAME: bouzidi-config
+#+BEGIN_SRC cpp
+template
+struct BouzidiConfig {
+ std::size_t* boundary; // boundary cell to be interpolated
+ std::size_t* solid; // adjacent solid cell
+ std::size_t* fluid; // adjacent fluid cell
+ S* distance; // precomputed distance factor q
+ S* correction; // correction for moving walls
+ pop_index_t* missing; // population to be reconstructed
+};
+#+END_SRC
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/boundary.h
+#include "kernel/bouzidi.h"
+#+END_SRC
+
+** Prescribed equilibrium
+One way of modeling an open boundary of our simulation domain is to prescribe either the velocity or the density at the wall cell.
+To realize this prescription we have to set the missing populations accordingly. The simplest way to do that is to set all populations
+of the wall cell to the equilibrium values given by velocity and density.
+i.e. we have to recover the density if we are given the wall-normal velocity and vice versa.
+
+To do this we will use SymPy for solving the missing moment for a set of unknown populations and the prescribed boundary
+condition. As the =solve= function doesn't seem to work with the =Indexed= type we used to represent the population values we
+need helper methods for converting between indexed symbols and an array of plain symbols.
+
+#+BEGIN_SRC python :session :results none
def replace_symarray_with_indexed(expr, arr, idx):
    """Substitute plain symbols from *arr* by their Indexed counterparts in *idx*.

    Every occurrence of the symbol arr[i] inside *expr* is replaced by idx[i].
    The symbol array is materialized into a list exactly once here; the
    original version rebuilt ``list(arr)`` inside both callbacks, i.e. once
    per visited subexpression and again per match.
    """
    plain = list(arr)
    return expr.replace(lambda e: e.is_Symbol and e in plain,
                        lambda s: idx[plain.index(s)])
+#+END_SRC
+
+The prescribed and recovered moments will get an underscore =w= to distinguish them from normal population moments.
+
+#+BEGIN_SRC python :session :results none
+rho_w, u_w = symbols('rho_w, u_w')
+#+END_SRC
+
+As we have four respectively six possible axis-orthogonal inflow walls we want to package the rho solution into a reusable
+function that takes the wall normal as input.
+
+*** Velocity boundary
+#+BEGIN_SRC python :session :results none
def recover_rho_w(D, c_w):
    """Symbolically recover the wall density rho_w for a velocity boundary.

    Given an axis-aligned wall normal c_w and a prescribed wall-normal
    velocity u_w, the boundary-incoming populations are unknown. Their sum
    is expressed once via the density definition and once via the momentum
    definition; equating both characterizations and solving yields rho_w
    in terms of the known populations only.

    Returns an Assignment of rho_w to that closed-form expression using
    Indexed population accesses.
    """
    # solve() does not cope with Indexed, so work on plain symbols f_0..f_{q-1}.
    f_raw = symarray('f', D.q)

    # The first non-zero component of c_w identifies the wall-normal axis.
    wall_normal_idx = next(i for i, c_w_i in enumerate(c_w) if c_w_i != 0)

    f_raw_rho = realize_indexed(realize_rho(D), f, f_raw)
    f_raw_u = [ realize_indexed(realize_u_i(D, i), f, f_raw) for i in range(D.d) ]

    rho_w_def = Eq(rho_w, f_raw_rho.rhs.doit())

    # Populations streaming along the wall normal are the unknown ones.
    missing_c = filter(lambda c_i: c_i[wall_normal_idx] != 0 and c_i[wall_normal_idx] == c_w[wall_normal_idx], D.c)
    missing_pops = [ f_raw[i] for i in [ D.c.index(c_i) for c_i in missing_c ] ]

    # Express the sum of missing populations via the density definition ...
    missing_pops_given_by_rho_w = solve(rho_w_def, sum(missing_pops))
    missing_pops_given_by_rho_w = next(s for s in missing_pops_given_by_rho_w if s != 0)

    # ... and via the wall-normal momentum rho_w * u_w.
    u_w_def = Eq(rho_w * u_w, rho_w_def.rhs * f_raw_u[wall_normal_idx].rhs)
    missing_pops_given_by_u_w = solve(u_w_def, sum(missing_pops))
    missing_pops_given_by_u_w = next(s for s in missing_pops_given_by_u_w if s != 0)

    # Both describe the same unknown sum; equate them and solve for rho_w.
    missing_pops_solution = solve(Eq(missing_pops_given_by_rho_w, missing_pops_given_by_u_w), rho_w, minimal=True)
    missing_pops_solution = next(s for s in missing_pops_solution if s != 0)

    return Assignment(rho_w, replace_symarray_with_indexed(missing_pops_solution, f_raw, f))
+#+END_SRC
+
+This function simply constructs two definitions for the set of missing populations using either the wall velocity or the value of rho.
+As these definitions must be equal in a valid system we can solve them for the desired reconstruction of rho.
+
+#+BEGIN_SRC python :session :results output :wrap latex
+D = descriptor[lattice]
+printlatexpr(recover_rho_w(D, [0,1]))
+#+END_SRC
+
+#+RESULTS:
+#+begin_latex
+$$\begin{align*}
+\rho_{w} &:= - \frac{2 {f}_{0} + {f}_{1} + 2 {f}_{3} + {f}_{4} + 2 {f}_{6} + {f}_{7}}{u_{w} - 1} \\
+\end{align*}$$
+#+end_latex
+
+#+BEGIN_SRC python :session :results none
def recover_rho_code_block(D, populations, normal):
    """CodeBlock that recovers the local density given a prescribed wall velocity."""
    recovered = recover_rho_w(D, normal).subs(rho_w, rho)
    realized = realize_indexed(recovered, f, populations)
    return CodeBlock(realized)
+#+END_SRC
+
+#+NAME: recover-rho-using-wall-velocity
+#+BEGIN_SRC python :session :results output :var normal='(1 0)
+D = descriptor[lattice]
+printcode(recover_rho_code_block(D, IndexedBase('f_curr', q), normal))
+#+END_SRC
+
+#+RESULTS: recover-rho-using-wall-velocity
+: T rho = -(T{2.00000000000000}*f_curr[0] + T{2.00000000000000}*f_curr[1] + T{2.00000000000000}*f_curr[2] + f_curr[3] + f_curr[4] + f_curr[5])/(u_w + T{-1.00000000000000});
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/equilibrium_velocity_wall.h
+#pragma once
+#include
+#include
+#include
+
+struct EquilibriumVelocityWallO {
+
+using call_tag = tag::call_by_cell_id;
+
+template
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T u_w, WallNormal<1,0>) {
+ <>
+ T u_0 = u_w;
+ T u_1 = 0;
+ <>
+}
+
+template
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T u_w, WallNormal<1,0,0>) {
+ <>
+ T u_0 = u_w;
+ T u_1 = 0;
+ T u_2 = 0;
+ <>
+}
+
+};
+#+END_SRC
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/boundary.h
+#include "kernel/equilibrium_velocity_wall.h"
+#+END_SRC
+
+*** Density boundary
+#+BEGIN_SRC python :session :results none
def recover_u_w(D, c_w):
    """Symbolically recover the wall-normal velocity u_w for a density boundary.

    Counterpart of recover_rho_w: here the wall density rho_w is prescribed
    and the wall-normal velocity is solved for instead. Returns an
    Assignment of u_w to an expression in rho_w and the known populations.
    """
    # solve() does not cope with Indexed, so work on plain symbols f_0..f_{q-1}.
    f_raw = symarray('f', D.q)

    # The first non-zero component of c_w identifies the wall-normal axis.
    wall_normal_idx = next(i for i, c_w_i in enumerate(c_w) if c_w_i != 0)

    f_raw_rho = realize_indexed(realize_rho(D), f, f_raw)
    f_raw_u = realize_indexed(realize_u_i(D, wall_normal_idx), f, f_raw)

    rho_w_def = Eq(rho_w, f_raw_rho.rhs.doit())

    # Populations streaming along the wall normal are the unknown ones.
    missing_c = list(filter(lambda c_i: c_i[wall_normal_idx] != 0 and c_i[wall_normal_idx] == c_w[wall_normal_idx], D.c))
    missing_pops = [ f_raw[i] for i in [ D.c.index(c_i) for c_i in missing_c ] ]

    # Express the sum of missing populations via the density definition ...
    missing_pops_given_by_rho_w = solve(rho_w_def, sum(missing_pops))
    missing_pops_given_by_rho_w = next(s for s in missing_pops_given_by_rho_w if s != 0)

    # ... and via the wall-normal momentum rho_w * u_w.
    u_w_def = Eq(rho_w * u_w, rho_w_def.rhs * f_raw_u.rhs)
    missing_pops_given_by_u_w = solve(u_w_def, sum(missing_pops))
    missing_pops_given_by_u_w = next(s for s in missing_pops_given_by_u_w if s != 0)

    # Both describe the same unknown sum; equate them and solve for u_w.
    missing_pops_solution = solve(Eq(missing_pops_given_by_rho_w, missing_pops_given_by_u_w), u_w, minimal=True)
    missing_pops_solution = next(s for s in missing_pops_solution if s != 0)

    return Assignment(u_w, replace_symarray_with_indexed(missing_pops_solution, f_raw, f))
+#+END_SRC
+
+The only difference between this function and the previous one is that we solve for the wall-normal velocity instead of for the wall density.
+
+#+BEGIN_SRC python :session :results output :wrap latex
+D = descriptor[lattice]
+printlatexpr(recover_u_w(D, [1,0]))
+#+END_SRC
+
+#+RESULTS:
+#+begin_latex
+$$\begin{align*}
+u_{w} &:= \frac{\rho_{w} - 2 {f}_{0} - 2 {f}_{1} - 2 {f}_{2} - {f}_{3} - {f}_{4} - {f}_{5}}{\rho_{w}} \\
+\end{align*}$$
+#+end_latex
+
+#+BEGIN_SRC python :session :results none
def recover_u_code_block(D, populations, normal):
    """CodeBlock that recovers the wall-normal velocity given a prescribed density."""
    recovered = recover_u_w(D, normal).subs(u_w, Symbol('u'))
    return CodeBlock(realize_indexed(recovered, f, populations))
+#+END_SRC
+
+#+NAME: recover-u-using-wall-density
+#+BEGIN_SRC python :session :results output :var normal='(0 1)
+D = descriptor[lattice]
+printcode(recover_u_code_block(D, IndexedBase('f_curr', q), normal))
+#+END_SRC
+
+#+RESULTS: recover-u-using-wall-density
+: T u = (rho_w - T{2.00000000000000}*f_curr[0] - f_curr[1] - T{2.00000000000000}*f_curr[3] - f_curr[4] - T{2.00000000000000}*f_curr[6] - f_curr[7])/rho_w;
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/equilibrium_density_wall.h
+#pragma once
+#include
+#include
+#include
+
+struct EquilibriumDensityWallO {
+
+using call_tag = tag::call_by_cell_id;
+
+template
+__device__ static void apply(descriptor::D2Q9, S f_curr[9], S f_next[9], std::size_t gid, T rho_w, WallNormal<1,0>) {
+ <>
+ T rho = rho_w;
+ T u_0 = u;
+ T u_1 = 0.;
+ <>
+}
+
+template
+__device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std::size_t gid, T rho_w, WallNormal<1,0,0>) {
+ <>
+ T rho = rho_w;
+ T u_0 = u;
+ T u_1 = 0.;
+ T u_2 = 0.;
+ <>
+}
+
+};
+#+END_SRC
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/boundary.h
+#include "kernel/equilibrium_density_wall.h"
+#+END_SRC
+
+* Propagation pattern
+Up until now the symbolic expressions and the generated code did not explicitly implement the
+second essential part of the LBM algorithm: propagation. Rather the propagation was modelled
+abstractly by reading from some population =f_curr= and writing to another population =f_next=.
+To remedy this we will now describe how =f_curr= and =f_next= are actually represented in memory.
+This representation will then allow /implicit/ propagation by changing the pointers that are used
+to access it.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+#pragma once
+
+#include "memory.h"
+#include "descriptor.h"
+#include "kernel/propagate.h"
+
+#include
+#+END_SRC
+
+Our code employs the /Periodic Shift (PS)/ propagation pattern to perform the streaming step of
+the LB algorithm. This pattern uses a /Structure of Arrays/ data layout for the populations where
+each individual array is viewed as cyclic. The Sweep space filling curve is used as the bijection
+between these one dimensional arrays and spatial cell locations.
+As the distance between any two cells along some fixed vector is invariant of the specific cell
+locations propagation is equivalent to rotating the population arrays. Such rotation can be
+implemented without data transfer by shifting the start pointers in a control structure.
+
+The control structure describes the mapping between cells and memory locations for
+a specific point in time. We group all necessary data into a /LatticeView/ structure.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+template
+struct LatticeView {
+ const descriptor::Cuboid cuboid;
+ S** population;
+
+ __device__ __forceinline__
+ S* pop(pop_index_t iPop, std::size_t gid) const;
+};
+#+END_SRC
+
+This lightweight structure will be passed by-value to any kernel functions and is the only way for
+collision operators, functors and boundary conditions to access lattice data.
+
+** Memory
+As the population memory layout and propagation algorithm are codependent we implement
+them in a single /CyclicPopulationBuffer/ class. This class will manage the \(q\) individual device-side
+cyclic arrays together with their control structure.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+template
+class CyclicPopulationBuffer {
+protected:
+ const descriptor::Cuboid _cuboid;
+
+ const std::size_t _page_size;
+ const std::size_t _volume;
+
+ CUmemGenericAllocationHandle _handle[DESCRIPTOR::q];
+ CUmemAllocationProp _prop{};
+ CUmemAccessDesc _access{};
+ CUdeviceptr _ptr;
+
+ SharedVector _base;
+ SharedVector _population;
+
+ S* device() {
+ return reinterpret_cast(_ptr);
+ }
+
+public:
+ CyclicPopulationBuffer(descriptor::Cuboid cuboid);
+
+ LatticeView view() {
+ return LatticeView{ _cuboid, _population.device() };
+ }
+
+ void stream();
+
+};
+#+END_SRC
+
+In order to enable rotation of cyclic arrays by shifting only the start pointer in /LatticeView/ we need
+to perform the index wrapping at the end of the physical array as efficiently as possible. It turns
+out that this can be done at virtually no cost by using the in-hardware virtual address translation
+logic. Doing so requires the array sizes to be exact multiples of the device page size.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+std::size_t getDevicePageSize(int device_id=-1) {
+ if (device_id == -1) {
+ cudaGetDevice(&device_id);
+ }
+ std::size_t granularity = 0;
+ CUmemAllocationProp prop = {};
+ prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+ prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ prop.location.id = device_id;
+ cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+ return granularity;
+}
+#+END_SRC
+
+The concrete page size value which is 2 MiB on current Nvidia GPUs can now be used to round the
+in-memory size of each population array to the nearest page boundary.
+
+** Initialization
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+template
+CyclicPopulationBuffer::CyclicPopulationBuffer(
+ descriptor::Cuboid cuboid):
+ _cuboid(cuboid),
+ _page_size{getDevicePageSize()},
+ _volume{((cuboid.volume * sizeof(S) - 1) / _page_size + 1) * _page_size},
+ _base(DESCRIPTOR::q),
+ _population(DESCRIPTOR::q)
+{
+#+END_SRC
+
+After calculating the page-aligned memory size and constructing two vectors of population
+pointers for the control structure we are ready to place two consecutive views of the same
+physical array in virtual memory.
+
+To do this we first need to know which device is currently selected.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+ int device_id = -1;
+ cudaGetDevice(&device_id);
+#+END_SRC
+
+Using this device ID, a device-pinned address area large enough to fit two full views of the
+lattice can be reserved.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+ _prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+ _prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ _prop.location.id = device_id;
+ cuMemAddressReserve(&_ptr, 2 * _volume * DESCRIPTOR::q, 0, 0, 0);
+#+END_SRC
+
+The /structure of cyclic arrays/ required for our chosen propagation pattern is then
+mapped into this address area.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+ for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+ // per-population handle until cuMemMap accepts non-zero offset
+ cuMemCreate(&_handle[iPop], _volume, &_prop, 0);
+ cuMemMap(_ptr + iPop * 2 * _volume, _volume, 0, _handle[iPop], 0);
+ cuMemMap(_ptr + iPop * 2 * _volume + _volume, _volume, 0, _handle[iPop], 0);
+ }
+#+END_SRC
+
+Actually reading from and writing to locations within this memory depends on setting
+the correct access flags. Once this is done we are ready to zero-initialize the buffer.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+ _access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ _access.location.id = 0;
+ _access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+ cuMemSetAccess(_ptr, 2 * _volume * DESCRIPTOR::q, &_access, 1);
+ cuMemsetD8(_ptr, 0, 2 * _volume * DESCRIPTOR::q);
+#+END_SRC
+
+As the rotation of the cyclic arrays is to be realized by shifting the per-population start pointers
+we also need to store those somewhere.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+ for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+ _base[iPop] = device() + iPop * 2 * (_volume / sizeof(S));
+ _population[iPop] = _base[iPop] + iPop * ((_volume / sizeof(S)) / DESCRIPTOR::q);
+ }
+
+ _base.syncDeviceFromHost();
+ _population.syncDeviceFromHost();
+}
+#+END_SRC
+
+** Access
+The common interface of most of our GPU kernels is to accept an array of current populations and
+write the new populations to another array. This way we can control where the populations are read
+from and stored to at a central location.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h
+template
+__device__ __forceinline__
+S* LatticeView::pop(pop_index_t iPop, std::size_t gid) const {
+ return population[iPop] + gid;
+}
+#+END_SRC
+
+In practice a slight performance improvement can be observed on some GPUs when only evaluating
+this addition once per-kernel and caching the resulting locations.
+
+#+NAME: read-f-curr
+#+BEGIN_SRC cpp
+S* preshifted_f[DESCRIPTOR::q];
+for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+ preshifted_f[iPop] = lattice.pop(iPop, gid);
+ f_curr[iPop] = *preshifted_f[iPop];
+}
+#+END_SRC
+
+At this point the various kernel functions can execute a generic operator on a cell's
+populations without knowing anything about where the cell data is stored.
+
+The preshifted pointers are then reused to perform the store operations after
+the generic operator implementation is done with its work.
+
+#+NAME: write-f-next
+#+BEGIN_SRC cpp
+for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+ *preshifted_f[iPop] = f_next[iPop];
+}
+#+END_SRC
+
+** Update
+#+BEGIN_SRC cpp :tangle tangle/LLBM/propagate.h :noweb no
+template
+void CyclicPopulationBuffer::stream() {
+ propagate<<<1,1>>>(view(), _base.device(), _volume / sizeof(S));
+}
+#+END_SRC
+
+The =propagate= kernel shifts the start pointers of each population array by the respective
+discrete velocity offset and performs wrapping if necessary. This ensures that the actual
+population accesses are always presented a contiguous view of the full array.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/propagate.h
+#pragma once
+
+template
+class LatticeView;
+
+template
+__global__ void propagate(LatticeView lattice, S** base, std::size_t size) {
+#+END_SRC
+
+It is very important to use the correct types when doing pointer arithmetic.
+
+Rotation is performed by shifting the start position of each population array by the invariant
+neighborhood distance given by its discrete velocity vector. As this operation can cross the
+array boundaries special care has to be taken in wrapping these invalid positions back into
+the array.
+
+#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/propagate.h
+ for (unsigned iPop=0; iPop < DESCRIPTOR::q; ++iPop) {
+ std::ptrdiff_t shift = -descriptor::offset(lattice.cuboid, iP