From a92271176a19e06611099c0eccc4e6a6887f4915 Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Mon, 17 May 2021 00:30:13 +0200
Subject: Extract public version of SweepLB

---
 .editorconfig                        |   2 +
 CMakeLists.txt                       |  72 +++++++
 box.cpp                              |  73 +++++++
 channel.cpp                          |  87 ++++++++
 ldc.cpp                              |  76 +++++++
 shell.nix                            |  15 ++
 src/LLBM/bounce_back.h               |  30 +++
 src/LLBM/bounce_back_moving_wall.h   |  30 +++
 src/LLBM/collect_moments.h           |  25 +++
 src/LLBM/collide.h                   |  89 ++++++++
 src/LLBM/equilibrium_density_wall.h  | 144 +++++++++++++
 src/LLBM/equilibrium_velocity_wall.h | 144 +++++++++++++
 src/LLBM/initialize.h                |  53 +++++
 src/LLBM/smagorinsky_collide.h       | 129 ++++++++++++
 src/LLBM/wall.h                      |   4 +
 src/concepts.h                       |  22 ++
 src/cuboid.h                         |  37 ++++
 src/export.h                         |  59 ++++++
 src/lattice.h                        | 267 ++++++++++++++++++++++++
 src/operator.h                       |  41 ++++
 src/pattern/all.h                    |   5 +
 src/pattern/none.h                   |  48 +++++
 src/pattern/ps.h                     | 123 ++++++++++++
 src/pattern/sss.h                    |  63 ++++++
 src/population.h                     |  41 ++++
 src/propagation.h                    |  28 +++
 src/simd/256.h                       | 379 +++++++++++++++++++++++++++++++++++
 src/simd/512.h                       | 339 +++++++++++++++++++++++++++++++
 28 files changed, 2425 insertions(+)
 create mode 100644 .editorconfig
 create mode 100644 CMakeLists.txt
 create mode 100644 box.cpp
 create mode 100644 channel.cpp
 create mode 100644 ldc.cpp
 create mode 100644 shell.nix
 create mode 100644 src/LLBM/bounce_back.h
 create mode 100644 src/LLBM/bounce_back_moving_wall.h
 create mode 100644 src/LLBM/collect_moments.h
 create mode 100644 src/LLBM/collide.h
 create mode 100644 src/LLBM/equilibrium_density_wall.h
 create mode 100644 src/LLBM/equilibrium_velocity_wall.h
 create mode 100644 src/LLBM/initialize.h
 create mode 100644 src/LLBM/smagorinsky_collide.h
 create mode 100644 src/LLBM/wall.h
 create mode 100644 src/concepts.h
 create mode 100644 src/cuboid.h
 create mode 100644 src/export.h
 create mode 100644 src/lattice.h
 create mode 100644 src/operator.h
 create mode 100644 src/pattern/all.h
 create mode 100644 src/pattern/none.h
 create mode 100644 src/pattern/ps.h
 create mode 100644 src/pattern/sss.h
 create mode 100644 src/population.h
 create mode 100644 src/propagation.h
 create mode 100644 src/simd/256.h
 create mode 100644 src/simd/512.h

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..0b225af
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,2 @@
+[*]
+indent_style = tab
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..c5b383b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,72 @@
+cmake_minimum_required(VERSION 3.10)
+project(SweepLB LANGUAGES CXX)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+set(SIMD_MODE "AVX256" CACHE STRING "Generate either AVX256 or AVX512 instructions")
+option(USE_SIMD_CELL_LIST "Generate gather / scatter instructions for cell list processing" OFF)
+
+if(NOT SIMD_MODE)
+	set(SIMD_MODE "AVX256")
+endif()
+
+set(CXX_FLAGS "-O3 -march=native -mtune=native")
+
+if(${SIMD_MODE} MATCHES "AVX256")
+	message("Selecting AVX256")
+	set(SIMD_MODE_FLAGS "-mavx2")
+elseif(${SIMD_MODE} MATCHES "AVX512")
+	message("Selecting AVX512")
+	add_compile_definitions(ENABLE_AVX512)
+	set(SIMD_MODE_FLAGS "-mavx512f -mavx512dq")
+else()
+  message(FATAL_ERROR "Invalid SIMD mode, choose either AVX256 or AVX512")
+endif()
+
+if(USE_SIMD_CELL_LIST)
+  message("Enabling SIMD cell list processing.")
+  add_compile_definitions(SIMD_CELL_LIST)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CXX_FLAGS} ${SIMD_MODE_FLAGS}")
+
+message("Using flags \"${CMAKE_CXX_FLAGS}\"")
+
+include_directories(
+  src/
+)
+
+find_package(OpenMP)
+
+set(EXAMPLES ldc box channel)
+set(PRECISIONS float double)
+set(PATTERNS sss ps)
+
+foreach(example IN LISTS EXAMPLES)
+	foreach(precision IN LISTS PRECISIONS)
+		foreach(pattern IN LISTS PATTERNS)
+			set(generated_name ${example}_${precision}_${pattern})
+			string(TOUPPER ${pattern} pattern)
+			add_executable(${generated_name} ${example}.cpp)
+			target_compile_definitions(${generated_name}
+				PRIVATE
+				SWEEPLB_PRECISION=${precision}
+				SWEEPLB_PATTERN=${pattern})
+			target_compile_features(${generated_name}
+				PRIVATE cxx_std_20)
+			target_link_libraries(
+				${generated_name}
+				PUBLIC
+				  rt)
+			if(OpenMP_CXX_FOUND)
+				target_link_libraries(
+					${generated_name}
+					PUBLIC
+					  OpenMP::OpenMP_CXX
+				)
+			endif()
+	  endforeach()
+  endforeach()
+endforeach()
diff --git a/box.cpp b/box.cpp
new file mode 100644
index 0000000..ecb3296
--- /dev/null
+++ b/box.cpp
@@ -0,0 +1,73 @@
+#include "lattice.h"
+
+#include "LLBM/collide.h"
+#include "LLBM/initialize.h"
+#include "LLBM/bounce_back.h"
+
+#include <chrono>
+#include <iostream>
+
+#include "pattern/all.h"
+
+using T = SWEEPLB_PRECISION;
+using PATTERN = pattern::SWEEPLB_PATTERN<T>;
+
+void simulate(Cuboid cuboid, std::size_t nStep) {
+	const int nThread = omp_get_max_threads();
+
+	Lattice<PATTERN> lattice(cuboid);
+
+	std::vector<simd::Pack<T>::index_t> discontinuity;
+
+	LatticeMask<T> bulk_mask(lattice.volume());
+	LatticeMask<T> wall_mask(lattice.volume());
+
+	cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) {
+		if (std::abs(iX - cuboid[0]/2) < cuboid[0]/16 &&
+		    std::abs(iY - cuboid[1]/2) < cuboid[1]/16 &&
+		    std::abs(iZ - cuboid[2]/2) < cuboid[2]/16) {
+			discontinuity.emplace_back(iCell);
+		}
+		if (   iX > 0 && iX < cuboid[0]-1
+		    && iY > 0 && iY < cuboid[1]-1
+		    && iZ > 0 && iZ < cuboid[2]-1) {
+			bulk_mask.set(iCell, true);
+		} else {
+			wall_mask.set(iCell, true);
+		}
+	});
+
+	bulk_mask.serialize();
+	wall_mask.serialize();
+
+	lattice.apply<InitializeO>(discontinuity, 2.0);
+
+	T tau = 0.56;
+
+	auto start = std::chrono::steady_clock::now();
+
+	for (std::size_t iStep = 0; iStep < nStep; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), wall_mask));
+		lattice.stream();
+	}
+
+	auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(
+		std::chrono::steady_clock::now() - start);
+
+	std::cout << cuboid[0]
+	          << ", " << nStep
+	          << ", " << nThread
+	          << ", " << (nStep * lattice.volume()) / (1e6 * duration.count())
+	          << std::endl;
+
+	lattice.write_momenta(bulk_mask, "result.vtk");
+}
+
+int main(int argc, char* argv[]) {
+	const std::size_t nX     = atoi(argv[1]);
+	const std::size_t nY     = atoi(argv[2]);
+	const std::size_t nZ     = atoi(argv[3]);
+	const std::size_t steps = atoi(argv[4]);
+	simulate({ nX, nY, nZ }, steps);
+}
diff --git a/channel.cpp b/channel.cpp
new file mode 100644
index 0000000..fddb9a1
--- /dev/null
+++ b/channel.cpp
@@ -0,0 +1,87 @@
+#include "lattice.h"
+
+#include "LLBM/collide.h"
+#include "LLBM/initialize.h"
+#include "LLBM/bounce_back.h"
+#include "LLBM/bounce_back_moving_wall.h"
+#include "LLBM/equilibrium_density_wall.h"
+#include "LLBM/equilibrium_velocity_wall.h"
+
+#include <chrono>
+#include <iostream>
+
+#include "pattern/all.h"
+
+using T = SWEEPLB_PRECISION;
+using PATTERN = pattern::SWEEPLB_PATTERN<T>;
+
+void simulate(Cuboid cuboid, std::size_t nStep) {
+	const int nThread = omp_get_max_threads();
+
+	Lattice<PATTERN> lattice(cuboid);
+
+	LatticeMask<T> bulk_mask(cuboid.volume());
+	LatticeMask<T> wall_mask(cuboid.volume());
+	LatticeMask<T> inflow_mask(cuboid.volume());
+	LatticeMask<T> outflow_mask(cuboid.volume());
+
+	cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) {
+		if (   iY == 0 || iY == cuboid[1]-1
+		    || iZ == 0 || iZ == cuboid[2]-1) {
+			wall_mask.set(iCell, true);
+		} else if (iX == 0) {
+			inflow_mask.set(iCell, true);
+		} else if (iX == cuboid[0]-1) {
+			outflow_mask.set(iCell, true);
+		} else {
+			bulk_mask.set(iCell, true);
+		}
+	});
+
+	bulk_mask.serialize();
+	wall_mask.serialize();
+	inflow_mask.serialize();
+	outflow_mask.serialize();
+
+	T tau = 0.56;
+
+	T u_inflow  = 0.05;
+	T d_outflow = 1.;
+
+	for (std::size_t iStep = 0; iStep < 100; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), wall_mask),
+		              Operator(EquilibriumVelocityWallO(), inflow_mask, u_inflow, WallNormal<1,0,0>()),
+		              Operator(EquilibriumDensityWallO(), outflow_mask, d_outflow, WallNormal<-1,0,0>()));
+		lattice.stream();
+	}
+
+	auto start = std::chrono::steady_clock::now();
+
+	for (std::size_t iStep = 0; iStep < nStep; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), wall_mask),
+		              Operator(EquilibriumVelocityWallO(), inflow_mask, u_inflow, WallNormal<1,0,0>()),
+		              Operator(EquilibriumDensityWallO(), outflow_mask, d_outflow, WallNormal<-1,0,0>()));
+		lattice.stream();
+	}
+
+	auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(
+	  std::chrono::steady_clock::now() - start);
+
+	std::cout << cuboid[0] << ", " << cuboid[1] << ", " << cuboid[2]
+	          << ", " << nStep
+	          << ", " << nThread
+	          << ", " << (nStep * lattice.volume()) / (1e6 * duration.count())
+	          << std::endl;
+
+	lattice.write_momenta(bulk_mask, "result.vtk");
+}
+
+int main(int argc, char* argv[]) {
+	const std::size_t nX    = atoi(argv[1]);
+	const std::size_t nY    = atoi(argv[2]);
+	const std::size_t nZ    = atoi(argv[3]);
+	const std::size_t steps = atoi(argv[4]);
+	simulate({ nX, nY, nZ }, steps);
+}
diff --git a/ldc.cpp b/ldc.cpp
new file mode 100644
index 0000000..0917939
--- /dev/null
+++ b/ldc.cpp
@@ -0,0 +1,76 @@
+#include "lattice.h"
+
+#include "LLBM/collide.h"
+#include "LLBM/initialize.h"
+#include "LLBM/bounce_back.h"
+#include "LLBM/bounce_back_moving_wall.h"
+
+#include <chrono>
+#include <iostream>
+
+#include "pattern/all.h"
+
+using T = SWEEPLB_PRECISION;
+using PATTERN = pattern::SWEEPLB_PATTERN<T>;
+
+void simulate(Cuboid cuboid, std::size_t nStep) {
+	const int nThread = omp_get_max_threads();
+
+	Lattice<PATTERN> lattice(cuboid);
+
+	LatticeMask<T> bulk_mask(cuboid.volume());
+	LatticeMask<T> box_mask(cuboid.volume());
+	LatticeMask<T> lid_mask(cuboid.volume());
+
+	cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) {
+		if (iZ == cuboid[2]-1) {
+			lid_mask.set(iCell, true);
+		} else if (iX == 0 || iX == cuboid[0]-1
+			      || iY == 0 || iY == cuboid[1]-1
+			      || iZ == 0) {
+			box_mask.set(iCell, true);
+		} else {
+			bulk_mask.set(iCell, true);
+		}
+	});
+
+	bulk_mask.serialize();
+	box_mask.serialize();
+	lid_mask.serialize();
+
+	T tau = 0.51;
+	T u_lid[] { 0.05, 0., 0. };
+
+	for (std::size_t iStep = 0; iStep < 100; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), box_mask),
+		              Operator(BounceBackMovingWallO(), lid_mask, u_lid));
+		lattice.stream();
+	}
+
+	auto start = std::chrono::steady_clock::now();
+
+	for (std::size_t iStep = 0; iStep < nStep; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), box_mask),
+		              Operator(BounceBackMovingWallO(), lid_mask, u_lid));
+		lattice.stream();
+	}
+
+	auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(
+		std::chrono::steady_clock::now() - start);
+
+	std::cout << cuboid[0]
+	          << ", " << nStep
+	          << ", " << nThread
+	          << ", " << (nStep * lattice.volume()) / (1e6 * duration.count())
+	          << std::endl;
+
+	lattice.write_momenta(bulk_mask, "result.vtk");
+}
+
+int main(int argc, char* argv[]) {
+	const std::size_t n     = atoi(argv[1]);
+	const std::size_t steps = atoi(argv[2]);
+	simulate({ n, n, n}, steps);
+}
diff --git a/shell.nix b/shell.nix
new file mode 100644
index 0000000..d156272
--- /dev/null
+++ b/shell.nix
@@ -0,0 +1,15 @@
+{ pkgs ? import <nixpkgs> { }, ... }:
+
+pkgs.stdenvNoCC.mkDerivation rec {
+  name = "sweeplb-env";
+  env = pkgs.buildEnv { name = name; paths = buildInputs; };
+
+  buildInputs = with pkgs; [
+    gcc10
+    cmake
+  ];
+
+  shellHook = ''
+    export NIX_SHELL_NAME="${name}"
+  '';
+}
diff --git a/src/LLBM/bounce_back.h b/src/LLBM/bounce_back.h
new file mode 100644
index 0000000..b710ee4
--- /dev/null
+++ b/src/LLBM/bounce_back.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "concepts.h"
+
+struct BounceBackO {
+
+template <concepts::Arithmetic V>
+static void apply(V f_curr[19], V f_next[19]) {
+	f_next[0] = f_curr[18];
+	f_next[1] = f_curr[17];
+	f_next[2] = f_curr[16];
+	f_next[3] = f_curr[15];
+	f_next[4] = f_curr[14];
+	f_next[5] = f_curr[13];
+	f_next[6] = f_curr[12];
+	f_next[7] = f_curr[11];
+	f_next[8] = f_curr[10];
+	f_next[9] = f_curr[9];
+	f_next[10] = f_curr[8];
+	f_next[11] = f_curr[7];
+	f_next[12] = f_curr[6];
+	f_next[13] = f_curr[5];
+	f_next[14] = f_curr[4];
+	f_next[15] = f_curr[3];
+	f_next[16] = f_curr[2];
+	f_next[17] = f_curr[1];
+	f_next[18] = f_curr[0];
+}
+
+};
diff --git a/src/LLBM/bounce_back_moving_wall.h b/src/LLBM/bounce_back_moving_wall.h
new file mode 100644
index 0000000..c00a17e
--- /dev/null
+++ b/src/LLBM/bounce_back_moving_wall.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "concepts.h"
+
+struct BounceBackMovingWallO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T u[3]) {
+	f_next[0] = f_curr[18] + T{0.166666666666667}*u[1] + T{0.166666666666667}*u[2];
+	f_next[1] = f_curr[17] - T{0.166666666666667}*u[0] + T{0.166666666666667}*u[2];
+	f_next[2] = f_curr[16] + T{0.333333333333333}*u[2];
+	f_next[3] = f_curr[15] + T{0.166666666666667}*u[0] + T{0.166666666666667}*u[2];
+	f_next[4] = f_curr[14] - T{0.166666666666667}*u[1] + T{0.166666666666667}*u[2];
+	f_next[5] = f_curr[13] - T{0.166666666666667}*u[0] + T{0.166666666666667}*u[1];
+	f_next[6] = f_curr[12] + T{0.333333333333333}*u[1];
+	f_next[7] = f_curr[11] + T{0.166666666666667}*u[0] + T{0.166666666666667}*u[1];
+	f_next[8] = f_curr[10] - T{0.333333333333333}*u[0];
+	f_next[9] = f_curr[9];
+	f_next[10] = f_curr[8] + T{0.333333333333333}*u[0];
+	f_next[11] = f_curr[7] - T{0.166666666666667}*u[0] - T{0.166666666666667}*u[1];
+	f_next[12] = f_curr[6] - T{0.333333333333333}*u[1];
+	f_next[13] = f_curr[5] + T{0.166666666666667}*u[0] - T{0.166666666666667}*u[1];
+	f_next[14] = f_curr[4] + T{0.166666666666667}*u[1] - T{0.166666666666667}*u[2];
+	f_next[15] = f_curr[3] - T{0.166666666666667}*u[0] - T{0.166666666666667}*u[2];
+	f_next[16] = f_curr[2] - T{0.333333333333333}*u[2];
+	f_next[17] = f_curr[1] + T{0.166666666666667}*u[0] - T{0.166666666666667}*u[2];
+	f_next[18] = f_curr[0] - T{0.166666666666667}*u[1] - T{0.166666666666667}*u[2];
+}
+
+};
diff --git a/src/LLBM/collect_moments.h b/src/LLBM/collect_moments.h
new file mode 100644
index 0000000..afd5d59
--- /dev/null
+++ b/src/LLBM/collect_moments.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "concepts.h"
+
+struct CollectMomentsF {
+
+static constexpr unsigned size = 4;
+
+template <concepts::Arithmetic V>
+static void observe(V f_curr[19], V* momenta) {
+	V m0 = f_curr[10] + f_curr[13] + f_curr[17];
+	V m1 = f_curr[14] + f_curr[5] + f_curr[6];
+	V m2 = f_curr[1] + f_curr[2] + f_curr[4];
+	V m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9];
+	V m4 = -f_curr[11] + f_curr[7];
+	V m5 = -f_curr[15] + f_curr[3];
+	V m6 = V{1}/m3;
+	V m7 = f_curr[0] - f_curr[18];
+	momenta[0] = m3;
+	momenta[1] = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]);
+	momenta[2] = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]);
+	momenta[3] = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]);
+}
+
+};
diff --git a/src/LLBM/collide.h b/src/LLBM/collide.h
new file mode 100644
index 0000000..c08aade
--- /dev/null
+++ b/src/LLBM/collide.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include "concepts.h"
+
+struct BgkCollideO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T tau) {
+	V rho;
+	V u[3];
+	V m0 = f_curr[10] + f_curr[13] + f_curr[17];
+	V m1 = f_curr[14] + f_curr[5] + f_curr[6];
+	V m2 = f_curr[1] + f_curr[2] + f_curr[4];
+	V m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9];
+	V m4 = -f_curr[11] + f_curr[7];
+	V m5 = -f_curr[15] + f_curr[3];
+	V m6 = T{1}/m3;
+	V m7 = f_curr[0] - f_curr[18];
+	rho = m3;
+	u[0] = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]);
+	u[1] = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]);
+	u[2] = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]);
+	V x0 = T{1}/tau;
+	V x1 = T{0.0138888888888889}*x0;
+	V x2 = u[1] + u[2];
+	V x3 = T{9.00000000000000}*(x2*x2);
+	V x4 = T{6.00000000000000}*u[2];
+	V x5 = u[1]*u[1];
+	V x6 = T{3.00000000000000}*x5;
+	V x7 = -x6;
+	V x8 = u[0]*u[0];
+	V x9 = T{3.00000000000000}*x8;
+	V x10 = T{2.00000000000000} - x9;
+	V x11 = x10 + x7;
+	V x12 = x11 + x4;
+	V x13 = u[2]*u[2];
+	V x14 = T{3.00000000000000}*x13;
+	V x15 = -x14;
+	V x16 = T{6.00000000000000}*u[1];
+	V x17 = x15 + x16;
+	V x18 = T{6.00000000000000}*u[0];
+	V x19 = -u[0];
+	V x20 = x19 + u[2];
+	V x21 = x14 + x6 + T{-2.00000000000000};
+	V x22 = x21 + x9;
+	V x23 = x22 - x4;
+	V x24 = T{0.0277777777777778}*x0;
+	V x25 = T{6.00000000000000}*x13;
+	V x26 = u[0] + u[2];
+	V x27 = T{9.00000000000000}*(x26*x26);
+	V x28 = x15 + x18;
+	V x29 = -u[1];
+	V x30 = x29 + u[2];
+	V x31 = x19 + u[1];
+	V x32 = -x16;
+	V x33 = x18 + x22;
+	V x34 = T{6.00000000000000}*x5;
+	V x35 = u[0] + u[1];
+	V x36 = T{9.00000000000000}*(x35*x35);
+	V x37 = T{6.00000000000000}*x8;
+	V x38 = x9 + T{-2.00000000000000};
+	V x39 = x29 + u[0];
+	V x40 = -x18 + x22;
+	V x41 = -u[2];
+	V x42 = x41 + u[1];
+	V x43 = x22 + x4;
+	V x44 = x41 + u[0];
+	f_next[0] = x1*(rho*(x12 + x17 + x3) - T{72.0000000000000}*f_curr[0]) + f_curr[0];
+	f_next[1] = -x1*(rho*(x18 + x23 - T{9.00000000000000}*x20*x20) + T{72.0000000000000}*f_curr[1]) + f_curr[1];
+	f_next[2] = x24*(rho*(x12 + x25) - T{36.0000000000000}*f_curr[2]) + f_curr[2];
+	f_next[3] = x1*(rho*(x12 + x27 + x28) - T{72.0000000000000}*f_curr[3]) + f_curr[3];
+	f_next[4] = -x1*(rho*(x16 + x23 - T{9.00000000000000}*x30*x30) + T{72.0000000000000}*f_curr[4]) + f_curr[4];
+	f_next[5] = -x1*(rho*(x32 + x33 - T{9.00000000000000}*x31*x31) + T{72.0000000000000}*f_curr[5]) + f_curr[5];
+	f_next[6] = x24*(rho*(x10 + x17 + x34) - T{36.0000000000000}*f_curr[6]) + f_curr[6];
+	f_next[7] = x1*(rho*(x11 + x17 + x18 + x36) - T{72.0000000000000}*f_curr[7]) + f_curr[7];
+	f_next[8] = -x24*(rho*(x18 + x21 - x37) + T{36.0000000000000}*f_curr[8]) + f_curr[8];
+	f_next[9] = -T{0.166666666666667}*x0*(rho*x22 + T{6.00000000000000}*f_curr[9]) + f_curr[9];
+	f_next[10] = x24*(rho*(x28 + x37 + x7 + T{2.00000000000000}) - T{36.0000000000000}*f_curr[10]) + f_curr[10];
+	f_next[11] = -x1*(rho*(x16 + x33 - x36) + T{72.0000000000000}*f_curr[11]) + f_curr[11];
+	f_next[12] = -x24*(rho*(x14 + x16 - x34 + x38) + T{36.0000000000000}*f_curr[12]) + f_curr[12];
+	f_next[13] = -x1*(rho*(x16 + x40 - T{9.00000000000000}*x39*x39) + T{72.0000000000000}*f_curr[13]) + f_curr[13];
+	f_next[14] = -x1*(rho*(x32 + x43 - T{9.00000000000000}*x42*x42) + T{72.0000000000000}*f_curr[14]) + f_curr[14];
+	f_next[15] = -x1*(rho*(-x27 + x33 + x4) + T{72.0000000000000}*f_curr[15]) + f_curr[15];
+	f_next[16] = -x24*(rho*(-x25 + x38 + x4 + x6) + T{36.0000000000000}*f_curr[16]) + f_curr[16];
+	f_next[17] = -x1*(rho*(x4 + x40 - T{9.00000000000000}*x44*x44) + T{72.0000000000000}*f_curr[17]) + f_curr[17];
+	f_next[18] = -x1*(rho*(x16 - x3 + x43) + T{72.0000000000000}*f_curr[18]) + f_curr[18];
+}
+
+};
diff --git a/src/LLBM/equilibrium_density_wall.h b/src/LLBM/equilibrium_density_wall.h
new file mode 100644
index 0000000..598d59d
--- /dev/null
+++ b/src/LLBM/equilibrium_density_wall.h
@@ -0,0 +1,144 @@
+#pragma once
+
+#include "wall.h"
+#include "concepts.h"
+
+struct EquilibriumDensityWallO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T rho_w, WallNormal<1,0,0>) {
+	V u_w = (rho_w - f_curr[0] - T{2.00000000000000}*f_curr[11] - f_curr[12] - f_curr[14] - T{2.00000000000000}*f_curr[15] - f_curr[16] - f_curr[18] - T{2.00000000000000}*f_curr[1] - f_curr[2] - f_curr[4] - T{2.00000000000000}*f_curr[5] - f_curr[6] - T{2.00000000000000}*f_curr[8] - f_curr[9])/rho_w;
+	V rho = rho_w;
+	V u[3] { u_w, 0., 0. };
+	V e0 = T{0.0277777777777778}*rho;
+	V e1 = u[1] + u[2];
+	V e2 = T{4.50000000000000}*(e1*e1);
+	V e3 = T{3.00000000000000}*u[2];
+	V e4 = u[1]*u[1];
+	V e5 = T{1.50000000000000}*e4;
+	V e6 = -e5;
+	V e7 = u[0]*u[0];
+	V e8 = T{1.50000000000000}*e7;
+	V e9 = T{1.00000000000000} - e8;
+	V e10 = e6 + e9;
+	V e11 = e10 + e3;
+	V e12 = T{3.00000000000000}*u[1];
+	V e13 = u[2]*u[2];
+	V e14 = T{1.50000000000000}*e13;
+	V e15 = -e14;
+	V e16 = e12 + e15;
+	V e17 = T{3.00000000000000}*u[0];
+	V e18 = -u[2];
+	V e19 = e18 + u[0];
+	V e20 = -T{4.50000000000000}*e19*e19;
+	V e21 = e14 + e5 + T{-1.00000000000000};
+	V e22 = e21 + e8;
+	V e23 = e22 - e3;
+	V e24 = T{0.0555555555555556}*rho;
+	V e25 = T{3.00000000000000}*e13;
+	V e26 = u[0] + u[2];
+	V e27 = T{4.50000000000000}*(e26*e26);
+	V e28 = e15 + e17;
+	V e29 = e18 + u[1];
+	V e30 = -T{4.50000000000000}*e29*e29;
+	V e31 = -e12;
+	V e32 = u[0] - u[1];
+	V e33 = -T{4.50000000000000}*e32*e32;
+	V e34 = e17 + e22;
+	V e35 = T{3.00000000000000}*e4;
+	V e36 = u[0] + u[1];
+	V e37 = T{4.50000000000000}*(e36*e36);
+	V e38 = T{3.00000000000000}*e7;
+	V e39 = e8 + T{-1.00000000000000};
+	V e40 = -e17 + e22;
+	V e41 = e22 + e3;
+	f_next[0] = e0*(e11 + e16 + e2);
+	f_next[1] = -e0*(e17 + e20 + e23);
+	f_next[2] = e24*(e11 + e25);
+	f_next[3] = e0*(e11 + e27 + e28);
+	f_next[4] = -e0*(e12 + e23 + e30);
+	f_next[5] = -e0*(e31 + e33 + e34);
+	f_next[6] = e24*(e16 + e35 + e9);
+	f_next[7] = e0*(e10 + e16 + e17 + e37);
+	f_next[8] = -e24*(e17 + e21 - e38);
+	f_next[9] = -T{0.333333333333333}*e22*rho;
+	f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+	f_next[11] = -e0*(e12 + e34 - e37);
+	f_next[12] = -e24*(e12 + e14 - e35 + e39);
+	f_next[13] = -e0*(e12 + e33 + e40);
+	f_next[14] = -e0*(e30 + e31 + e41);
+	f_next[15] = -e0*(-e27 + e3 + e34);
+	f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+	f_next[17] = -e0*(e20 + e3 + e40);
+	f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T rho_w, WallNormal<-1,0,0>) {
+	V u_w = (-rho_w + f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/rho_w;
+	V rho = rho_w;
+	V u[3] { u_w, 0., 0. };
+	V e0 = T{0.0277777777777778}*rho;
+	V e1 = u[1] + u[2];
+	V e2 = T{4.50000000000000}*(e1*e1);
+	V e3 = T{3.00000000000000}*u[2];
+	V e4 = u[1]*u[1];
+	V e5 = T{1.50000000000000}*e4;
+	V e6 = -e5;
+	V e7 = u[0]*u[0];
+	V e8 = T{1.50000000000000}*e7;
+	V e9 = T{1.00000000000000} - e8;
+	V e10 = e6 + e9;
+	V e11 = e10 + e3;
+	V e12 = T{3.00000000000000}*u[1];
+	V e13 = u[2]*u[2];
+	V e14 = T{1.50000000000000}*e13;
+	V e15 = -e14;
+	V e16 = e12 + e15;
+	V e17 = T{3.00000000000000}*u[0];
+	V e18 = -u[2];
+	V e19 = e18 + u[0];
+	V e20 = -T{4.50000000000000}*e19*e19;
+	V e21 = e14 + e5 + T{-1.00000000000000};
+	V e22 = e21 + e8;
+	V e23 = e22 - e3;
+	V e24 = T{0.0555555555555556}*rho;
+	V e25 = T{3.00000000000000}*e13;
+	V e26 = u[0] + u[2];
+	V e27 = T{4.50000000000000}*(e26*e26);
+	V e28 = e15 + e17;
+	V e29 = e18 + u[1];
+	V e30 = -T{4.50000000000000}*e29*e29;
+	V e31 = -e12;
+	V e32 = u[0] - u[1];
+	V e33 = -T{4.50000000000000}*e32*e32;
+	V e34 = e17 + e22;
+	V e35 = T{3.00000000000000}*e4;
+	V e36 = u[0] + u[1];
+	V e37 = T{4.50000000000000}*(e36*e36);
+	V e38 = T{3.00000000000000}*e7;
+	V e39 = e8 + T{-1.00000000000000};
+	V e40 = -e17 + e22;
+	V e41 = e22 + e3;
+	f_next[0] = e0*(e11 + e16 + e2);
+	f_next[1] = -e0*(e17 + e20 + e23);
+	f_next[2] = e24*(e11 + e25);
+	f_next[3] = e0*(e11 + e27 + e28);
+	f_next[4] = -e0*(e12 + e23 + e30);
+	f_next[5] = -e0*(e31 + e33 + e34);
+	f_next[6] = e24*(e16 + e35 + e9);
+	f_next[7] = e0*(e10 + e16 + e17 + e37);
+	f_next[8] = -e24*(e17 + e21 - e38);
+	f_next[9] = -T{0.333333333333333}*e22*rho;
+	f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+	f_next[11] = -e0*(e12 + e34 - e37);
+	f_next[12] = -e24*(e12 + e14 - e35 + e39);
+	f_next[13] = -e0*(e12 + e33 + e40);
+	f_next[14] = -e0*(e30 + e31 + e41);
+	f_next[15] = -e0*(-e27 + e3 + e34);
+	f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+	f_next[17] = -e0*(e20 + e3 + e40);
+	f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+};
diff --git a/src/LLBM/equilibrium_velocity_wall.h b/src/LLBM/equilibrium_velocity_wall.h
new file mode 100644
index 0000000..2d68e34
--- /dev/null
+++ b/src/LLBM/equilibrium_velocity_wall.h
@@ -0,0 +1,144 @@
+#pragma once
+
+#include "wall.h"
+#include "concepts.h"
+
+struct EquilibriumVelocityWallO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T u_w, WallNormal<1,0,0>) {
+	V rho;
+	rho = -(f_curr[0] + T{2.00000000000000}*f_curr[11] + f_curr[12] + f_curr[14] + T{2.00000000000000}*f_curr[15] + f_curr[16] + f_curr[18] + T{2.00000000000000}*f_curr[1] + f_curr[2] + f_curr[4] + T{2.00000000000000}*f_curr[5] + f_curr[6] + T{2.00000000000000}*f_curr[8] + f_curr[9])/(u_w + T{-1.00000000000000});
+	V u[3] { u_w, 0., 0. };
+	V e0 = T{0.0277777777777778}*rho;
+	V e1 = u[1] + u[2];
+	V e2 = T{4.50000000000000}*(e1*e1);
+	V e3 = T{3.00000000000000}*u[2];
+	V e4 = u[1]*u[1];
+	V e5 = T{1.50000000000000}*e4;
+	V e6 = -e5;
+	V e7 = u[0]*u[0];
+	V e8 = T{1.50000000000000}*e7;
+	V e9 = T{1.00000000000000} - e8;
+	V e10 = e6 + e9;
+	V e11 = e10 + e3;
+	V e12 = T{3.00000000000000}*u[1];
+	V e13 = u[2]*u[2];
+	V e14 = T{1.50000000000000}*e13;
+	V e15 = -e14;
+	V e16 = e12 + e15;
+	V e17 = T{3.00000000000000}*u[0];
+	V e18 = -u[2];
+	V e19 = e18 + u[0];
+	V e20 = -T{4.50000000000000}*e19*e19;
+	V e21 = e14 + e5 + T{-1.00000000000000};
+	V e22 = e21 + e8;
+	V e23 = e22 - e3;
+	V e24 = T{0.0555555555555556}*rho;
+	V e25 = T{3.00000000000000}*e13;
+	V e26 = u[0] + u[2];
+	V e27 = T{4.50000000000000}*(e26*e26);
+	V e28 = e15 + e17;
+	V e29 = e18 + u[1];
+	V e30 = -T{4.50000000000000}*e29*e29;
+	V e31 = -e12;
+	V e32 = u[0] - u[1];
+	V e33 = -T{4.50000000000000}*e32*e32;
+	V e34 = e17 + e22;
+	V e35 = T{3.00000000000000}*e4;
+	V e36 = u[0] + u[1];
+	V e37 = T{4.50000000000000}*(e36*e36);
+	V e38 = T{3.00000000000000}*e7;
+	V e39 = e8 + T{-1.00000000000000};
+	V e40 = -e17 + e22;
+	V e41 = e22 + e3;
+	f_next[0] = e0*(e11 + e16 + e2);
+	f_next[1] = -e0*(e17 + e20 + e23);
+	f_next[2] = e24*(e11 + e25);
+	f_next[3] = e0*(e11 + e27 + e28);
+	f_next[4] = -e0*(e12 + e23 + e30);
+	f_next[5] = -e0*(e31 + e33 + e34);
+	f_next[6] = e24*(e16 + e35 + e9);
+	f_next[7] = e0*(e10 + e16 + e17 + e37);
+	f_next[8] = -e24*(e17 + e21 - e38);
+	f_next[9] = -T{0.333333333333333}*e22*rho;
+	f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+	f_next[11] = -e0*(e12 + e34 - e37);
+	f_next[12] = -e24*(e12 + e14 - e35 + e39);
+	f_next[13] = -e0*(e12 + e33 + e40);
+	f_next[14] = -e0*(e30 + e31 + e41);
+	f_next[15] = -e0*(-e27 + e3 + e34);
+	f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+	f_next[17] = -e0*(e20 + e3 + e40);
+	f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T u_w, WallNormal<-1,0,0>) {
+	V rho;
+	rho = (f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/(u_w + T{1.00000000000000});
+	V u[3] { u_w, 0., 0. };
+	V e0 = T{0.0277777777777778}*rho;
+	V e1 = u[1] + u[2];
+	V e2 = T{4.50000000000000}*(e1*e1);
+	V e3 = T{3.00000000000000}*u[2];
+	V e4 = u[1]*u[1];
+	V e5 = T{1.50000000000000}*e4;
+	V e6 = -e5;
+	V e7 = u[0]*u[0];
+	V e8 = T{1.50000000000000}*e7;
+	V e9 = T{1.00000000000000} - e8;
+	V e10 = e6 + e9;
+	V e11 = e10 + e3;
+	V e12 = T{3.00000000000000}*u[1];
+	V e13 = u[2]*u[2];
+	V e14 = T{1.50000000000000}*e13;
+	V e15 = -e14;
+	V e16 = e12 + e15;
+	V e17 = T{3.00000000000000}*u[0];
+	V e18 = -u[2];
+	V e19 = e18 + u[0];
+	V e20 = -T{4.50000000000000}*e19*e19;
+	V e21 = e14 + e5 + T{-1.00000000000000};
+	V e22 = e21 + e8;
+	V e23 = e22 - e3;
+	V e24 = T{0.0555555555555556}*rho;
+	V e25 = T{3.00000000000000}*e13;
+	V e26 = u[0] + u[2];
+	V e27 = T{4.50000000000000}*(e26*e26);
+	V e28 = e15 + e17;
+	V e29 = e18 + u[1];
+	V e30 = -T{4.50000000000000}*e29*e29;
+	V e31 = -e12;
+	V e32 = u[0] - u[1];
+	V e33 = -T{4.50000000000000}*e32*e32;
+	V e34 = e17 + e22;
+	V e35 = T{3.00000000000000}*e4;
+	V e36 = u[0] + u[1];
+	V e37 = T{4.50000000000000}*(e36*e36);
+	V e38 = T{3.00000000000000}*e7;
+	V e39 = e8 + T{-1.00000000000000};
+	V e40 = -e17 + e22;
+	V e41 = e22 + e3;
+	f_next[0] = e0*(e11 + e16 + e2);
+	f_next[1] = -e0*(e17 + e20 + e23);
+	f_next[2] = e24*(e11 + e25);
+	f_next[3] = e0*(e11 + e27 + e28);
+	f_next[4] = -e0*(e12 + e23 + e30);
+	f_next[5] = -e0*(e31 + e33 + e34);
+	f_next[6] = e24*(e16 + e35 + e9);
+	f_next[7] = e0*(e10 + e16 + e17 + e37);
+	f_next[8] = -e24*(e17 + e21 - e38);
+	f_next[9] = -T{0.333333333333333}*e22*rho;
+	f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+	f_next[11] = -e0*(e12 + e34 - e37);
+	f_next[12] = -e24*(e12 + e14 - e35 + e39);
+	f_next[13] = -e0*(e12 + e33 + e40);
+	f_next[14] = -e0*(e30 + e31 + e41);
+	f_next[15] = -e0*(-e27 + e3 + e34);
+	f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+	f_next[17] = -e0*(e20 + e3 + e40);
+	f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+};
diff --git a/src/LLBM/initialize.h b/src/LLBM/initialize.h
new file mode 100644
index 0000000..b361217
--- /dev/null
+++ b/src/LLBM/initialize.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "concepts.h"
+
+struct InitializeO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T = V>
+static void apply(V f_curr[19], V f_next[19]) {
+	f_next[0] = T{0.0277777777777778};
+	f_next[1] = T{0.0277777777777778};
+	f_next[2] = T{0.0555555555555556};
+	f_next[3] = T{0.0277777777777778};
+	f_next[4] = T{0.0277777777777778};
+	f_next[5] = T{0.0277777777777778};
+	f_next[6] = T{0.0555555555555556};
+	f_next[7] = T{0.0277777777777778};
+	f_next[8] = T{0.0555555555555556};
+	f_next[9] = T{0.333333333333333};
+	f_next[10] = T{0.0555555555555556};
+	f_next[11] = T{0.0277777777777778};
+	f_next[12] = T{0.0555555555555556};
+	f_next[13] = T{0.0277777777777778};
+	f_next[14] = T{0.0277777777777778};
+	f_next[15] = T{0.0277777777777778};
+	f_next[16] = T{0.0555555555555556};
+	f_next[17] = T{0.0277777777777778};
+	f_next[18] = T{0.0277777777777778};
+}
+
+template <concepts::Arithmetic V, concepts::Arithmetic T = V>
+static void apply(V f_curr[19], V f_next[19], T scale) {
+	f_next[0]  = T{scale * 0.0277777777777778};
+	f_next[1]  = T{scale * 0.0277777777777778};
+	f_next[2]  = T{scale * 0.0555555555555556};
+	f_next[3]  = T{scale * 0.0277777777777778};
+	f_next[4]  = T{scale * 0.0277777777777778};
+	f_next[5]  = T{scale * 0.0277777777777778};
+	f_next[6]  = T{scale * 0.0555555555555556};
+	f_next[7]  = T{scale * 0.0277777777777778};
+	f_next[8]  = T{scale * 0.0555555555555556};
+	f_next[9]  = T{scale * 0.333333333333333};
+	f_next[10] = T{scale * 0.0555555555555556};
+	f_next[11] = T{scale * 0.0277777777777778};
+	f_next[12] = T{scale * 0.0555555555555556};
+	f_next[13] = T{scale * 0.0277777777777778};
+	f_next[14] = T{scale * 0.0277777777777778};
+	f_next[15] = T{scale * 0.0277777777777778};
+	f_next[16] = T{scale * 0.0555555555555556};
+	f_next[17] = T{scale * 0.0277777777777778};
+	f_next[18] = T{scale * 0.0277777777777778};
+}
+
+};
diff --git a/src/LLBM/smagorinsky_collide.h b/src/LLBM/smagorinsky_collide.h
new file mode 100644
index 0000000..1a90598
--- /dev/null
+++ b/src/LLBM/smagorinsky_collide.h
@@ -0,0 +1,129 @@
+#pragma once
+
+#include "concepts.h"
+
+struct SmagorinskyBgkCollideO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T tau, T smagorinsky) {
+	V rho;
+	V u[3];
+	V x0 = f_curr[10] + f_curr[13] + f_curr[17];
+	V x1 = f_curr[14] + f_curr[5] + f_curr[6];
+	V x2 = f_curr[1] + f_curr[2] + f_curr[4];
+	V x3 = x0 + x1 + x2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9];
+	V x4 = -f_curr[11] + f_curr[7];
+	V x5 = -f_curr[15] + f_curr[3];
+	V x6 = T{1}/x3;
+	V x7 = f_curr[0] - f_curr[18];
+	V x8 = T{2.00000000000000}*tau + T{-1.00000000000000};
+	V x9 = T{72.0000000000000}*f_curr[5];
+	V x26 = T{72.0000000000000}*f_curr[13];
+	V x33 = T{72.0000000000000}*f_curr[11];
+	V x44 = T{72.0000000000000}*f_curr[1];
+	V x50 = T{72.0000000000000}*f_curr[17];
+	V x56 = T{72.0000000000000}*f_curr[15];
+	V x64 = T{72.0000000000000}*f_curr[4];
+	V x68 = T{72.0000000000000}*f_curr[14];
+	V x72 = T{72.0000000000000}*f_curr[18];
+	rho = x3;
+	V x78 = T{2.00000000000000}*rho;
+	u[0] = x6*(x0 + x4 + x5 - f_curr[1] - f_curr[5] - f_curr[8]);
+	V x12 = -u[0];
+	V x14 = T{6.00000000000000}*u[0];
+	V x17 = u[0]*u[0];
+	V x18 = T{3.00000000000000}*x17;
+	V x27 = -x14;
+	V x38 = T{2.00000000000000} - x18;
+	V x79 = T{6.00000000000000}*x17;
+	u[1] = x6*(x1 + x4 + x7 - f_curr[12] - f_curr[13] - f_curr[4]);
+	V x10 = T{6.00000000000000}*u[1];
+	V x11 = -x10;
+	V x13 = x12 + u[1];
+	V x19 = u[1]*u[1];
+	V x20 = T{3.00000000000000}*x19;
+	V x21 = x18 + x20 + T{-2.00000000000000};
+	V x28 = -u[1];
+	V x29 = x28 + u[0];
+	V x34 = u[0] + u[1];
+	V x35 = T{9.00000000000000}*(x34*x34);
+	V x37 = -x20;
+	V x39 = x37 + x38;
+	V x85 = T{6.00000000000000}*x19;
+	u[2] = x6*(x2 + x5 + x7 - f_curr[14] - f_curr[16] - f_curr[17]);
+	V x15 = u[2]*u[2];
+	V x16 = T{3.00000000000000}*x15;
+	V x22 = x16 + x21;
+	V x23 = x14 + x22;
+	V x24 = rho*(x11 + x23 - T{9.00000000000000}*x13*x13);
+	V x25 = x24 + x9;
+	V x30 = x10 + x22;
+	V x31 = rho*(x27 + x30 - T{9.00000000000000}*x29*x29);
+	V x32 = x26 + x31;
+	V x36 = rho*(x14 + x30 - x35);
+	V x40 = -x16;
+	V x41 = x10 + x40;
+	V x42 = rho*(x14 + x35 + x39 + x41) - T{72.0000000000000}*f_curr[7];
+	V x43 = -x33 - x36 + x42;
+	V x45 = T{6.00000000000000}*u[2];
+	V x46 = -x45;
+	V x47 = x12 + u[2];
+	V x48 = rho*(x23 + x46 - T{9.00000000000000}*x47*x47);
+	V x49 = x44 + x48;
+	V x51 = -u[2];
+	V x52 = x51 + u[0];
+	V x53 = x22 + x45;
+	V x54 = rho*(x27 + x53 - T{9.00000000000000}*x52*x52);
+	V x55 = x50 + x54;
+	V x57 = u[0] + u[2];
+	V x58 = T{9.00000000000000}*(x57*x57);
+	V x59 = rho*(x14 + x53 - x58);
+	V x60 = x39 + x45;
+	V x61 = x14 + x40;
+	V x62 = rho*(x58 + x60 + x61) - T{72.0000000000000}*f_curr[3];
+	V x63 = -x56 - x59 + x62;
+	V x65 = x28 + u[2];
+	V x66 = rho*(x30 + x46 - T{9.00000000000000}*x65*x65);
+	V x67 = x64 + x66;
+	V x69 = x51 + u[1];
+	V x70 = rho*(x11 + x53 - T{9.00000000000000}*x69*x69);
+	V x71 = x68 + x70;
+	V x73 = u[1] + u[2];
+	V x74 = T{9.00000000000000}*(x73*x73);
+	V x75 = rho*(x30 + x45 - x74);
+	V x76 = rho*(x41 + x60 + x74) - T{72.0000000000000}*f_curr[0];
+	V x77 = -x72 - x75 + x76;
+	V x80 = x16 + T{-2.00000000000000};
+	V x81 = x14 + x20 - x79 + x80;
+	V x82 = x37 + x61 + x79 + T{2.00000000000000};
+	V x83 = -x44 - x48 - x50 - x54 + x63;
+	V x84 = -x24 - x26 - x31 + x43 - x9;
+	V x86 = x10 + x18 + x80 - x85;
+	V x87 = x38 + x41 + x85;
+	V x88 = -x64 - x66 - x68 - x70 + x77;
+	V x89 = T{6.00000000000000}*x15;
+	V x90 = x21 + x45 - x89;
+	V x91 = x60 + x89;
+	V x92 = T{1}/(T{20.0000000000000}*tau + T{2.00000000000000} + T{5.04537849152229}*simd::sqrt<T>((smagorinsky*smagorinsky)*simd::sqrt<T>((x25 + x32 + x43)*(x25 + x32 + x43) + (x49 + x55 + x63)*(x49 + x55 + x63) + (x67 + x71 + x77)*(x67 + x71 + x77) + T{0.500000000000000}*((-x78*x81 + x78*x82 + x83 + x84 - T{72}*f_curr[10] - T{72}*f_curr[8])*(-x78*x81 + x78*x82 + x83 + x84 - T{72}*f_curr[10] - T{72}*f_curr[8])) + T{0.500000000000000}*((-x78*x86 + x78*x87 + x84 + x88 - T{72}*f_curr[12] - T{72}*f_curr[6])*(-x78*x86 + x78*x87 + x84 + x88 - T{72}*f_curr[12] - T{72}*f_curr[6])) + T{0.500000000000000}*((-x78*x90 + x78*x91 + x83 + x88 - T{72}*f_curr[16] - T{72}*f_curr[2])*(-x78*x90 + x78*x91 + x83 + x88 - T{72}*f_curr[16] - T{72}*f_curr[2]))) + T{0.157134840263677}*(x8*x8)));
+	f_next[0] = T{0.333333333333333}*x76*x92 + f_curr[0];
+	f_next[1] = -T{0.333333333333333}*x49*x92 + f_curr[1];
+	f_next[2] = T{0.666666666666667}*x92*(rho*x91 - T{36.0000000000000}*f_curr[2]) + f_curr[2];
+	f_next[3] = T{0.333333333333333}*x62*x92 + f_curr[3];
+	f_next[4] = -T{0.333333333333333}*x67*x92 + f_curr[4];
+	f_next[5] = -T{0.333333333333333}*x25*x92 + f_curr[5];
+	f_next[6] = T{0.666666666666667}*x92*(rho*x87 - T{36.0000000000000}*f_curr[6]) + f_curr[6];
+	f_next[7] = T{0.333333333333333}*x42*x92 + f_curr[7];
+	f_next[8] = -T{0.666666666666667}*x92*(rho*x81 + T{36.0000000000000}*f_curr[8]) + f_curr[8];
+	f_next[9] = -T{4.00000000000000}*x92*(rho*x22 + T{6.00000000000000}*f_curr[9]) + f_curr[9];
+	f_next[10] = T{0.666666666666667}*x92*(rho*x82 - T{36.0000000000000}*f_curr[10]) + f_curr[10];
+	f_next[11] = -T{0.333333333333333}*x92*(x33 + x36) + f_curr[11];
+	f_next[12] = -T{0.666666666666667}*x92*(rho*x86 + T{36.0000000000000}*f_curr[12]) + f_curr[12];
+	f_next[13] = -T{0.333333333333333}*x32*x92 + f_curr[13];
+	f_next[14] = -T{0.333333333333333}*x71*x92 + f_curr[14];
+	f_next[15] = -T{0.333333333333333}*x92*(x56 + x59) + f_curr[15];
+	f_next[16] = -T{0.666666666666667}*x92*(rho*x90 + T{36.0000000000000}*f_curr[16]) + f_curr[16];
+	f_next[17] = -T{0.333333333333333}*x55*x92 + f_curr[17];
+	f_next[18] = -T{0.333333333333333}*x92*(x72 + x75) + f_curr[18];
+}
+
+};
diff --git a/src/LLBM/wall.h b/src/LLBM/wall.h
new file mode 100644
index 0000000..26534e4
--- /dev/null
+++ b/src/LLBM/wall.h
@@ -0,0 +1,4 @@
+#pragma once
+
+template<int N_0, int N_1, int N_2=0>
+struct WallNormal { };
diff --git a/src/concepts.h b/src/concepts.h
new file mode 100644
index 0000000..61f7cbf
--- /dev/null
+++ b/src/concepts.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <concepts>
+
+namespace concepts {
+
+template <typename V>
+concept Arithmetic = requires(V a, V b) {
+  a +  b;
+  a += b;
+
+  a -  b;
+  a -= b;
+
+  a *  b;
+  a *= b;
+
+  a /  b;
+  a /= b;
+};
+
+}
diff --git a/src/cuboid.h b/src/cuboid.h
new file mode 100644
index 0000000..c4ff803
--- /dev/null
+++ b/src/cuboid.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <array>
+
+class Cuboid {
+private:
+	const std::array<std::size_t,3> _size;
+	const std::size_t _volume;
+
+public:
+	Cuboid(std::size_t nX, std::size_t nY, std::size_t nZ):
+		_size{nX, nY, nZ},
+		_volume(nX*nY*nZ) { }
+
+	std::size_t volume() const {
+		return _volume;
+	}
+
+	std::size_t index(unsigned iX, unsigned iY, unsigned iZ) const {
+		return iX*_size[1]*_size[2] + iY*_size[2] + iZ;
+	}
+
+	int operator[](unsigned i) const {
+		return _size[i];
+	}
+
+	template <typename F>
+	void traverse(F f) {
+		for (int iX = 0; iX < _size[0]; ++iX) {
+			for (int iY = 0; iY < _size[1]; ++iY) {
+				for (int iZ = 0; iZ < _size[2]; ++iZ) {
+					f(iX, iY, iZ, index(iX,iY,iZ));
+				}
+			}
+		}
+	}
+};
diff --git a/src/export.h b/src/export.h
new file mode 100644
index 0000000..a3d38ec
--- /dev/null
+++ b/src/export.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <span>
+#include <fstream>
+
+namespace export_format {
+
+template <std::floating_point T>
+void vtk(Cuboid c, std::span<T> rho, std::span<T> u, std::string path) {
+	std::ofstream fout;
+	fout.open(path.c_str());
+
+	fout << "# vtk DataFile Version 3.0\n";
+	fout << "lbm_output\n";
+	fout << "ASCII\n";
+	fout << "DATASET RECTILINEAR_GRID\n";
+	fout << "DIMENSIONS " << c[0] << " " << c[1] << " " << c[2] << "\n";
+
+	fout << "X_COORDINATES " << c[0] << " float\n";
+	for (std::size_t iX=0; iX < c[0]; ++iX) {
+		fout << iX << " ";
+	}
+
+	fout << "\nY_COORDINATES " << c[1] << " float\n";
+	for (std::size_t iY=0; iY < c[1]; ++iY) {
+		fout << iY << " ";
+	}
+
+	fout << "\nZ_COORDINATES " << c[2] << " float\n";
+	for (std::size_t iZ=0; iZ < c[2]; ++iZ) {
+		fout << iZ << " ";
+	}
+	fout << "\nPOINT_DATA " << c.volume() << "\n";
+
+	fout << "VECTORS velocity float\n";
+	for (std::size_t iZ=0; iZ < c[2]; ++iZ) {
+		for (std::size_t iY=0; iY < c[1]; ++iY) {
+			for (std::size_t iX=0; iX < c[0]; ++iX) {
+				const std::size_t iCell = c.index(iX,iY,iZ);
+				fout << u[iCell] << " " << u[c.volume() + iCell] << " " << u[2*c.volume() + iCell] << "\n";
+			}
+		}
+	}
+
+	fout << "SCALARS density float 1\n";
+	fout << "LOOKUP_TABLE default\n";
+	for (std::size_t iZ=0; iZ < c[2]; ++iZ) {
+		for (std::size_t iY=0; iY < c[1]; ++iY) {
+			for (std::size_t iX=0; iX < c[0]; ++iX) {
+				const std::size_t iCell = c.index(iX,iY,iZ);
+				fout << rho[iCell] << "\n";
+			}
+		}
+	}
+
+	fout.close();
+}
+
+}
diff --git a/src/lattice.h b/src/lattice.h
new file mode 100644
index 0000000..bbebab4
--- /dev/null
+++ b/src/lattice.h
@@ -0,0 +1,267 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#ifdef ENABLE_AVX512
+#include "simd/512.h"
+#else
+#include "simd/256.h"
+#endif
+
+#include <omp.h>
+
+#include "cuboid.h"
+#include "population.h"
+#include "propagation.h"
+#include "operator.h"
+#include "export.h"
+
+#include "LLBM/initialize.h"
+#include "LLBM/collect_moments.h"
+
+template <typename T>
+class LatticeMask {
+private:
+	const std::size_t _size;
+
+	std::unique_ptr<typename simd::Mask<T>::storage_t[]> _simd_mask;
+	std::unique_ptr<bool[]> _base_mask;
+
+public:
+	LatticeMask(std::size_t nCells):
+		_size(((nCells + simd::Pack<T>::size - 1) / simd::Mask<T>::storage_size + 1) * simd::Mask<T>::storage_size),
+		_simd_mask(new typename simd::Mask<T>::storage_t[_size / simd::Mask<T>::storage_size] { }),
+		_base_mask(new bool[_size] { })
+	{ }
+
+	void set(std::size_t iCell, bool state) {
+		_base_mask[iCell] = state;
+	}
+
+	bool get(std::size_t iCell) const {
+		return _base_mask[iCell];
+	}
+
+	void serialize() {
+		for (std::size_t i=0; i < _size / simd::Mask<T>::storage_size; ++i) {
+			_simd_mask[i] = simd::Mask<T>::encode(_base_mask.get() + i*simd::Mask<T>::storage_size);
+		}
+	}
+
+	typename simd::Mask<T>::storage_t* data() {
+		return _simd_mask.get();
+	}
+
+};
+
+template <concepts::PropagatablePopulationBuffer Buffer>
+class Lattice {
+private:
+	using value_t = typename Buffer::value_t;
+	using pack_t = simd::Pack<value_t>;
+	using mask_t = simd::Mask<value_t>;
+
+	Cuboid _cuboid;
+	Buffer _population;
+
+	template <typename COP>
+	void apply(COP cop,
+		         std::size_t iCell,
+		         pack_t f_curr[population::q],
+		         pack_t f_next[population::q]) {
+		if (mask_t m = {cop.mask.data(), iCell}) {
+			cop(f_curr, f_next);
+
+			#pragma GCC unroll population::q
+			for (unsigned iPop=0; iPop < population::q; ++iPop) {
+				simd::maskstore(get(iPop, stage::post_collision()) + iCell, m, f_next[iPop]);
+			}
+		}
+	}
+
+public:
+	Lattice(Cuboid cuboid):
+		_cuboid(cuboid),
+		_population(cuboid)
+	{
+		apply<InitializeO>();
+	}
+
+	template <typename STAGE>
+	value_t* get(unsigned iPop, STAGE stage) {
+		return _population.get(iPop, stage);
+	}
+
+	std::size_t volume() const {
+		return _cuboid.volume();
+	}
+
+	void stream() {
+		_population.stream();
+	}
+
+	template <typename F, typename... ARGS>
+  requires concepts::Operator<F,value_t>
+	void apply(std::size_t iCell, ARGS&&... args) {
+		value_t f_curr[population::q];
+		value_t f_next[population::q];
+		#pragma GCC unroll population::q
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+			f_curr[iPop] = get(iPop, stage::pre_collision())[iCell];
+		}
+
+		F::apply(f_curr, f_next, std::forward<ARGS>(args)...);
+
+		#pragma GCC unroll population::q
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+			get(iPop, stage::post_collision())[iCell] = f_next[iPop];
+		}
+	}
+
+	template <typename F>
+	requires concepts::Operator<F,pack_t>
+	void apply() {
+		pack_t f_curr[population::q];
+		pack_t f_next[population::q];
+		#pragma omp parallel for private(f_curr, f_next) schedule(static)
+		for (std::size_t i=0; i < volume() / pack_t::size; ++i) {
+			const std::size_t iCell = pack_t::size * i;
+
+			#pragma GCC unroll population::q
+			for (unsigned iPop=0; iPop < population::q; ++iPop) {
+				f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell);
+			}
+
+			F::apply(f_curr, f_next);
+
+			#pragma GCC unroll population::q
+			for (unsigned iPop=0; iPop < population::q; ++iPop) {
+				simd::store(get(iPop, stage::post_collision()) + iCell, f_next[iPop]);
+			}
+		}
+
+		for (std::size_t iCell = pack_t::size * (volume() / pack_t::size);
+				             iCell < volume();
+				           ++iCell) {
+			apply<F>(iCell);
+		}
+	}
+
+	template <typename F, typename... ARGS>
+	requires concepts::Operator<F,pack_t,ARGS...>
+	void apply(LatticeMask<value_t>& mask, ARGS&&... args) {
+		pack_t f_curr[population::q];
+		pack_t f_next[population::q];
+		#pragma omp parallel for private(f_curr, f_next) schedule(static)
+		for (std::size_t iCell=0; iCell < volume(); iCell += pack_t::size) {
+			mask_t m(mask.data(), iCell);
+			if (m) {
+				#pragma GCC unroll population::q
+				for (unsigned iPop=0; iPop < population::q; ++iPop) {
+					f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell);
+				}
+
+				F::apply(f_curr, f_next, std::forward<ARGS>(args)...);
+
+				#pragma GCC unroll population::q
+				for (unsigned iPop=0; iPop < population::q; ++iPop) {
+					simd::maskstore(get(iPop, stage::post_collision()) + iCell, m, f_next[iPop]);
+				}
+			}
+		}
+	}
+
+	template <typename... COPs>
+	void apply(COPs&&... cops) {
+		pack_t f_curr[population::q];
+		pack_t f_next[population::q];
+		#pragma omp parallel for private(f_curr, f_next) schedule(static)
+		for (std::size_t iCell=0; iCell < volume(); iCell += pack_t::size) {
+			#pragma GCC unroll population::q
+			for (unsigned iPop=0; iPop < population::q; ++iPop) {
+				f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell);
+			}
+
+			(apply(cops, iCell, f_curr, f_next), ...);
+		}
+	}
+
+	template <typename F, typename... ARGS>
+	void apply(std::vector<typename pack_t::index_t>& cells, ARGS&&... args) {
+	#ifdef SIMD_CELL_LIST
+		pack_t f_curr[population::q];
+		pack_t f_next[population::q];
+		#pragma omp parallel for private(f_curr, f_next) schedule(static)
+		for (std::size_t i=0; i < cells.size() / pack_t::size; ++i) {
+			const std::size_t iIdx = pack_t::size * i;
+			pack_t::index_t* locs = cells.data() + iIdx;
+			#pragma GCC unroll population::q
+			for (unsigned iPop=0; iPop < population::q; ++iPop) {
+				f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()), locs);
+			}
+
+			F::apply(f_curr, f_next, std::forward<ARGS>(args)...);
+
+			#pragma GCC unroll population::q
+			for (unsigned iPop=0; iPop < population::q; ++iPop) {
+				simd::store(get(iPop, stage::post_collision()), f_next[iPop], locs);
+			}
+		}
+
+		for (std::size_t i = pack_t::size * (cells.size() / pack_t::size);
+				             i < cells.size();
+				           ++i) {
+			apply<F>(cells[i], std::forward<ARGS>(args)...);
+		}
+	#else
+		value_t f_curr[population::q];
+		value_t f_next[population::q];
+		#pragma omp parallel for private(f_curr, f_next) schedule(static)
+		for (std::size_t i=0; i < cells.size(); ++i) {
+			#pragma GCC unroll population::q
+			for (unsigned iPop=0; iPop < population::q; ++iPop) {
+				f_curr[iPop] = get(iPop, stage::pre_collision())[cells[i]];
+			}
+
+			F::apply(f_curr, f_next, std::forward<ARGS>(args)...);
+
+			#pragma GCC unroll population::q
+			for (unsigned iPop=0; iPop < population::q; ++iPop) {
+				get(iPop, stage::post_collision())[cells[i]] = f_next[iPop];
+			}
+		}
+	#endif
+	}
+
+	template <typename F>
+	requires concepts::Functor<F,pack_t>
+	void observe(LatticeMask<value_t>& mask, std::span<value_t> buffer) {
+		pack_t f[population::q];
+		pack_t output[F::size];
+		#pragma omp parallel for private(f,output) schedule(static)
+		for (std::size_t iCell=0; iCell < volume(); iCell += pack_t::size) {
+			mask_t m(mask.data(), iCell);
+			if (m) {
+				for (unsigned iPop=0; iPop < population::q; ++iPop) {
+					f[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell);
+				}
+
+				CollectMomentsF::observe(f, output);
+
+				for (unsigned i=0; i < F::size; ++i) {
+					simd::maskstore(buffer.data() + i*volume() + iCell, m, output[i]);
+				}
+			}
+		}
+	}
+
+	void write_momenta(LatticeMask<value_t>& mask, std::string path) {
+		std::vector<value_t> buffer(CollectMomentsF::size*volume());
+		observe<CollectMomentsF>(mask, buffer);
+		export_format::vtk<value_t>(_cuboid,
+		                            { buffer.data(),            1*volume() },
+		                            { buffer.data() + volume(), 3*volume() },
+		                            path);
+	}
+};
diff --git a/src/operator.h b/src/operator.h
new file mode 100644
index 0000000..b4830eb
--- /dev/null
+++ b/src/operator.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <tuple>
+
+#include "concepts.h"
+
+namespace concepts {
+
+template <typename F, typename T, typename... ARGS>
+concept Operator = Arithmetic<T>
+                && requires(F op, T* f_curr, T* f_next, ARGS... args) {
+	F::apply(f_curr, f_next, args...);
+};
+
+template <typename F, typename T>
+concept Functor = Arithmetic<T>
+               && requires(F f, T* f_curr, T output[F::size]) {
+	F::observe(f_curr, output);
+};
+
+}
+
+template <typename F, typename M, typename... ARGS>
+struct Operator {
+	M& mask;
+	std::tuple<ARGS...> args;
+
+	Operator(F, M& m, ARGS&&... args):
+		mask(m),
+		args(std::forward<ARGS&&>(args)...) { }
+
+	template <typename U>
+	requires concepts::Operator<F,U,ARGS...>
+	void operator()(U f_curr[], U f_next[]) {
+		std::apply([](auto... args) { F::apply(args...); },
+		           std::tuple_cat(std::make_tuple(f_curr, f_next), args));
+	}
+};
+
+template <typename F, typename M, typename... ARGS>
+Operator(F, M&, ARGS&&... args) -> Operator<F,M,ARGS...>;
diff --git a/src/pattern/all.h b/src/pattern/all.h
new file mode 100644
index 0000000..6496e02
--- /dev/null
+++ b/src/pattern/all.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "pattern/ps.h"
+#include "pattern/sss.h"
+#include "pattern/none.h"
diff --git a/src/pattern/none.h b/src/pattern/none.h
new file mode 100644
index 0000000..8abc419
--- /dev/null
+++ b/src/pattern/none.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include "population.h"
+#include "propagation.h"
+
+#include <unistd.h>
+
+namespace pattern {
+
+template <concepts::Arithmetic T>
+class NONE {
+private:
+	Cuboid _cuboid;
+
+	T* _base;
+	T* _f[population::q];
+
+public:
+	using value_t = T;
+
+	NONE(Cuboid cuboid):
+		_cuboid(cuboid)
+	{
+		const std::size_t pagesize = sysconf(_SC_PAGESIZE);
+		const std::size_t padding = _cuboid.index(1,1,1);
+		_base = static_cast<T*>(
+		  std::aligned_alloc(pagesize, population::q * (_cuboid.volume() + 2*padding) * sizeof(T)));
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+			_f[iPop] = _base + iPop * (_cuboid.volume() + 2*padding) + padding;
+		}
+	}
+
+	~NONE() {
+		delete [] _base;
+	}
+
+	T* get(unsigned iPop, stage::pre_collision) {
+		return _f[iPop];
+	}
+
+	T* get(unsigned iPop, stage::post_collision) {
+		return _f[iPop];
+	}
+
+	void stream() { }
+};
+
+}
diff --git a/src/pattern/ps.h b/src/pattern/ps.h
new file mode 100644
index 0000000..667b09f
--- /dev/null
+++ b/src/pattern/ps.h
@@ -0,0 +1,123 @@
+#pragma once
+
+#include "population.h"
+#include "propagation.h"
+
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+const int PROT_RW = PROT_READ | PROT_WRITE;
+
+constexpr std::uint32_t log2(std::uint32_t x) {
+	return 31 - __builtin_clz(x);
+}
+
+namespace pattern {
+
+template <concepts::Arithmetic T>
+class PS {
+private:
+	Cuboid _cuboid;
+	std::ptrdiff_t _volume;
+
+	std::string _shm_path;
+	int _shm_name;
+	int _shm_file;
+
+	std::uint8_t* _base_buffer;
+	std::uint8_t* _buffer[population::q];
+
+	T* _base[population::q];
+	T* _f[population::q];
+
+public:
+	using value_t = T;
+
+	PS(Cuboid cuboid):
+		_cuboid(cuboid)
+	{
+		const std::size_t page_size = sysconf(_SC_PAGESIZE);
+		const std::size_t line_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+		const std::size_t size = ((_cuboid.volume() * sizeof(T) - 1) / page_size + 1) * page_size;
+		_volume = size / sizeof(T);
+
+		if (size % page_size != 0) {
+			throw std::invalid_argument("Array size must be multiple of PAGE_SIZE");
+		}
+
+		_shm_path = "/lbm_XXXXXX";
+		_shm_name = mkstemp(const_cast<char*>(_shm_path.data()));
+		if (_shm_name != -1) {
+			throw std::runtime_error("Could not generate unique shared memory object name");
+		}
+		// Open shared memory object as physical lattice memory
+		_shm_file = shm_open(_shm_path.c_str(), O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+		if (_shm_file == -1) {
+			throw std::runtime_error("Failed to create shared memory object");
+		}
+		// Resize to fit lattice populations
+		if (ftruncate(_shm_file, population::q * size) == -1) {
+			throw std::runtime_error("Failed to resize shared memory object");
+		}
+
+		// Allocate virtual address space for q times two consecutive lattices
+		_base_buffer = static_cast<std::uint8_t*>(
+		  mmap(NULL, population::q * 2 * size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+			_buffer[iPop] = _base_buffer + iPop * 2 * size;
+			// Map single physical lattice into virtual address space
+			mmap(_buffer[iPop]       , size, PROT_RW, MAP_SHARED | MAP_FIXED, _shm_file, iPop * size);
+			mmap(_buffer[iPop] + size, size, PROT_RW, MAP_SHARED | MAP_FIXED, _shm_file, iPop * size);
+
+			// Store base pointer for reference
+			_base[iPop] = reinterpret_cast<T*>(_buffer[iPop]);
+			// Initialize shiftable f pointer to be used for lattice access
+			_f[iPop] = _base[iPop];
+		}
+
+		// Pre-shift to reduce cache and TLB conflict misses
+		// due to alignment of start address and page-boundaries every `page_size/sizeof(T)`-steps
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+			std::ptrdiff_t shift = iPop * (1 << log2(line_size))
+			                     + iPop * (1 << log2(page_size));
+			shift /= sizeof(T);
+			if (shift < _volume-1) {
+				_f[iPop] += shift;
+			}
+		}
+	}
+
+	~PS() {
+		munmap(_base_buffer, population::q * 2 * _volume * sizeof(T));
+		shm_unlink(_shm_path.c_str());
+		close(_shm_name);
+		unlink(_shm_path.c_str());
+	}
+
+	T* get(unsigned iPop, stage::pre_collision) {
+		return _f[iPop];
+	}
+
+	T* get(unsigned iPop, stage::post_collision) {
+		return _f[iPop];
+	}
+
+	void stream() {
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+		  _f[iPop] -= population::offset(_cuboid, iPop);
+		}
+
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+			if (_f[iPop] - _base[iPop] >= _volume) {
+				_f[iPop] -= _volume;
+			} else if (_f[iPop] < _base[iPop]) {
+				_f[iPop] += _volume;
+			}
+		}
+	}
+
+};
+
+}
diff --git a/src/pattern/sss.h b/src/pattern/sss.h
new file mode 100644
index 0000000..1f1985d
--- /dev/null
+++ b/src/pattern/sss.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include "population.h"
+#include "propagation.h"
+
+#include "cuboid.h"
+#include "concepts.h"
+#include "population.h"
+
+#include <unistd.h>
+
+namespace pattern {
+
+template <concepts::Arithmetic T>
+class SSS {
+private:
+	Cuboid _cuboid;
+
+	T* _base;
+	T* _buffer[population::q];
+	T* _f[population::q];
+
+public:
+	using value_t = T;
+
+	SSS(Cuboid cuboid):
+		_cuboid(cuboid)
+	{
+		const std::size_t pagesize = sysconf(_SC_PAGESIZE);
+		const std::ptrdiff_t padding = population::max_offset(_cuboid);
+		_base = static_cast<T*>(
+		  std::aligned_alloc(pagesize, population::q * (_cuboid.volume() + 2*padding) * sizeof(T)));
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+			_buffer[iPop] = _base + iPop * (_cuboid.volume() + 2*padding);
+			_f[iPop] = _buffer[iPop] + padding;
+		}
+	}
+
+	~SSS() {
+		delete [] _base;
+	}
+
+	T* get(unsigned iPop, stage::pre_collision) {
+		return _f[iPop];
+	}
+
+	T* get(unsigned iPop, stage::post_collision) {
+		return _f[population::opposite(iPop)];
+	}
+
+	void stream() {
+		T* f_old[population::q];
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+		  f_old[iPop] = _f[iPop];
+		}
+		for (unsigned iPop=0; iPop < population::q; ++iPop) {
+		  _f[iPop] = f_old[population::opposite(iPop)] - population::offset(_cuboid, iPop);
+		}
+	}
+
+};
+
+}
diff --git a/src/population.h b/src/population.h
new file mode 100644
index 0000000..14c9b77
--- /dev/null
+++ b/src/population.h
@@ -0,0 +1,41 @@
+#pragma once
+
+namespace population {
+
+constexpr unsigned d = 3;
+constexpr unsigned q = 19;
+
+constexpr int velocity[q][d] = {
+	{ 0, 1, 1}, {-1, 0, 1}, { 0, 0, 1}, { 1, 0, 1}, { 0,-1, 1},
+	{-1, 1, 0}, { 0, 1, 0}, { 1, 1, 0}, {-1, 0, 0}, { 0, 0, 0}, { 1, 0, 0}, {-1,-1, 0}, { 0,-1, 0}, { 1,-1, 0},
+	{ 0, 1,-1}, {-1, 0,-1}, { 0, 0,-1}, { 1, 0,-1}, { 0,-1,-1}
+};
+
+unsigned opposite(unsigned iPop) {
+	return q - 1 - iPop;
+}
+
+std::ptrdiff_t offset(const Cuboid& c, int iX, int iY, int iZ) {
+	return iX*c[1]*c[2] + iY*c[2] + iZ;
+}
+
+std::ptrdiff_t offset(const Cuboid& c, unsigned iPop) {
+	return offset(c, velocity[iPop][0], velocity[iPop][1], velocity[iPop][2]);
+}
+
+std::ptrdiff_t max_offset(const Cuboid& c) {
+	std::ptrdiff_t padding = 0;
+	for (unsigned iPop=0; iPop < q; ++iPop) {
+		padding = std::max(padding, std::abs(offset(c, iPop)));
+	}
+	return padding;
+}
+
+}
+
+namespace stage {
+
+struct pre_collision { };
+struct post_collision { };
+
+}
diff --git a/src/propagation.h b/src/propagation.h
new file mode 100644
index 0000000..0bcb435
--- /dev/null
+++ b/src/propagation.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "cuboid.h"
+#include "concepts.h"
+
+namespace concepts {
+
+template <typename BUFFER>
+concept CuboidConstructible = std::constructible_from<BUFFER, Cuboid>;
+
+template <typename BUFFER>
+concept PopulationAccess = Arithmetic<typename BUFFER::value_t>
+	                      && requires(BUFFER& b) {
+	{ b.get(unsigned{}, stage::pre_collision())  } -> std::same_as<typename BUFFER::value_t*>;
+	{ b.get(unsigned{}, stage::post_collision()) } -> std::same_as<typename BUFFER::value_t*>;
+};
+
+template <typename BUFFER>
+concept Propagatable = requires(BUFFER& b) {
+	b.stream();
+};
+
+template <typename BUFFER>
+concept PropagatablePopulationBuffer = CuboidConstructible<BUFFER>
+	                                  && PopulationAccess<BUFFER>
+                                    && Propagatable<BUFFER>;
+
+}
diff --git a/src/simd/256.h b/src/simd/256.h
new file mode 100644
index 0000000..a3f419d
--- /dev/null
+++ b/src/simd/256.h
@@ -0,0 +1,379 @@
+#pragma once
+
+#include <immintrin.h>
+
+#include <cstdint>
+#include <type_traits>
+
+namespace simd {
+
+template <std::floating_point T>
+class Mask;
+
+template <>
+class Mask<double> {
+private:
+	__m256i _reg;
+
+public:
+	using storage_t = std::uint64_t;
+	static constexpr unsigned storage_size = 1;
+
+	static constexpr storage_t true_v	 = 1l << 63;
+	static constexpr storage_t false_v = 0l;
+
+	static storage_t encode(bool value) {
+		return value ? true_v : false_v;
+	}
+
+	static storage_t encode(bool* value) {
+		return encode(*value);
+	}
+
+	Mask(bool a, bool b, bool c, bool d):
+		_reg(_mm256_set_epi64x(encode(d),encode(c),encode(b),encode(a))) { }
+
+	Mask(std::uint64_t a, std::uint64_t b, std::uint64_t c, std::uint64_t d):
+		_reg(_mm256_set_epi64x(d,c,b,a)) { }
+
+	Mask(std::uint64_t* ptr):
+		_reg(_mm256_loadu_si256(reinterpret_cast<__m256i*>(ptr))) { }
+
+	Mask(storage_t* ptr, std::size_t iCell):
+		Mask<double>(ptr + iCell) { }
+
+	Mask(__m256i reg):
+		_reg(reg) { }
+
+	operator __m256i() {
+		return _reg;
+	}
+
+	__m256i neg() const {
+		return _mm256_sub_epi64(_mm256_set1_epi64x(true_v), _reg);
+	}
+
+	operator bool() const {
+		const std::uint64_t* values = reinterpret_cast<const std::uint64_t*>(&_reg);
+		return values[0] == true_v
+		    || values[1] == true_v
+		    || values[2] == true_v
+		    || values[3] == true_v;
+	}
+};
+
+template <>
+class Mask<float> {
+private:
+	__m256i _reg;
+
+public:
+	using storage_t = std::uint32_t;
+	static constexpr unsigned storage_size = 1;
+
+	static constexpr storage_t true_v	 = 1 << 31;
+	static constexpr storage_t false_v = 0;
+
+	static storage_t encode(bool value) {
+		return value ? true_v : false_v;
+	}
+
+	static storage_t encode(bool* value) {
+		return encode(*value);
+	}
+
+	Mask(storage_t* ptr):
+		_reg(_mm256_loadu_si256(reinterpret_cast<__m256i*>(ptr))) { }
+
+	Mask(storage_t* ptr, std::size_t iCell):
+		Mask<float>(ptr + iCell) { }
+
+	Mask(__m256i reg):
+		_reg(reg) { }
+
+	operator __m256i() {
+		return _reg;
+	}
+
+	__m256i neg() const {
+		return _mm256_sub_epi32(_mm256_set1_epi32(true_v), _reg);
+	}
+
+	operator bool() const {
+		const std::uint32_t* values = reinterpret_cast<const std::uint32_t*>(&_reg);
+		return values[0] == true_v
+		    || values[1] == true_v
+		    || values[2] == true_v
+		    || values[3] == true_v
+		    || values[4] == true_v
+		    || values[5] == true_v
+		    || values[6] == true_v
+		    || values[7] == true_v;
+	}
+};
+
+
+template <std::floating_point T>
+class Pack;
+
+template <>
+class Pack<double> {
+private:
+	__m256d _reg;
+
+public:
+	using mask_t = Mask<double>;
+	using index_t = std::uint32_t;
+
+	static constexpr std::size_t size = 4;
+
+	Pack() = default;
+
+	Pack(__m256d reg):
+		_reg(reg) { }
+
+	Pack(double val):
+		Pack(_mm256_set1_pd(val)) { }
+
+	Pack(double a, double b, double c, double d):
+		Pack(_mm256_set_pd(d,c,b,a)) { }
+
+	Pack(double* ptr):
+		Pack(_mm256_loadu_pd(ptr)) {
+		_mm_prefetch(ptr + size, _MM_HINT_T2);
+	}
+
+	Pack(double* ptr, index_t* idx):
+		Pack(_mm256_i32gather_pd(ptr, _mm_loadu_si128(reinterpret_cast<__m128i*>(idx)), sizeof(double))) { }
+
+	operator __m256d() {
+		return _reg;
+	}
+
+	Pack operator+(Pack rhs) {
+		return Pack(_mm256_add_pd(_reg, rhs));
+	}
+
+	Pack& operator+=(Pack rhs) {
+		_reg = _mm256_add_pd(_reg, rhs);
+		return *this;
+	}
+
+	Pack operator-(Pack rhs) {
+		return Pack(_mm256_sub_pd(_reg, rhs));
+	}
+
+	Pack& operator-=(Pack rhs) {
+		_reg = _mm256_sub_pd(_reg, rhs);
+		return *this;
+	}
+
+	Pack operator*(Pack rhs) {
+		return Pack(_mm256_mul_pd(_reg, rhs));
+	}
+
+	Pack& operator*=(Pack rhs) {
+		_reg = _mm256_mul_pd(_reg, rhs);
+		return *this;
+	}
+
+	Pack operator/(Pack rhs) {
+		return Pack(_mm256_div_pd(_reg, rhs));
+	}
+
+	Pack& operator/=(Pack rhs) {
+		_reg = _mm256_div_pd(_reg, rhs);
+		return *this;
+	}
+
+	Pack operator-() {
+		return *this * Pack(-1.);
+	}
+
+	__m256d sqrt() {
+		return _mm256_sqrt_pd(_reg);
+	}
+};
+
+template <>
+class Pack<float> {
+private:
+	__m256 _reg;
+
+public:
+	using mask_t = Mask<float>;
+	using index_t = std::uint32_t;
+
+	static constexpr std::size_t size = 8;
+
+	Pack() = default;
+
+	Pack(__m256 reg):
+		_reg(reg) { }
+
+	Pack(float val):
+		Pack(_mm256_set1_ps(val)) { }
+
+	Pack(float a, float b, float c, float d, float e, float f, float g, float h):
+		Pack(_mm256_set_ps(h,g,f,e,d,c,b,a)) { }
+
+	Pack(float* ptr):
+		Pack(_mm256_loadu_ps(ptr)) {
+		_mm_prefetch(ptr + size, _MM_HINT_T2);
+	}
+
+	Pack(float* ptr, index_t* idx):
+		Pack(_mm256_i32gather_ps(ptr, _mm256_loadu_si256(reinterpret_cast<__m256i*>(idx)), sizeof(float))) { }
+
+	operator __m256() {
+		return _reg;
+	}
+
+	Pack operator+(Pack rhs) {
+		return Pack(_mm256_add_ps(_reg, rhs));
+	}
+
+	Pack& operator+=(Pack rhs) {
+		_reg = _mm256_add_ps(_reg, rhs);
+		return *this;
+	}
+
+	Pack operator-(Pack rhs) {
+		return Pack(_mm256_sub_ps(_reg, rhs));
+	}
+
+	Pack& operator-=(Pack rhs) {
+		_reg = _mm256_sub_ps(_reg, rhs);
+		return *this;
+	}
+
+	Pack operator*(Pack rhs) {
+		return Pack(_mm256_mul_ps(_reg, rhs));
+	}
+
+	Pack& operator*=(Pack rhs) {
+		_reg = _mm256_mul_ps(_reg, rhs);
+		return *this;
+	}
+
+	Pack operator/(Pack rhs) {
+		return Pack(_mm256_div_ps(_reg, rhs));
+	}
+
+	Pack& operator/=(Pack rhs) {
+		_reg = _mm256_div_ps(_reg, rhs);
+		return *this;
+	}
+
+	Pack operator-() {
+		return *this * Pack(-1.);
+	}
+
+	__m256 sqrt() {
+		return _mm256_sqrt_ps(_reg);
+	}
+};
+
+
+template <typename T>
+Pack<T> operator+(T lhs, Pack<T> rhs) {
+	return Pack<T>(lhs) + rhs;
+}
+
+template <typename T>
+Pack<T> operator+(Pack<T> lhs, T rhs) {
+	return lhs + Pack<T>(rhs);
+}
+
+template <typename T>
+Pack<T> operator-(T lhs, Pack<T>