28 files changed, 2425 insertions, 0 deletions
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..0b225af
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,2 @@
+[*]
+indent_style = tab
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..c5b383b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,72 @@
+cmake_minimum_required(VERSION 3.10)
+project(SweepLB LANGUAGES CXX)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+set(SIMD_MODE "AVX256" CACHE STRING "Generate either AVX256 or AVX512 instructions")
+option(USE_SIMD_CELL_LIST "Generate gather / scatter instructions for cell list processing" OFF)
+
+if(NOT SIMD_MODE)
+	set(SIMD_MODE "AVX256")
+endif()
+
+set(CXX_FLAGS "-O3 -march=native -mtune=native")
+
+if(${SIMD_MODE} MATCHES "AVX256")
+	message("Selecting AVX256")
+	set(SIMD_MODE_FLAGS "-mavx2")
+elseif(${SIMD_MODE} MATCHES "AVX512")
+	message("Selecting AVX512")
+	add_compile_definitions(ENABLE_AVX512)
+	set(SIMD_MODE_FLAGS "-mavx512f -mavx512dq")
+else()
+  message(FATAL_ERROR "Invalid SIMD mode, choose either AVX256 or AVX512")
+endif()
+
+if(USE_SIMD_CELL_LIST)
+  message("Enabling SIMD cell list processing.")
+  add_compile_definitions(SIMD_CELL_LIST)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CXX_FLAGS} ${SIMD_MODE_FLAGS}")
+
+message("Using flags \"${CMAKE_CXX_FLAGS}\"")
+
+include_directories(
+  src/
+)
+
+find_package(OpenMP)
+
+set(EXAMPLES ldc box channel)
+set(PRECISIONS float double)
+set(PATTERNS sss ps)
+
+foreach(example IN LISTS EXAMPLES)
+	foreach(precision IN LISTS PRECISIONS)
+		foreach(pattern IN LISTS PATTERNS)
+			set(generated_name ${example}_${precision}_${pattern})
+			string(TOUPPER ${pattern} pattern)
+			add_executable(${generated_name} ${example}.cpp)
+			target_compile_definitions(${generated_name}
+				PRIVATE
+				SWEEPLB_PRECISION=${precision}
+				SWEEPLB_PATTERN=${pattern})
+			target_compile_features(${generated_name}
+				PRIVATE cxx_std_20)
+			target_link_libraries(
+				${generated_name}
+				PUBLIC
+				  rt)
+			if(OpenMP_CXX_FOUND)
+				target_link_libraries(
+					${generated_name}
+					PUBLIC
+					  OpenMP::OpenMP_CXX
+				)
+			endif()
+	  endforeach()
+  endforeach()
+endforeach()
diff --git a/box.cpp b/box.cpp
new file mode 100644
index 0000000..ecb3296
--- /dev/null
+++ b/box.cpp
@@ -0,0 +1,73 @@
+#include "lattice.h"
+
+#include "LLBM/collide.h"
+#include "LLBM/initialize.h"
+#include "LLBM/bounce_back.h"
+
+#include <chrono>
+#include <iostream>
+
+#include "pattern/all.h"
+
+using T = SWEEPLB_PRECISION;
+using PATTERN = pattern::SWEEPLB_PATTERN<T>;
+
+void simulate(Cuboid cuboid, std::size_t nStep) {
+	const int nThread = omp_get_max_threads();
+
+	Lattice<PATTERN> lattice(cuboid);
+
+	std::vector<simd::Pack<T>::index_t> discontinuity;
+
+	LatticeMask<T> bulk_mask(lattice.volume());
+	LatticeMask<T> wall_mask(lattice.volume());
+
+	cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) {
+		if (std::abs(iX - cuboid[0]/2) < cuboid[0]/16 &&
+		    std::abs(iY - cuboid[1]/2) < cuboid[1]/16 &&
+		    std::abs(iZ - cuboid[2]/2) < cuboid[2]/16) {
+			discontinuity.emplace_back(iCell);
+		}
+		if (   iX > 0 && iX < cuboid[0]-1
+		    && iY > 0 && iY < cuboid[1]-1
+		    && iZ > 0 && iZ < cuboid[2]-1) {
+			bulk_mask.set(iCell, true);
+		} else {
+			wall_mask.set(iCell, true);
+		}
+	});
+
+	bulk_mask.serialize();
+	wall_mask.serialize();
+
+	lattice.apply<InitializeO>(discontinuity, 2.0);
+
+	T tau = 0.56;
+
+	auto start = std::chrono::steady_clock::now();
+
+	for (std::size_t iStep = 0; iStep < nStep; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), wall_mask));
+		lattice.stream();
+	}
+
+	auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(
+		std::chrono::steady_clock::now() - start);
+
+	std::cout << cuboid[0]
+	          << ", " << nStep
+	          << ", " << nThread
+	          << ", " << (nStep * lattice.volume()) / (1e6 * duration.count())
+	          << std::endl;
+
+	lattice.write_momenta(bulk_mask, "result.vtk");
+}
+
+int main(int argc, char* argv[]) {
+	const std::size_t nX     = atoi(argv[1]);
+	const std::size_t nY     = atoi(argv[2]);
+	const std::size_t nZ     = atoi(argv[3]);
+	const std::size_t steps = atoi(argv[4]);
+	simulate({ nX, nY, nZ }, steps);
+}
diff --git a/channel.cpp b/channel.cpp
new file mode 100644
index 0000000..fddb9a1
--- /dev/null
+++ b/channel.cpp
@@ -0,0 +1,87 @@
+#include "lattice.h"
+
+#include "LLBM/collide.h"
+#include "LLBM/initialize.h"
+#include "LLBM/bounce_back.h"
+#include "LLBM/bounce_back_moving_wall.h"
+#include "LLBM/equilibrium_density_wall.h"
+#include "LLBM/equilibrium_velocity_wall.h"
+
+#include <chrono>
+#include <iostream>
+
+#include "pattern/all.h"
+
+using T = SWEEPLB_PRECISION;
+using PATTERN = pattern::SWEEPLB_PATTERN<T>;
+
+void simulate(Cuboid cuboid, std::size_t nStep) {
+	const int nThread = omp_get_max_threads();
+
+	Lattice<PATTERN> lattice(cuboid);
+
+	LatticeMask<T> bulk_mask(cuboid.volume());
+	LatticeMask<T> wall_mask(cuboid.volume());
+	LatticeMask<T> inflow_mask(cuboid.volume());
+	LatticeMask<T> outflow_mask(cuboid.volume());
+
+	cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) {
+		if (   iY == 0 || iY == cuboid[1]-1
+		    || iZ == 0 || iZ == cuboid[2]-1) {
+			wall_mask.set(iCell, true);
+		} else if (iX == 0) {
+			inflow_mask.set(iCell, true);
+		} else if (iX == cuboid[0]-1) {
+			outflow_mask.set(iCell, true);
+		} else {
+			bulk_mask.set(iCell, true);
+		}
+	});
+
+	bulk_mask.serialize();
+	wall_mask.serialize();
+	inflow_mask.serialize();
+	outflow_mask.serialize();
+
+	T tau = 0.56;
+
+	T u_inflow  = 0.05;
+	T d_outflow = 1.;
+
+	for (std::size_t iStep = 0; iStep < 100; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), wall_mask),
+		              Operator(EquilibriumVelocityWallO(), inflow_mask, u_inflow, WallNormal<1,0,0>()),
+		              Operator(EquilibriumDensityWallO(), outflow_mask, d_outflow, WallNormal<-1,0,0>()));
+		lattice.stream();
+	}
+
+	auto start = std::chrono::steady_clock::now();
+
+	for (std::size_t iStep = 0; iStep < nStep; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), wall_mask),
+		              Operator(EquilibriumVelocityWallO(), inflow_mask, u_inflow, WallNormal<1,0,0>()),
+		              Operator(EquilibriumDensityWallO(), outflow_mask, d_outflow, WallNormal<-1,0,0>()));
+		lattice.stream();
+	}
+
+	auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(
+	  std::chrono::steady_clock::now() - start);
+
+	std::cout << cuboid[0] << ", " << cuboid[1] << ", " << cuboid[2]
+	          << ", " << nStep
+	          << ", " << nThread
+	          << ", " << (nStep * lattice.volume()) / (1e6 * duration.count())
+	          << std::endl;
+
+	lattice.write_momenta(bulk_mask, "result.vtk");
+}
+
+int main(int argc, char* argv[]) {
+	const std::size_t nX    = atoi(argv[1]);
+	const std::size_t nY    = atoi(argv[2]);
+	const std::size_t nZ    = atoi(argv[3]);
+	const std::size_t steps = atoi(argv[4]);
+	simulate({ nX, nY, nZ }, steps);
+}
diff --git a/ldc.cpp b/ldc.cpp
new file mode 100644
index 0000000..0917939
--- /dev/null
+++ b/ldc.cpp
@@ -0,0 +1,76 @@
+#include "lattice.h"
+
+#include "LLBM/collide.h"
+#include "LLBM/initialize.h"
+#include "LLBM/bounce_back.h"
+#include "LLBM/bounce_back_moving_wall.h"
+
+#include <chrono>
+#include <iostream>
+
+#include "pattern/all.h"
+
+using T = SWEEPLB_PRECISION;
+using PATTERN = pattern::SWEEPLB_PATTERN<T>;
+
+void simulate(Cuboid cuboid, std::size_t nStep) {
+	const int nThread = omp_get_max_threads();
+
+	Lattice<PATTERN> lattice(cuboid);
+
+	LatticeMask<T> bulk_mask(cuboid.volume());
+	LatticeMask<T> box_mask(cuboid.volume());
+	LatticeMask<T> lid_mask(cuboid.volume());
+
+	cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) {
+		if (iZ == cuboid[2]-1) {
+			lid_mask.set(iCell, true);
+		} else if (iX == 0 || iX == cuboid[0]-1
+			      || iY == 0 || iY == cuboid[1]-1
+			      || iZ == 0) {
+			box_mask.set(iCell, true);
+		} else {
+			bulk_mask.set(iCell, true);
+		}
+	});
+
+	bulk_mask.serialize();
+	box_mask.serialize();
+	lid_mask.serialize();
+
+	T tau = 0.51;
+	T u_lid[] { 0.05, 0., 0. };
+
+	for (std::size_t iStep = 0; iStep < 100; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), box_mask),
+		              Operator(BounceBackMovingWallO(), lid_mask, u_lid));
+		lattice.stream();
+	}
+
+	auto start = std::chrono::steady_clock::now();
+
+	for (std::size_t iStep = 0; iStep < nStep; ++iStep) {
+		lattice.apply(Operator(BgkCollideO(), bulk_mask, tau),
+		              Operator(BounceBackO(), box_mask),
+		              Operator(BounceBackMovingWallO(), lid_mask, u_lid));
+		lattice.stream();
+	}
+
+	auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(
+		std::chrono::steady_clock::now() - start);
+
+	std::cout << cuboid[0]
+	          << ", " << nStep
+	          << ", " << nThread
+	          << ", " << (nStep * lattice.volume()) / (1e6 * duration.count())
+	          << std::endl;
+
+	lattice.write_momenta(bulk_mask, "result.vtk");
+}
+
+int main(int argc, char* argv[]) {
+	const std::size_t n     = atoi(argv[1]);
+	const std::size_t steps = atoi(argv[2]);
+	simulate({ n, n, n}, steps);
+}
diff --git a/shell.nix b/shell.nix
new file mode 100644
index 0000000..d156272
--- /dev/null
+++ b/shell.nix
@@ -0,0 +1,15 @@
+{ pkgs ? import <nixpkgs> { }, ... }:
+
+pkgs.stdenvNoCC.mkDerivation rec {
+  name = "sweeplb-env";
+  env = pkgs.buildEnv { name = name; paths = buildInputs; };
+
+  buildInputs = with pkgs; [
+    gcc10
+    cmake
+  ];
+
+  shellHook = ''
+    export NIX_SHELL_NAME="${name}"
+  '';
+}
diff --git a/src/LLBM/bounce_back.h b/src/LLBM/bounce_back.h
new file mode 100644
index 0000000..b710ee4
--- /dev/null
+++ b/src/LLBM/bounce_back.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "concepts.h"
+
+struct BounceBackO {
+
+template <concepts::Arithmetic V>
+static void apply(V f_curr[19], V f_next[19]) {
+	f_next[0] = f_curr[18];
+	f_next[1] = f_curr[17];
+	f_next[2] = f_curr[16];
+	f_next[3] = f_curr[15];
+	f_next[4] = f_curr[14];
+	f_next[5] = f_curr[13];
+	f_next[6] = f_curr[12];
+	f_next[7] = f_curr[11];
+	f_next[8] = f_curr[10];
+	f_next[9] = f_curr[9];
+	f_next[10] = f_curr[8];
+	f_next[11] = f_curr[7];
+	f_next[12] = f_curr[6];
+	f_next[13] = f_curr[5];
+	f_next[14] = f_curr[4];
+	f_next[15] = f_curr[3];
+	f_next[16] = f_curr[2];
+	f_next[17] = f_curr[1];
+	f_next[18] = f_curr[0];
+}
+
+};
diff --git a/src/LLBM/bounce_back_moving_wall.h b/src/LLBM/bounce_back_moving_wall.h
new file mode 100644
index 0000000..c00a17e
--- /dev/null
+++ b/src/LLBM/bounce_back_moving_wall.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "concepts.h"
+
+struct BounceBackMovingWallO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T u[3]) {
+	f_next[0] = f_curr[18] + T{0.166666666666667}*u[1] + T{0.166666666666667}*u[2];
+	f_next[1] = f_curr[17] - T{0.166666666666667}*u[0] + T{0.166666666666667}*u[2];
+	f_next[2] = f_curr[16] + T{0.333333333333333}*u[2];
+	f_next[3] = f_curr[15] + T{0.166666666666667}*u[0] + T{0.166666666666667}*u[2];
+	f_next[4] = f_curr[14] - T{0.166666666666667}*u[1] + T{0.166666666666667}*u[2];
+	f_next[5] = f_curr[13] - T{0.166666666666667}*u[0] + T{0.166666666666667}*u[1];
+	f_next[6] = f_curr[12] + T{0.333333333333333}*u[1];
+	f_next[7] = f_curr[11] + T{0.166666666666667}*u[0] + T{0.166666666666667}*u[1];
+	f_next[8] = f_curr[10] - T{0.333333333333333}*u[0];
+	f_next[9] = f_curr[9];
+	f_next[10] = f_curr[8] + T{0.333333333333333}*u[0];
+	f_next[11] = f_curr[7] - T{0.166666666666667}*u[0] - T{0.166666666666667}*u[1];
+	f_next[12] = f_curr[6] - T{0.333333333333333}*u[1];
+	f_next[13] = f_curr[5] + T{0.166666666666667}*u[0] - T{0.166666666666667}*u[1];
+	f_next[14] = f_curr[4] + T{0.166666666666667}*u[1] - T{0.166666666666667}*u[2];
+	f_next[15] = f_curr[3] - T{0.166666666666667}*u[0] - T{0.166666666666667}*u[2];
+	f_next[16] = f_curr[2] - T{0.333333333333333}*u[2];
+	f_next[17] = f_curr[1] + T{0.166666666666667}*u[0] - T{0.166666666666667}*u[2];
+	f_next[18] = f_curr[0] - T{0.166666666666667}*u[1] - T{0.166666666666667}*u[2];
+}
+
+};
diff --git a/src/LLBM/collect_moments.h b/src/LLBM/collect_moments.h
new file mode 100644
index 0000000..afd5d59
--- /dev/null
+++ b/src/LLBM/collect_moments.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "concepts.h"
+
+struct CollectMomentsF {
+
+static constexpr unsigned size = 4;
+
+template <concepts::Arithmetic V>
+static void observe(V f_curr[19], V* momenta) {
+	V m0 = f_curr[10] + f_curr[13] + f_curr[17];
+	V m1 = f_curr[14] + f_curr[5] + f_curr[6];
+	V m2 = f_curr[1] + f_curr[2] + f_curr[4];
+	V m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9];
+	V m4 = -f_curr[11] + f_curr[7];
+	V m5 = -f_curr[15] + f_curr[3];
+	V m6 = V{1}/m3;
+	V m7 = f_curr[0] - f_curr[18];
+	momenta[0] = m3;
+	momenta[1] = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]);
+	momenta[2] = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]);
+	momenta[3] = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]);
+}
+
+};
diff --git a/src/LLBM/collide.h b/src/LLBM/collide.h
new file mode 100644
index 0000000..c08aade
--- /dev/null
+++ b/src/LLBM/collide.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include "concepts.h"
+
+struct BgkCollideO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T tau) {
+	V rho;
+	V u[3];
+	V m0 = f_curr[10] + f_curr[13] + f_curr[17];
+	V m1 = f_curr[14] + f_curr[5] + f_curr[6];
+	V m2 = f_curr[1] + f_curr[2] + f_curr[4];
+	V m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9];
+	V m4 = -f_curr[11] + f_curr[7];
+	V m5 = -f_curr[15] + f_curr[3];
+	V m6 = T{1}/m3;
+	V m7 = f_curr[0] - f_curr[18];
+	rho = m3;
+	u[0] = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]);
+	u[1] = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]);
+	u[2] = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]);
+	V x0 = T{1}/tau;
+	V x1 = T{0.0138888888888889}*x0;
+	V x2 = u[1] + u[2];
+	V x3 = T{9.00000000000000}*(x2*x2);
+	V x4 = T{6.00000000000000}*u[2];
+	V x5 = u[1]*u[1];
+	V x6 = T{3.00000000000000}*x5;
+	V x7 = -x6;
+	V x8 = u[0]*u[0];
+	V x9 = T{3.00000000000000}*x8;
+	V x10 = T{2.00000000000000} - x9;
+	V x11 = x10 + x7;
+	V x12 = x11 + x4;
+	V x13 = u[2]*u[2];
+	V x14 = T{3.00000000000000}*x13;
+	V x15 = -x14;
+	V x16 = T{6.00000000000000}*u[1];
+	V x17 = x15 + x16;
+	V x18 = T{6.00000000000000}*u[0];
+	V x19 = -u[0];
+	V x20 = x19 + u[2];
+	V x21 = x14 + x6 + T{-2.00000000000000};
+	V x22 = x21 + x9;
+	V x23 = x22 - x4;
+	V x24 = T{0.0277777777777778}*x0;
+	V x25 = T{6.00000000000000}*x13;
+	V x26 = u[0] + u[2];
+	V x27 = T{9.00000000000000}*(x26*x26);
+	V x28 = x15 + x18;
+	V x29 = -u[1];
+	V x30 = x29 + u[2];
+	V x31 = x19 + u[1];
+	V x32 = -x16;
+	V x33 = x18 + x22;
+	V x34 = T{6.00000000000000}*x5;
+	V x35 = u[0] + u[1];
+	V x36 = T{9.00000000000000}*(x35*x35);
+	V x37 = T{6.00000000000000}*x8;
+	V x38 = x9 + T{-2.00000000000000};
+	V x39 = x29 + u[0];
+	V x40 = -x18 + x22;
+	V x41 = -u[2];
+	V x42 = x41 + u[1];
+	V x43 = x22 + x4;
+	V x44 = x41 + u[0];
+	f_next[0] = x1*(rho*(x12 + x17 + x3) - T{72.0000000000000}*f_curr[0]) + f_curr[0];
+	f_next[1] = -x1*(rho*(x18 + x23 - T{9.00000000000000}*x20*x20) + T{72.0000000000000}*f_curr[1]) + f_curr[1];
+	f_next[2] = x24*(rho*(x12 + x25) - T{36.0000000000000}*f_curr[2]) + f_curr[2];
+	f_next[3] = x1*(rho*(x12 + x27 + x28) - T{72.0000000000000}*f_curr[3]) + f_curr[3];
+	f_next[4] = -x1*(rho*(x16 + x23 - T{9.00000000000000}*x30*x30) + T{72.0000000000000}*f_curr[4]) + f_curr[4];
+	f_next[5] = -x1*(rho*(x32 + x33 - T{9.00000000000000}*x31*x31) + T{72.0000000000000}*f_curr[5]) + f_curr[5];
+	f_next[6] = x24*(rho*(x10 + x17 + x34) - T{36.0000000000000}*f_curr[6]) + f_curr[6];
+	f_next[7] = x1*(rho*(x11 + x17 + x18 + x36) - T{72.0000000000000}*f_curr[7]) + f_curr[7];
+	f_next[8] = -x24*(rho*(x18 + x21 - x37) + T{36.0000000000000}*f_curr[8]) + f_curr[8];
+	f_next[9] = -T{0.166666666666667}*x0*(rho*x22 + T{6.00000000000000}*f_curr[9]) + f_curr[9];
+	f_next[10] = x24*(rho*(x28 + x37 + x7 + T{2.00000000000000}) - T{36.0000000000000}*f_curr[10]) + f_curr[10];
+	f_next[11] = -x1*(rho*(x16 + x33 - x36) + T{72.0000000000000}*f_curr[11]) + f_curr[11];
+	f_next[12] = -x24*(rho*(x14 + x16 - x34 + x38) + T{36.0000000000000}*f_curr[12]) + f_curr[12];
+	f_next[13] = -x1*(rho*(x16 + x40 - T{9.00000000000000}*x39*x39) + T{72.0000000000000}*f_curr[13]) + f_curr[13];
+	f_next[14] = -x1*(rho*(x32 + x43 - T{9.00000000000000}*x42*x42) + T{72.0000000000000}*f_curr[14]) + f_curr[14];
+	f_next[15] = -x1*(rho*(-x27 + x33 + x4) + T{72.0000000000000}*f_curr[15]) + f_curr[15];
+	f_next[16] = -x24*(rho*(-x25 + x38 + x4 + x6) + T{36.0000000000000}*f_curr[16]) + f_curr[16];
+	f_next[17] = -x1*(rho*(x4 + x40 - T{9.00000000000000}*x44*x44) + T{72.0000000000000}*f_curr[17]) + f_curr[17];
+	f_next[18] = -x1*(rho*(x16 - x3 + x43) + T{72.0000000000000}*f_curr[18]) + f_curr[18];
+}
+
+};
diff --git a/src/LLBM/equilibrium_density_wall.h b/src/LLBM/equilibrium_density_wall.h
new file mode 100644
index 0000000..598d59d
--- /dev/null
+++ b/src/LLBM/equilibrium_density_wall.h
@@ -0,0 +1,144 @@
+#pragma once
+
+#include "wall.h"
+#include "concepts.h"
+
+struct EquilibriumDensityWallO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T rho_w, WallNormal<1,0,0>) {
+	V u_w = (rho_w - f_curr[0] - T{2.00000000000000}*f_curr[11] - f_curr[12] - f_curr[14] - T{2.00000000000000}*f_curr[15] - f_curr[16] - f_curr[18] - T{2.00000000000000}*f_curr[1] - f_curr[2] - f_curr[4] - T{2.00000000000000}*f_curr[5] - f_curr[6] - T{2.00000000000000}*f_curr[8] - f_curr[9])/rho_w;
+	V rho = rho_w;
+	V u[3] { u_w, 0., 0. };
+	V e0 = T{0.0277777777777778}*rho;
+	V e1 = u[1] + u[2];
+	V e2 = T{4.50000000000000}*(e1*e1);
+	V e3 = T{3.00000000000000}*u[2];
+	V e4 = u[1]*u[1];
+	V e5 = T{1.50000000000000}*e4;
+	V e6 = -e5;
+	V e7 = u[0]*u[0];
+	V e8 = T{1.50000000000000}*e7;
+	V e9 = T{1.00000000000000} - e8;
+	V e10 = e6 + e9;
+	V e11 = e10 + e3;
+	V e12 = T{3.00000000000000}*u[1];
+	V e13 = u[2]*u[2];
+	V e14 = T{1.50000000000000}*e13;
+	V e15 = -e14;
+	V e16 = e12 + e15;
+	V e17 = T{3.00000000000000}*u[0];
+	V e18 = -u[2];
+	V e19 = e18 + u[0];
+	V e20 = -T{4.50000000000000}*e19*e19;
+	V e21 = e14 + e5 + T{-1.00000000000000};
+	V e22 = e21 + e8;
+	V e23 = e22 - e3;
+	V e24 = T{0.0555555555555556}*rho;
+	V e25 = T{3.00000000000000}*e13;
+	V e26 = u[0] + u[2];
+	V e27 = T{4.50000000000000}*(e26*e26);
+	V e28 = e15 + e17;
+	V e29 = e18 + u[1];
+	V e30 = -T{4.50000000000000}*e29*e29;
+	V e31 = -e12;
+	V e32 = u[0] - u[1];
+	V e33 = -T{4.50000000000000}*e32*e32;
+	V e34 = e17 + e22;
+	V e35 = T{3.00000000000000}*e4;
+	V e36 = u[0] + u[1];
+	V e37 = T{4.50000000000000}*(e36*e36);
+	V e38 = T{3.00000000000000}*e7;
+	V e39 = e8 + T{-1.00000000000000};
+	V e40 = -e17 + e22;
+	V e41 = e22 + e3;
+	f_next[0] = e0*(e11 + e16 + e2);
+	f_next[1] = -e0*(e17 + e20 + e23);
+	f_next[2] = e24*(e11 + e25);
+	f_next[3] = e0*(e11 + e27 + e28);
+	f_next[4] = -e0*(e12 + e23 + e30);
+	f_next[5] = -e0*(e31 + e33 + e34);
+	f_next[6] = e24*(e16 + e35 + e9);
+	f_next[7] = e0*(e10 + e16 + e17 + e37);
+	f_next[8] = -e24*(e17 + e21 - e38);
+	f_next[9] = -T{0.333333333333333}*e22*rho;
+	f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+	f_next[11] = -e0*(e12 + e34 - e37);
+	f_next[12] = -e24*(e12 + e14 - e35 + e39);
+	f_next[13] = -e0*(e12 + e33 + e40);
+	f_next[14] = -e0*(e30 + e31 + e41);
+	f_next[15] = -e0*(-e27 + e3 + e34);
+	f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+	f_next[17] = -e0*(e20 + e3 + e40);
+	f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T rho_w, WallNormal<-1,0,0>) {
+	V u_w = (-rho_w + f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/rho_w;
+	V rho = rho_w;
+	V u[3] { u_w, 0., 0. };
+	V e0 = T{0.0277777777777778}*rho;
+	V e1 = u[1] + u[2];
+	V e2 = T{4.50000000000000}*(e1*e1);
+	V e3 = T{3.00000000000000}*u[2];
+	V e4 = u[1]*u[1];
+	V e5 = T{1.50000000000000}*e4;
+	V e6 = -e5;
+	V e7 = u[0]*u[0];
+	V e8 = T{1.50000000000000}*e7;
+	V e9 = T{1.00000000000000} - e8;
+	V e10 = e6 + e9;
+	V e11 = e10 + e3;
+	V e12 = T{3.00000000000000}*u[1];
+	V e13 = u[2]*u[2];
+	V e14 = T{1.50000000000000}*e13;
+	V e15 = -e14;
+	V e16 = e12 + e15;
+	V e17 = T{3.00000000000000}*u[0];
+	V e18 = -u[2];
+	V e19 = e18 + u[0];
+	V e20 = -T{4.50000000000000}*e19*e19;
+	V e21 = e14 + e5 + T{-1.00000000000000};
+	V e22 = e21 + e8;
+	V e23 = e22 - e3;
+	V e24 = T{0.0555555555555556}*rho;
+	V e25 = T{3.00000000000000}*e13;
+	V e26 = u[0] + u[2];
+	V e27 = T{4.50000000000000}*(e26*e26);
+	V e28 = e15 + e17;
+	V e29 = e18 + u[1];
+	V e30 = -T{4.50000000000000}*e29*e29;
+	V e31 = -e12;
+	V e32 = u[0] - u[1];
+	V e33 = -T{4.50000000000000}*e32*e32;
+	V e34 = e17 + e22;
+	V e35 = T{3.00000000000000}*e4;
+	V e36 = u[0] + u[1];
+	V e37 = T{4.50000000000000}*(e36*e36);
+	V e38 = T{3.00000000000000}*e7;
+	V e39 = e8 + T{-1.00000000000000};
+	V e40 = -e17 + e22;
+	V e41 = e22 + e3;
+	f_next[0] = e0*(e11 + e16 + e2);
+	f_next[1] = -e0*(e17 + e20 + e23);
+	f_next[2] = e24*(e11 + e25);
+	f_next[3] = e0*(e11 + e27 + e28);
+	f_next[4] = -e0*(e12 + e23 + e30);
+	f_next[5] = -e0*(e31 + e33 + e34);
+	f_next[6] = e24*(e16 + e35 + e9);
+	f_next[7] = e0*(e10 + e16 + e17 + e37);
+	f_next[8] = -e24*(e17 + e21 - e38);
+	f_next[9] = -T{0.333333333333333}*e22*rho;
+	f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+	f_next[11] = -e0*(e12 + e34 - e37);
+	f_next[12] = -e24*(e12 + e14 - e35 + e39);
+	f_next[13] = -e0*(e12 + e33 + e40);
+	f_next[14] = -e0*(e30 + e31 + e41);
+	f_next[15] = -e0*(-e27 + e3 + e34);
+	f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+	f_next[17] = -e0*(e20 + e3 + e40);
+	f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+};
diff --git a/src/LLBM/equilibrium_velocity_wall.h b/src/LLBM/equilibrium_velocity_wall.h
new file mode 100644
index 0000000..2d68e34
--- /dev/null
+++ b/src/LLBM/equilibrium_velocity_wall.h
@@ -0,0 +1,144 @@
+#pragma once
+
+#include "wall.h"
+#include "concepts.h"
+
+struct EquilibriumVelocityWallO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T u_w, WallNormal<1,0,0>) {
+	V rho;
+	rho = -(f_curr[0] + T{2.00000000000000}*f_curr[11] + f_curr[12] + f_curr[14] + T{2.00000000000000}*f_curr[15] + f_curr[16] + f_curr[18] + T{2.00000000000000}*f_curr[1] + f_curr[2] + f_curr[4] + T{2.00000000000000}*f_curr[5] + f_curr[6] + T{2.00000000000000}*f_curr[8] + f_curr[9])/(u_w + T{-1.00000000000000});
+	V u[3] { u_w, 0., 0. };
+	V e0 = T{0.0277777777777778}*rho;
+	V e1 = u[1] + u[2];
+	V e2 = T{4.50000000000000}*(e1*e1);
+	V e3 = T{3.00000000000000}*u[2];
+	V e4 = u[1]*u[1];
+	V e5 = T{1.50000000000000}*e4;
+	V e6 = -e5;
+	V e7 = u[0]*u[0];
+	V e8 = T{1.50000000000000}*e7;
+	V e9 = T{1.00000000000000} - e8;
+	V e10 = e6 + e9;
+	V e11 = e10 + e3;
+	V e12 = T{3.00000000000000}*u[1];
+	V e13 = u[2]*u[2];
+	V e14 = T{1.50000000000000}*e13;
+	V e15 = -e14;
+	V e16 = e12 + e15;
+	V e17 = T{3.00000000000000}*u[0];
+	V e18 = -u[2];
+	V e19 = e18 + u[0];
+	V e20 = -T{4.50000000000000}*e19*e19;
+	V e21 = e14 + e5 + T{-1.00000000000000};
+	V e22 = e21 + e8;
+	V e23 = e22 - e3;
+	V e24 = T{0.0555555555555556}*rho;
+	V e25 = T{3.00000000000000}*e13;
+	V e26 = u[0] + u[2];
+	V e27 = T{4.50000000000000}*(e26*e26);
+	V e28 = e15 + e17;
+	V e29 = e18 + u[1];
+	V e30 = -T{4.50000000000000}*e29*e29;
+	V e31 = -e12;
+	V e32 = u[0] - u[1];
+	V e33 = -T{4.50000000000000}*e32*e32;
+	V e34 = e17 + e22;
+	V e35 = T{3.00000000000000}*e4;
+	V e36 = u[0] + u[1];
+	V e37 = T{4.50000000000000}*(e36*e36);
+	V e38 = T{3.00000000000000}*e7;
+	V e39 = e8 + T{-1.00000000000000};
+	V e40 = -e17 + e22;
+	V e41 = e22 + e3;
+	f_next[0] = e0*(e11 + e16 + e2);
+	f_next[1] = -e0*(e17 + e20 + e23);
+	f_next[2] = e24*(e11 + e25);
+	f_next[3] = e0*(e11 + e27 + e28);
+	f_next[4] = -e0*(e12 + e23 + e30);
+	f_next[5] = -e0*(e31 + e33 + e34);
+	f_next[6] = e24*(e16 + e35 + e9);
+	f_next[7] = e0*(e10 + e16 + e17 + e37);
+	f_next[8] = -e24*(e17 + e21 - e38);
+	f_next[9] = -T{0.333333333333333}*e22*rho;
+	f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+	f_next[11] = -e0*(e12 + e34 - e37);
+	f_next[12] = -e24*(e12 + e14 - e35 + e39);
+	f_next[13] = -e0*(e12 + e33 + e40);
+	f_next[14] = -e0*(e30 + e31 + e41);
+	f_next[15] = -e0*(-e27 + e3 + e34);
+	f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+	f_next[17] = -e0*(e20 + e3 + e40);
+	f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+template <concepts::Arithmetic V, concepts::Arithmetic T>
+static void apply(V f_curr[19], V f_next[19], T u_w, WallNormal<-1,0,0>) {
+	V rho;
+	rho = (f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/(u_w + T{1.00000000000000});
+	V u[3] { u_w, 0., 0. };
+	V e0 = T{0.0277777777777778}*rho;
+	V e1 = u[1] + u[2];
+	V e2 = T{4.50000000000000}*(e1*e1);
+	V e3 = T{3.00000000000000}*u[2];
+	V e4 = u[1]*u[1];
+	V e5 = T{1.50000000000000}*e4;
+	V e6 = -e5;
+	V e7 = u[0]*u[0];
+	V e8 = T{1.50000000000000}*e7;
+	V e9 = T{1.00000000000000} - e8;
+	V e10 = e6 + e9;
+	V e11 = e10 + e3;
+	V e12 = T{3.00000000000000}*u[1];
+	V e13 = u[2]*u[2];
+	V e14 = T{1.50000000000000}*e13;
+	V e15 = -e14;
+	V e16 = e12 + e15;
+	V e17 = T{3.00000000000000}*u[0];
+	V e18 = -u[2];
+	V e19 = e18 + u[0];
+	V e20 = -T{4.50000000000000}*e19*e19;
+	V e21 = e14 + e5 + T{-1.00000000000000};
+	V e22 = e21 + e8;
+	V e23 = e22 - e3;
+	V e24 = T{0.0555555555555556}*rho;
+	V e25 = T{3.00000000000000}*e13;
+	V e26 = u[0] + u[2];
+	V e27 = T{4.50000000000000}*(e26*e26);
+	V e28 = e15 + e17;
+	V e29 = e18 + u[1];
+	V e30 = -T{4.50000000000000}*e29*e29;
+	V e31 = -e12;
+	V e32 = u[0] - u[1];
+	V e33 = -T{4.50000000000000}*e32*e32;
+	V e34 = e17 + e22;
+	V e35 = T{3.00000000000000}*e4;
+	V e36 = u[0] + u[1];
+	V e37 = T{4.50000000000000}*(e36*e36);
+	V e38 = T{3.00000000000000}*e7;
+	V e39 = e8 + T{-1.00000000000000};
+	V e40 = -e17 + e22;
+	V e41 = e22 + e3;
+	f_next[0] = e0*(e11 + e16 + e2);
+	f_next[1] = -e0*(e17 + e20 + e23);
+	f_next[2] = e24*(e11 + e25);
+	f_next[3] = e0*(e11 + e27 + e28);
+	f_next[4] = -e0*(e12 + e23 + e30);
+	f_next[5] = -e0*(e31 + e33 + e34);
+	f_next[6] = e24*(e16 + e35 + e9);
+	f_next[7] = e0*(e10 + e16 + e17 + e37);
+	f_next[8] = -e24*(e17 + e21 - e38);
+	f_next[9] = -T{0.333333333333333}*e22*rho;
+	f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000});
+	f_next[11] = -e0*(e12 + e34 - e37);
+	f_next[12] = -e24*(e12 + e14 - e35 + e39);
+	f_next[13] = -e0*(e12 + e33 + e40);
+	f_next[14] = -e0*(e30 + e31 + e41);
+	f_next[15] = -e0*(-e27 + e3 + e34);
+	f_next[16] = -e24*(-e25 + e3 + e39 + e5);
+	f_next[17] = -e0*(e20 + e3 + e40);
+	f_next[18] = -e0*(e12 - e2 + e41);
+}
+
+};
diff --git a/src/LLBM/initialize.h b/src/LLBM/initialize.h
new file mode 100644
index 0000000..b361217
--- /dev/null
+++ b/src/LLBM/initialize.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "concepts.h"
+
+struct InitializeO {
+
+template <concepts::Arithmetic V, concepts::Arithmetic T = V>
+static void apply(V f_curr[19], V f_next[19]) {
+	f_next[0] = T{0.0277777777777778};
+	f_next[1] = T{0.0277777777777778};
+	f_next[2] = T{0.0555555555555556};
+	f_next[3] = T{0.0277777777777778};
+	f_next[4] = T{0.0277777777777778};
+	f_next[5] = T{0.0277777777777778};
+	f_next[6] = T{0.0555555555555556};
+	f_next[7] = T{0.0277777777777778};
+	f_next[8] = T{0.0555555555555556};
+	f_next[9] = T{0.333333333333333};
+	f_next[10] = T{0.0555555555555556};
+	f_next[11] = T{0.0277777777777778};
+	f_next[12] = T{0.0555555555555556};
+	f_next[13] = T{0.0277777777777778};
+	f_next[14] = T{0.0277777777777778};
+	f_next[15] = T{0.0277777777777778};
+	f_next[16] = T{0.0555555555555556};
+	f_next[17] = T{0.0277777777777778};
+	f_next[18] = T{0.0277777777777778};
+}
+
+template <concepts::Arithmetic V, concepts::Arithmetic T = V>
+static void apply(V f_curr[19], V f_next[19], T scale) {
+	f_next[0]  = T{scale * 0.0277777777777778};
+	f_next[1]  = T{scale * 0.0277777777777778};
+	f_next[2]  = T{scale * 0.0555555555555556};
+	f_next[3]  = T{scale * 0.0277777777777778};
+	f_next[4]  = T{scale * 0.0277777777777778};
+	f_next[5]  = T{scale * 0.0277777777777778};
+	f_next[6]  = T{scale * 0.0555555555555556};
+	f_next[7]  = T{scale * 0.0277777777777778};
+	f_next[8]  = T{scale * 0.0555555555555556};
+	f_next[9]  = T{scale * 0.333333333333333};
+	f_next[10] = T{scale * 0.0555555555555556};
+	f_next[11] = T{scale * 0.0277777777777778};
+	f_next[12] = T{scale * 0.0555555555555556};
+	f_next[13] = T{scale * 0.0277777777777778};
+	f_next[14] = T{scale * 0.0277777777777778};
+	f_next[15] = T{scale * 0.0277777777777778};
+	f_next[16] = T{scale * 0.0555555555555556};