From a92271176a19e06611099c0eccc4e6a6887f4915 Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Mon, 17 May 2021 00:30:13 +0200 Subject: Extract public version of SweepLB --- .editorconfig | 2 + CMakeLists.txt | 72 +++++++ box.cpp | 73 +++++++ channel.cpp | 87 ++++++++ ldc.cpp | 76 +++++++ shell.nix | 15 ++ src/LLBM/bounce_back.h | 30 +++ src/LLBM/bounce_back_moving_wall.h | 30 +++ src/LLBM/collect_moments.h | 25 +++ src/LLBM/collide.h | 89 ++++++++ src/LLBM/equilibrium_density_wall.h | 144 +++++++++++++ src/LLBM/equilibrium_velocity_wall.h | 144 +++++++++++++ src/LLBM/initialize.h | 53 +++++ src/LLBM/smagorinsky_collide.h | 129 ++++++++++++ src/LLBM/wall.h | 4 + src/concepts.h | 22 ++ src/cuboid.h | 37 ++++ src/export.h | 59 ++++++ src/lattice.h | 267 ++++++++++++++++++++++++ src/operator.h | 41 ++++ src/pattern/all.h | 5 + src/pattern/none.h | 48 +++++ src/pattern/ps.h | 123 ++++++++++++ src/pattern/sss.h | 63 ++++++ src/population.h | 41 ++++ src/propagation.h | 28 +++ src/simd/256.h | 379 +++++++++++++++++++++++++++++++++++ src/simd/512.h | 339 +++++++++++++++++++++++++++++++ 28 files changed, 2425 insertions(+) create mode 100644 .editorconfig create mode 100644 CMakeLists.txt create mode 100644 box.cpp create mode 100644 channel.cpp create mode 100644 ldc.cpp create mode 100644 shell.nix create mode 100644 src/LLBM/bounce_back.h create mode 100644 src/LLBM/bounce_back_moving_wall.h create mode 100644 src/LLBM/collect_moments.h create mode 100644 src/LLBM/collide.h create mode 100644 src/LLBM/equilibrium_density_wall.h create mode 100644 src/LLBM/equilibrium_velocity_wall.h create mode 100644 src/LLBM/initialize.h create mode 100644 src/LLBM/smagorinsky_collide.h create mode 100644 src/LLBM/wall.h create mode 100644 src/concepts.h create mode 100644 src/cuboid.h create mode 100644 src/export.h create mode 100644 src/lattice.h create mode 100644 src/operator.h create mode 100644 src/pattern/all.h create mode 100644 src/pattern/none.h create mode 100644 src/pattern/ps.h create mode 100644 src/pattern/sss.h create mode 100644 src/population.h create mode 100644 src/propagation.h create mode 100644 src/simd/256.h create mode 100644 src/simd/512.h diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..0b225af --- /dev/null +++ b/.editorconfig @@ -0,0 +1,2 @@ +[*] +indent_style = tab diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..c5b383b --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,72 @@ +cmake_minimum_required(VERSION 3.10) +project(SweepLB LANGUAGES CXX) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(SIMD_MODE "AVX256" CACHE STRING "Generate either AVX256 or AVX512 instructions") +option(USE_SIMD_CELL_LIST "Generate gather / scatter instructions for cell list processing" OFF) + +if(NOT SIMD_MODE) + set(SIMD_MODE "AVX256") +endif() + +set(CXX_FLAGS "-O3 -march=native -mtune=native") + +if(${SIMD_MODE} MATCHES "AVX256") + message("Selecting AVX256") + set(SIMD_MODE_FLAGS "-mavx2") +elseif(${SIMD_MODE} MATCHES "AVX512") + message("Selecting AVX512") + add_compile_definitions(ENABLE_AVX512) + set(SIMD_MODE_FLAGS "-mavx512f -mavx512dq") +else() + message(FATAL_ERROR "Invalid SIMD mode, choose either AVX256 or AVX512") +endif() + +if(USE_SIMD_CELL_LIST) + message("Enabling SIMD cell list processing.") + add_compile_definitions(SIMD_CELL_LIST) +endif() + +set(CMAKE_CXX_FLAGS "${CXX_FLAGS} ${SIMD_MODE_FLAGS}") + +message("Using flags \"${CMAKE_CXX_FLAGS}\"") + +include_directories( + src/ +) + +find_package(OpenMP) + +set(EXAMPLES ldc box channel) +set(PRECISIONS float double) +set(PATTERNS sss ps) + +foreach(example IN LISTS EXAMPLES) + foreach(precision IN LISTS PRECISIONS) + foreach(pattern IN LISTS PATTERNS) + set(generated_name ${example}_${precision}_${pattern}) + string(TOUPPER ${pattern} pattern) + add_executable(${generated_name} ${example}.cpp) + target_compile_definitions(${generated_name} + PRIVATE + SWEEPLB_PRECISION=${precision} + SWEEPLB_PATTERN=${pattern}) + target_compile_features(${generated_name} + PRIVATE cxx_std_20) + target_link_libraries( + ${generated_name} + PUBLIC + rt) + if(OpenMP_CXX_FOUND) + target_link_libraries( + ${generated_name} + PUBLIC + OpenMP::OpenMP_CXX + ) + endif() + endforeach() + endforeach() +endforeach() diff --git a/box.cpp b/box.cpp new file mode 100644 index 0000000..ecb3296 --- /dev/null +++ b/box.cpp @@ -0,0 +1,73 @@ +#include "lattice.h" + +#include "LLBM/collide.h" +#include "LLBM/initialize.h" +#include "LLBM/bounce_back.h" + +#include +#include + +#include "pattern/all.h" + +using T = SWEEPLB_PRECISION; +using PATTERN = pattern::SWEEPLB_PATTERN; + +void simulate(Cuboid cuboid, std::size_t nStep) { + const int nThread = omp_get_max_threads(); + + Lattice lattice(cuboid); + + std::vector::index_t> discontinuity; + + LatticeMask bulk_mask(lattice.volume()); + LatticeMask wall_mask(lattice.volume()); + + cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) { + if (std::abs(iX - cuboid[0]/2) < cuboid[0]/16 && + std::abs(iY - cuboid[1]/2) < cuboid[1]/16 && + std::abs(iZ - cuboid[2]/2) < cuboid[2]/16) { + discontinuity.emplace_back(iCell); + } + if ( iX > 0 && iX < cuboid[0]-1 + && iY > 0 && iY < cuboid[1]-1 + && iZ > 0 && iZ < cuboid[2]-1) { + bulk_mask.set(iCell, true); + } else { + wall_mask.set(iCell, true); + } + }); + + bulk_mask.serialize(); + wall_mask.serialize(); + + lattice.apply(discontinuity, 2.0); + + T tau = 0.56; + + auto start = std::chrono::steady_clock::now(); + + for (std::size_t iStep = 0; iStep < nStep; ++iStep) { + lattice.apply(Operator(BgkCollideO(), bulk_mask, tau), + Operator(BounceBackO(), wall_mask)); + lattice.stream(); + } + + auto duration = std::chrono::duration_cast>( + std::chrono::steady_clock::now() - start); + + std::cout << cuboid[0] + << ", " << nStep + << ", " << nThread + << ", " << (nStep * lattice.volume()) / (1e6 * duration.count()) + << std::endl; + + lattice.write_momenta(bulk_mask, "result.vtk"); +} + +int main(int argc, char* argv[]) { + const std::size_t nX = atoi(argv[1]); + const std::size_t nY = atoi(argv[2]); + const std::size_t nZ = atoi(argv[3]); + const std::size_t steps = atoi(argv[4]); + simulate({ nX, nY, nZ }, steps); +} diff --git a/channel.cpp b/channel.cpp new file mode 100644 index 0000000..fddb9a1 --- /dev/null +++ b/channel.cpp @@ -0,0 +1,87 @@ +#include "lattice.h" + +#include "LLBM/collide.h" +#include "LLBM/initialize.h" +#include "LLBM/bounce_back.h" +#include "LLBM/bounce_back_moving_wall.h" +#include "LLBM/equilibrium_density_wall.h" +#include "LLBM/equilibrium_velocity_wall.h" + +#include +#include + +#include "pattern/all.h" + +using T = SWEEPLB_PRECISION; +using PATTERN = pattern::SWEEPLB_PATTERN; + +void simulate(Cuboid cuboid, std::size_t nStep) { + const int nThread = omp_get_max_threads(); + + Lattice lattice(cuboid); + + LatticeMask bulk_mask(cuboid.volume()); + LatticeMask wall_mask(cuboid.volume()); + LatticeMask inflow_mask(cuboid.volume()); + LatticeMask outflow_mask(cuboid.volume()); + + cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) { + if ( iY == 0 || iY == cuboid[1]-1 + || iZ == 0 || iZ == cuboid[2]-1) { + wall_mask.set(iCell, true); + } else if (iX == 0) { + inflow_mask.set(iCell, true); + } else if (iX == cuboid[0]-1) { + outflow_mask.set(iCell, true); + } else { + bulk_mask.set(iCell, true); + } + }); + + bulk_mask.serialize(); + wall_mask.serialize(); + inflow_mask.serialize(); + outflow_mask.serialize(); + + T tau = 0.56; + + T u_inflow = 0.05; + T d_outflow = 1.; + + for (std::size_t iStep = 0; iStep < 100; ++iStep) { + lattice.apply(Operator(BgkCollideO(), bulk_mask, tau), + Operator(BounceBackO(), wall_mask), + Operator(EquilibriumVelocityWallO(), inflow_mask, u_inflow, WallNormal<1,0,0>()), + Operator(EquilibriumDensityWallO(), outflow_mask, d_outflow, WallNormal<-1,0,0>())); + lattice.stream(); + } + + auto start = std::chrono::steady_clock::now(); + + for (std::size_t iStep = 0; iStep < nStep; ++iStep) { + lattice.apply(Operator(BgkCollideO(), bulk_mask, tau), + Operator(BounceBackO(), wall_mask), + Operator(EquilibriumVelocityWallO(), inflow_mask, u_inflow, WallNormal<1,0,0>()), + Operator(EquilibriumDensityWallO(), outflow_mask, d_outflow, WallNormal<-1,0,0>())); + lattice.stream(); + } + + auto duration = std::chrono::duration_cast>( + std::chrono::steady_clock::now() - start); + + std::cout << cuboid[0] << ", " << cuboid[1] << ", " << cuboid[2] + << ", " << nStep + << ", " << nThread + << ", " << (nStep * lattice.volume()) / (1e6 * duration.count()) + << std::endl; + + lattice.write_momenta(bulk_mask, "result.vtk"); +} + +int main(int argc, char* argv[]) { + const std::size_t nX = atoi(argv[1]); + const std::size_t nY = atoi(argv[2]); + const std::size_t nZ = atoi(argv[3]); + const std::size_t steps = atoi(argv[4]); + simulate({ nX, nY, nZ }, steps); +} diff --git a/ldc.cpp b/ldc.cpp new file mode 100644 index 0000000..0917939 --- /dev/null +++ b/ldc.cpp @@ -0,0 +1,76 @@ +#include "lattice.h" + +#include "LLBM/collide.h" +#include "LLBM/initialize.h" +#include "LLBM/bounce_back.h" +#include "LLBM/bounce_back_moving_wall.h" + +#include +#include + +#include "pattern/all.h" + +using T = SWEEPLB_PRECISION; +using PATTERN = pattern::SWEEPLB_PATTERN; + +void simulate(Cuboid cuboid, std::size_t nStep) { + const int nThread = omp_get_max_threads(); + + Lattice lattice(cuboid); + + LatticeMask bulk_mask(cuboid.volume()); + LatticeMask box_mask(cuboid.volume()); + LatticeMask lid_mask(cuboid.volume()); + + cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) { + if (iZ == cuboid[2]-1) { + lid_mask.set(iCell, true); + } else if (iX == 0 || iX == cuboid[0]-1 + || iY == 0 || iY == cuboid[1]-1 + || iZ == 0) { + box_mask.set(iCell, true); + } else { + bulk_mask.set(iCell, true); + } + }); + + bulk_mask.serialize(); + box_mask.serialize(); + lid_mask.serialize(); + + T tau = 0.51; + T u_lid[] { 0.05, 0., 0. }; + + for (std::size_t iStep = 0; iStep < 100; ++iStep) { + lattice.apply(Operator(BgkCollideO(), bulk_mask, tau), + Operator(BounceBackO(), box_mask), + Operator(BounceBackMovingWallO(), lid_mask, u_lid)); + lattice.stream(); + } + + auto start = std::chrono::steady_clock::now(); + + for (std::size_t iStep = 0; iStep < nStep; ++iStep) { + lattice.apply(Operator(BgkCollideO(), bulk_mask, tau), + Operator(BounceBackO(), box_mask), + Operator(BounceBackMovingWallO(), lid_mask, u_lid)); + lattice.stream(); + } + + auto duration = std::chrono::duration_cast>( + std::chrono::steady_clock::now() - start); + + std::cout << cuboid[0] + << ", " << nStep + << ", " << nThread + << ", " << (nStep * lattice.volume()) / (1e6 * duration.count()) + << std::endl; + + lattice.write_momenta(bulk_mask, "result.vtk"); +} + +int main(int argc, char* argv[]) { + const std::size_t n = atoi(argv[1]); + const std::size_t steps = atoi(argv[2]); + simulate({ n, n, n}, steps); +} diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..d156272 --- /dev/null +++ b/shell.nix @@ -0,0 +1,15 @@ +{ pkgs ? import { }, ... }: + +pkgs.stdenvNoCC.mkDerivation rec { + name = "sweeplb-env"; + env = pkgs.buildEnv { name = name; paths = buildInputs; }; + + buildInputs = with pkgs; [ + gcc10 + cmake + ]; + + shellHook = '' + export NIX_SHELL_NAME="${name}" + ''; +} diff --git a/src/LLBM/bounce_back.h b/src/LLBM/bounce_back.h new file mode 100644 index 0000000..b710ee4 --- /dev/null +++ b/src/LLBM/bounce_back.h @@ -0,0 +1,30 @@ +#pragma once + +#include "concepts.h" + +struct BounceBackO { + +template +static void apply(V f_curr[19], V f_next[19]) { + f_next[0] = f_curr[18]; + f_next[1] = f_curr[17]; + f_next[2] = f_curr[16]; + f_next[3] = f_curr[15]; + f_next[4] = f_curr[14]; + f_next[5] = f_curr[13]; + f_next[6] = f_curr[12]; + f_next[7] = f_curr[11]; + f_next[8] = f_curr[10]; + f_next[9] = f_curr[9]; + f_next[10] = f_curr[8]; + f_next[11] = f_curr[7]; + f_next[12] = f_curr[6]; + f_next[13] = f_curr[5]; + f_next[14] = f_curr[4]; + f_next[15] = f_curr[3]; + f_next[16] = f_curr[2]; + f_next[17] = f_curr[1]; + f_next[18] = f_curr[0]; +} + +}; diff --git a/src/LLBM/bounce_back_moving_wall.h b/src/LLBM/bounce_back_moving_wall.h new file mode 100644 index 0000000..c00a17e --- /dev/null +++ b/src/LLBM/bounce_back_moving_wall.h @@ -0,0 +1,30 @@ +#pragma once + +#include "concepts.h" + +struct BounceBackMovingWallO { + +template +static void apply(V f_curr[19], V f_next[19], T u[3]) { + f_next[0] = f_curr[18] + T{0.166666666666667}*u[1] + T{0.166666666666667}*u[2]; + f_next[1] = f_curr[17] - T{0.166666666666667}*u[0] + T{0.166666666666667}*u[2]; + f_next[2] = f_curr[16] + T{0.333333333333333}*u[2]; + f_next[3] = f_curr[15] + T{0.166666666666667}*u[0] + T{0.166666666666667}*u[2]; + f_next[4] = f_curr[14] - T{0.166666666666667}*u[1] + T{0.166666666666667}*u[2]; + f_next[5] = f_curr[13] - T{0.166666666666667}*u[0] + T{0.166666666666667}*u[1]; + f_next[6] = f_curr[12] + T{0.333333333333333}*u[1]; + f_next[7] = f_curr[11] + T{0.166666666666667}*u[0] + T{0.166666666666667}*u[1]; + f_next[8] = f_curr[10] - T{0.333333333333333}*u[0]; + f_next[9] = f_curr[9]; + f_next[10] = f_curr[8] + T{0.333333333333333}*u[0]; + f_next[11] = f_curr[7] - T{0.166666666666667}*u[0] - T{0.166666666666667}*u[1]; + f_next[12] = f_curr[6] - T{0.333333333333333}*u[1]; + f_next[13] = f_curr[5] + T{0.166666666666667}*u[0] - T{0.166666666666667}*u[1]; + f_next[14] = f_curr[4] + T{0.166666666666667}*u[1] - T{0.166666666666667}*u[2]; + f_next[15] = f_curr[3] - T{0.166666666666667}*u[0] - T{0.166666666666667}*u[2]; + f_next[16] = f_curr[2] - T{0.333333333333333}*u[2]; + f_next[17] = f_curr[1] + T{0.166666666666667}*u[0] - T{0.166666666666667}*u[2]; + f_next[18] = f_curr[0] - T{0.166666666666667}*u[1] - T{0.166666666666667}*u[2]; +} + +}; diff --git a/src/LLBM/collect_moments.h b/src/LLBM/collect_moments.h new file mode 100644 index 0000000..afd5d59 --- /dev/null +++ b/src/LLBM/collect_moments.h @@ -0,0 +1,25 @@ +#pragma once + +#include "concepts.h" + +struct CollectMomentsF { + +static constexpr unsigned size = 4; + +template +static void observe(V f_curr[19], V* momenta) { + V m0 = f_curr[10] + f_curr[13] + f_curr[17]; + V m1 = f_curr[14] + f_curr[5] + f_curr[6]; + V m2 = f_curr[1] + f_curr[2] + f_curr[4]; + V m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9]; + V m4 = -f_curr[11] + f_curr[7]; + V m5 = -f_curr[15] + f_curr[3]; + V m6 = V{1}/m3; + V m7 = f_curr[0] - f_curr[18]; + momenta[0] = m3; + momenta[1] = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]); + momenta[2] = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]); + momenta[3] = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]); +} + +}; diff --git a/src/LLBM/collide.h b/src/LLBM/collide.h new file mode 100644 index 0000000..c08aade --- /dev/null +++ b/src/LLBM/collide.h @@ -0,0 +1,89 @@ +#pragma once + +#include "concepts.h" + +struct BgkCollideO { + +template +static void apply(V f_curr[19], V f_next[19], T tau) { + V rho; + V u[3]; + V m0 = f_curr[10] + f_curr[13] + f_curr[17]; + V m1 = f_curr[14] + f_curr[5] + f_curr[6]; + V m2 = f_curr[1] + f_curr[2] + f_curr[4]; + V m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9]; + V m4 = -f_curr[11] + f_curr[7]; + V m5 = -f_curr[15] + f_curr[3]; + V m6 = T{1}/m3; + V m7 = f_curr[0] - f_curr[18]; + rho = m3; + u[0] = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]); + u[1] = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]); + u[2] = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]); + V x0 = T{1}/tau; + V x1 = T{0.0138888888888889}*x0; + V x2 = u[1] + u[2]; + V x3 = T{9.00000000000000}*(x2*x2); + V x4 = T{6.00000000000000}*u[2]; + V x5 = u[1]*u[1]; + V x6 = T{3.00000000000000}*x5; + V x7 = -x6; + V x8 = u[0]*u[0]; + V x9 = T{3.00000000000000}*x8; + V x10 = T{2.00000000000000} - x9; + V x11 = x10 + x7; + V x12 = x11 + x4; + V x13 = u[2]*u[2]; + V x14 = T{3.00000000000000}*x13; + V x15 = -x14; + V x16 = T{6.00000000000000}*u[1]; + V x17 = x15 + x16; + V x18 = T{6.00000000000000}*u[0]; + V x19 = -u[0]; + V x20 = x19 + u[2]; + V x21 = x14 + x6 + T{-2.00000000000000}; + V x22 = x21 + x9; + V x23 = x22 - x4; + V x24 = T{0.0277777777777778}*x0; + V x25 = T{6.00000000000000}*x13; + V x26 = u[0] + u[2]; + V x27 = T{9.00000000000000}*(x26*x26); + V x28 = x15 + x18; + V x29 = -u[1]; + V x30 = x29 + u[2]; + V x31 = x19 + u[1]; + V x32 = -x16; + V x33 = x18 + x22; + V x34 = T{6.00000000000000}*x5; + V x35 = u[0] + u[1]; + V x36 = T{9.00000000000000}*(x35*x35); + V x37 = T{6.00000000000000}*x8; + V x38 = x9 + T{-2.00000000000000}; + V x39 = x29 + u[0]; + V x40 = -x18 + x22; + V x41 = -u[2]; + V x42 = x41 + u[1]; + V x43 = x22 + x4; + V x44 = x41 + u[0]; + f_next[0] = x1*(rho*(x12 + x17 + x3) - T{72.0000000000000}*f_curr[0]) + f_curr[0]; + f_next[1] = -x1*(rho*(x18 + x23 - T{9.00000000000000}*x20*x20) + T{72.0000000000000}*f_curr[1]) + f_curr[1]; + f_next[2] = x24*(rho*(x12 + x25) - T{36.0000000000000}*f_curr[2]) + f_curr[2]; + f_next[3] = x1*(rho*(x12 + x27 + x28) - T{72.0000000000000}*f_curr[3]) + f_curr[3]; + f_next[4] = -x1*(rho*(x16 + x23 - T{9.00000000000000}*x30*x30) + T{72.0000000000000}*f_curr[4]) + f_curr[4]; + f_next[5] = -x1*(rho*(x32 + x33 - T{9.00000000000000}*x31*x31) + T{72.0000000000000}*f_curr[5]) + f_curr[5]; + f_next[6] = x24*(rho*(x10 + x17 + x34) - T{36.0000000000000}*f_curr[6]) + f_curr[6]; + f_next[7] = x1*(rho*(x11 + x17 + x18 + x36) - T{72.0000000000000}*f_curr[7]) + f_curr[7]; + f_next[8] = -x24*(rho*(x18 + x21 - x37) + T{36.0000000000000}*f_curr[8]) + f_curr[8]; + f_next[9] = -T{0.166666666666667}*x0*(rho*x22 + T{6.00000000000000}*f_curr[9]) + f_curr[9]; + f_next[10] = x24*(rho*(x28 + x37 + x7 + T{2.00000000000000}) - T{36.0000000000000}*f_curr[10]) + f_curr[10]; + f_next[11] = -x1*(rho*(x16 + x33 - x36) + T{72.0000000000000}*f_curr[11]) + f_curr[11]; + f_next[12] = -x24*(rho*(x14 + x16 - x34 + x38) + T{36.0000000000000}*f_curr[12]) + f_curr[12]; + f_next[13] = -x1*(rho*(x16 + x40 - T{9.00000000000000}*x39*x39) + T{72.0000000000000}*f_curr[13]) + f_curr[13]; + f_next[14] = -x1*(rho*(x32 + x43 - T{9.00000000000000}*x42*x42) + T{72.0000000000000}*f_curr[14]) + f_curr[14]; + f_next[15] = -x1*(rho*(-x27 + x33 + x4) + T{72.0000000000000}*f_curr[15]) + f_curr[15]; + f_next[16] = -x24*(rho*(-x25 + x38 + x4 + x6) + T{36.0000000000000}*f_curr[16]) + f_curr[16]; + f_next[17] = -x1*(rho*(x4 + x40 - T{9.00000000000000}*x44*x44) + T{72.0000000000000}*f_curr[17]) + f_curr[17]; + f_next[18] = -x1*(rho*(x16 - x3 + x43) + T{72.0000000000000}*f_curr[18]) + f_curr[18]; +} + +}; diff --git a/src/LLBM/equilibrium_density_wall.h b/src/LLBM/equilibrium_density_wall.h new file mode 100644 index 0000000..598d59d --- /dev/null +++ b/src/LLBM/equilibrium_density_wall.h @@ -0,0 +1,144 @@ +#pragma once + +#include "wall.h" +#include "concepts.h" + +struct EquilibriumDensityWallO { + +template +static void apply(V f_curr[19], V f_next[19], T rho_w, WallNormal<1,0,0>) { + V u_w = (rho_w - f_curr[0] - T{2.00000000000000}*f_curr[11] - f_curr[12] - f_curr[14] - T{2.00000000000000}*f_curr[15] - f_curr[16] - f_curr[18] - T{2.00000000000000}*f_curr[1] - f_curr[2] - f_curr[4] - T{2.00000000000000}*f_curr[5] - f_curr[6] - T{2.00000000000000}*f_curr[8] - f_curr[9])/rho_w; + V rho = rho_w; + V u[3] { u_w, 0., 0. }; + V e0 = T{0.0277777777777778}*rho; + V e1 = u[1] + u[2]; + V e2 = T{4.50000000000000}*(e1*e1); + V e3 = T{3.00000000000000}*u[2]; + V e4 = u[1]*u[1]; + V e5 = T{1.50000000000000}*e4; + V e6 = -e5; + V e7 = u[0]*u[0]; + V e8 = T{1.50000000000000}*e7; + V e9 = T{1.00000000000000} - e8; + V e10 = e6 + e9; + V e11 = e10 + e3; + V e12 = T{3.00000000000000}*u[1]; + V e13 = u[2]*u[2]; + V e14 = T{1.50000000000000}*e13; + V e15 = -e14; + V e16 = e12 + e15; + V e17 = T{3.00000000000000}*u[0]; + V e18 = -u[2]; + V e19 = e18 + u[0]; + V e20 = -T{4.50000000000000}*e19*e19; + V e21 = e14 + e5 + T{-1.00000000000000}; + V e22 = e21 + e8; + V e23 = e22 - e3; + V e24 = T{0.0555555555555556}*rho; + V e25 = T{3.00000000000000}*e13; + V e26 = u[0] + u[2]; + V e27 = T{4.50000000000000}*(e26*e26); + V e28 = e15 + e17; + V e29 = e18 + u[1]; + V e30 = -T{4.50000000000000}*e29*e29; + V e31 = -e12; + V e32 = u[0] - u[1]; + V e33 = -T{4.50000000000000}*e32*e32; + V e34 = e17 + e22; + V e35 = T{3.00000000000000}*e4; + V e36 = u[0] + u[1]; + V e37 = T{4.50000000000000}*(e36*e36); + V e38 = T{3.00000000000000}*e7; + V e39 = e8 + T{-1.00000000000000}; + V e40 = -e17 + e22; + V e41 = e22 + e3; + f_next[0] = e0*(e11 + e16 + e2); + f_next[1] = -e0*(e17 + e20 + e23); + f_next[2] = e24*(e11 + e25); + f_next[3] = e0*(e11 + e27 + e28); + f_next[4] = -e0*(e12 + e23 + e30); + f_next[5] = -e0*(e31 + e33 + e34); + f_next[6] = e24*(e16 + e35 + e9); + f_next[7] = e0*(e10 + e16 + e17 + e37); + f_next[8] = -e24*(e17 + e21 - e38); + f_next[9] = -T{0.333333333333333}*e22*rho; + f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000}); + f_next[11] = -e0*(e12 + e34 - e37); + f_next[12] = -e24*(e12 + e14 - e35 + e39); + f_next[13] = -e0*(e12 + e33 + e40); + f_next[14] = -e0*(e30 + e31 + e41); + f_next[15] = -e0*(-e27 + e3 + e34); + f_next[16] = -e24*(-e25 + e3 + e39 + e5); + f_next[17] = -e0*(e20 + e3 + e40); + f_next[18] = -e0*(e12 - e2 + e41); +} + +template +static void apply(V f_curr[19], V f_next[19], T rho_w, WallNormal<-1,0,0>) { + V u_w = (-rho_w + f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/rho_w; + V rho = rho_w; + V u[3] { u_w, 0., 0. }; + V e0 = T{0.0277777777777778}*rho; + V e1 = u[1] + u[2]; + V e2 = T{4.50000000000000}*(e1*e1); + V e3 = T{3.00000000000000}*u[2]; + V e4 = u[1]*u[1]; + V e5 = T{1.50000000000000}*e4; + V e6 = -e5; + V e7 = u[0]*u[0]; + V e8 = T{1.50000000000000}*e7; + V e9 = T{1.00000000000000} - e8; + V e10 = e6 + e9; + V e11 = e10 + e3; + V e12 = T{3.00000000000000}*u[1]; + V e13 = u[2]*u[2]; + V e14 = T{1.50000000000000}*e13; + V e15 = -e14; + V e16 = e12 + e15; + V e17 = T{3.00000000000000}*u[0]; + V e18 = -u[2]; + V e19 = e18 + u[0]; + V e20 = -T{4.50000000000000}*e19*e19; + V e21 = e14 + e5 + T{-1.00000000000000}; + V e22 = e21 + e8; + V e23 = e22 - e3; + V e24 = T{0.0555555555555556}*rho; + V e25 = T{3.00000000000000}*e13; + V e26 = u[0] + u[2]; + V e27 = T{4.50000000000000}*(e26*e26); + V e28 = e15 + e17; + V e29 = e18 + u[1]; + V e30 = -T{4.50000000000000}*e29*e29; + V e31 = -e12; + V e32 = u[0] - u[1]; + V e33 = -T{4.50000000000000}*e32*e32; + V e34 = e17 + e22; + V e35 = T{3.00000000000000}*e4; + V e36 = u[0] + u[1]; + V e37 = T{4.50000000000000}*(e36*e36); + V e38 = T{3.00000000000000}*e7; + V e39 = e8 + T{-1.00000000000000}; + V e40 = -e17 + e22; + V e41 = e22 + e3; + f_next[0] = e0*(e11 + e16 + e2); + f_next[1] = -e0*(e17 + e20 + e23); + f_next[2] = e24*(e11 + e25); + f_next[3] = e0*(e11 + e27 + e28); + f_next[4] = -e0*(e12 + e23 + e30); + f_next[5] = -e0*(e31 + e33 + e34); + f_next[6] = e24*(e16 + e35 + e9); + f_next[7] = e0*(e10 + e16 + e17 + e37); + f_next[8] = -e24*(e17 + e21 - e38); + f_next[9] = -T{0.333333333333333}*e22*rho; + f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000}); + f_next[11] = -e0*(e12 + e34 - e37); + f_next[12] = -e24*(e12 + e14 - e35 + e39); + f_next[13] = -e0*(e12 + e33 + e40); + f_next[14] = -e0*(e30 + e31 + e41); + f_next[15] = -e0*(-e27 + e3 + e34); + f_next[16] = -e24*(-e25 + e3 + e39 + e5); + f_next[17] = -e0*(e20 + e3 + e40); + f_next[18] = -e0*(e12 - e2 + e41); +} + +}; diff --git a/src/LLBM/equilibrium_velocity_wall.h b/src/LLBM/equilibrium_velocity_wall.h new file mode 100644 index 0000000..2d68e34 --- /dev/null +++ b/src/LLBM/equilibrium_velocity_wall.h @@ -0,0 +1,144 @@ +#pragma once + +#include "wall.h" +#include "concepts.h" + +struct EquilibriumVelocityWallO { + +template +static void apply(V f_curr[19], V f_next[19], T u_w, WallNormal<1,0,0>) { + V rho; + rho = -(f_curr[0] + T{2.00000000000000}*f_curr[11] + f_curr[12] + f_curr[14] + T{2.00000000000000}*f_curr[15] + f_curr[16] + f_curr[18] + T{2.00000000000000}*f_curr[1] + f_curr[2] + f_curr[4] + T{2.00000000000000}*f_curr[5] + f_curr[6] + T{2.00000000000000}*f_curr[8] + f_curr[9])/(u_w + T{-1.00000000000000}); + V u[3] { u_w, 0., 0. }; + V e0 = T{0.0277777777777778}*rho; + V e1 = u[1] + u[2]; + V e2 = T{4.50000000000000}*(e1*e1); + V e3 = T{3.00000000000000}*u[2]; + V e4 = u[1]*u[1]; + V e5 = T{1.50000000000000}*e4; + V e6 = -e5; + V e7 = u[0]*u[0]; + V e8 = T{1.50000000000000}*e7; + V e9 = T{1.00000000000000} - e8; + V e10 = e6 + e9; + V e11 = e10 + e3; + V e12 = T{3.00000000000000}*u[1]; + V e13 = u[2]*u[2]; + V e14 = T{1.50000000000000}*e13; + V e15 = -e14; + V e16 = e12 + e15; + V e17 = T{3.00000000000000}*u[0]; + V e18 = -u[2]; + V e19 = e18 + u[0]; + V e20 = -T{4.50000000000000}*e19*e19; + V e21 = e14 + e5 + T{-1.00000000000000}; + V e22 = e21 + e8; + V e23 = e22 - e3; + V e24 = T{0.0555555555555556}*rho; + V e25 = T{3.00000000000000}*e13; + V e26 = u[0] + u[2]; + V e27 = T{4.50000000000000}*(e26*e26); + V e28 = e15 + e17; + V e29 = e18 + u[1]; + V e30 = -T{4.50000000000000}*e29*e29; + V e31 = -e12; + V e32 = u[0] - u[1]; + V e33 = -T{4.50000000000000}*e32*e32; + V e34 = e17 + e22; + V e35 = T{3.00000000000000}*e4; + V e36 = u[0] + u[1]; + V e37 = T{4.50000000000000}*(e36*e36); + V e38 = T{3.00000000000000}*e7; + V e39 = e8 + T{-1.00000000000000}; + V e40 = -e17 + e22; + V e41 = e22 + e3; + f_next[0] = e0*(e11 + e16 + e2); + f_next[1] = -e0*(e17 + e20 + e23); + f_next[2] = e24*(e11 + e25); + f_next[3] = e0*(e11 + e27 + e28); + f_next[4] = -e0*(e12 + e23 + e30); + f_next[5] = -e0*(e31 + e33 + e34); + f_next[6] = e24*(e16 + e35 + e9); + f_next[7] = e0*(e10 + e16 + e17 + e37); + f_next[8] = -e24*(e17 + e21 - e38); + f_next[9] = -T{0.333333333333333}*e22*rho; + f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000}); + f_next[11] = -e0*(e12 + e34 - e37); + f_next[12] = -e24*(e12 + e14 - e35 + e39); + f_next[13] = -e0*(e12 + e33 + e40); + f_next[14] = -e0*(e30 + e31 + e41); + f_next[15] = -e0*(-e27 + e3 + e34); + f_next[16] = -e24*(-e25 + e3 + e39 + e5); + f_next[17] = -e0*(e20 + e3 + e40); + f_next[18] = -e0*(e12 - e2 + e41); +} + +template +static void apply(V f_curr[19], V f_next[19], T u_w, WallNormal<-1,0,0>) { + V rho; + rho = (f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/(u_w + T{1.00000000000000}); + V u[3] { u_w, 0., 0. }; + V e0 = T{0.0277777777777778}*rho; + V e1 = u[1] + u[2]; + V e2 = T{4.50000000000000}*(e1*e1); + V e3 = T{3.00000000000000}*u[2]; + V e4 = u[1]*u[1]; + V e5 = T{1.50000000000000}*e4; + V e6 = -e5; + V e7 = u[0]*u[0]; + V e8 = T{1.50000000000000}*e7; + V e9 = T{1.00000000000000} - e8; + V e10 = e6 + e9; + V e11 = e10 + e3; + V e12 = T{3.00000000000000}*u[1]; + V e13 = u[2]*u[2]; + V e14 = T{1.50000000000000}*e13; + V e15 = -e14; + V e16 = e12 + e15; + V e17 = T{3.00000000000000}*u[0]; + V e18 = -u[2]; + V e19 = e18 + u[0]; + V e20 = -T{4.50000000000000}*e19*e19; + V e21 = e14 + e5 + T{-1.00000000000000}; + V e22 = e21 + e8; + V e23 = e22 - e3; + V e24 = T{0.0555555555555556}*rho; + V e25 = T{3.00000000000000}*e13; + V e26 = u[0] + u[2]; + V e27 = T{4.50000000000000}*(e26*e26); + V e28 = e15 + e17; + V e29 = e18 + u[1]; + V e30 = -T{4.50000000000000}*e29*e29; + V e31 = -e12; + V e32 = u[0] - u[1]; + V e33 = -T{4.50000000000000}*e32*e32; + V e34 = e17 + e22; + V e35 = T{3.00000000000000}*e4; + V e36 = u[0] + u[1]; + V e37 = T{4.50000000000000}*(e36*e36); + V e38 = T{3.00000000000000}*e7; + V e39 = e8 + T{-1.00000000000000}; + V e40 = -e17 + e22; + V e41 = e22 + e3; + f_next[0] = e0*(e11 + e16 + e2); + f_next[1] = -e0*(e17 + e20 + e23); + f_next[2] = e24*(e11 + e25); + f_next[3] = e0*(e11 + e27 + e28); + f_next[4] = -e0*(e12 + e23 + e30); + f_next[5] = -e0*(e31 + e33 + e34); + f_next[6] = e24*(e16 + e35 + e9); + f_next[7] = e0*(e10 + e16 + e17 + e37); + f_next[8] = -e24*(e17 + e21 - e38); + f_next[9] = -T{0.333333333333333}*e22*rho; + f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000}); + f_next[11] = -e0*(e12 + e34 - e37); + f_next[12] = -e24*(e12 + e14 - e35 + e39); + f_next[13] = -e0*(e12 + e33 + e40); + f_next[14] = -e0*(e30 + e31 + e41); + f_next[15] = -e0*(-e27 + e3 + e34); + f_next[16] = -e24*(-e25 + e3 + e39 + e5); + f_next[17] = -e0*(e20 + e3 + e40); + f_next[18] = -e0*(e12 - e2 + e41); +} + +}; diff --git a/src/LLBM/initialize.h b/src/LLBM/initialize.h new file mode 100644 index 0000000..b361217 --- /dev/null +++ b/src/LLBM/initialize.h @@ -0,0 +1,53 @@ +#pragma once + +#include "concepts.h" + +struct InitializeO { + +template +static void apply(V f_curr[19], V f_next[19]) { + f_next[0] = T{0.0277777777777778}; + f_next[1] = T{0.0277777777777778}; + f_next[2] = T{0.0555555555555556}; + f_next[3] = T{0.0277777777777778}; + f_next[4] = T{0.0277777777777778}; + f_next[5] = T{0.0277777777777778}; + f_next[6] = T{0.0555555555555556}; + f_next[7] = T{0.0277777777777778}; + f_next[8] = T{0.0555555555555556}; + f_next[9] = T{0.333333333333333}; + f_next[10] = T{0.0555555555555556}; + f_next[11] = T{0.0277777777777778}; + f_next[12] = T{0.0555555555555556}; + f_next[13] = T{0.0277777777777778}; + f_next[14] = T{0.0277777777777778}; + f_next[15] = T{0.0277777777777778}; + f_next[16] = T{0.0555555555555556}; + f_next[17] = T{0.0277777777777778}; + f_next[18] = T{0.0277777777777778}; +} + +template +static void apply(V f_curr[19], V f_next[19], T scale) { + f_next[0] = T{scale * 0.0277777777777778}; + f_next[1] = T{scale * 0.0277777777777778}; + f_next[2] = T{scale * 0.0555555555555556}; + f_next[3] = T{scale * 0.0277777777777778}; + f_next[4] = T{scale * 0.0277777777777778}; + f_next[5] = T{scale * 0.0277777777777778}; + f_next[6] = T{scale * 0.0555555555555556}; + f_next[7] = T{scale * 0.0277777777777778}; + f_next[8] = T{scale * 0.0555555555555556}; + f_next[9] = T{scale * 0.333333333333333}; + f_next[10] = T{scale * 0.0555555555555556}; + f_next[11] = T{scale * 0.0277777777777778}; + f_next[12] = T{scale * 0.0555555555555556}; + f_next[13] = T{scale * 0.0277777777777778}; + f_next[14] = T{scale * 0.0277777777777778}; + f_next[15] = T{scale * 0.0277777777777778}; + f_next[16] = T{scale * 0.0555555555555556}; + f_next[17] = T{scale * 0.0277777777777778}; + f_next[18] = T{scale * 0.0277777777777778}; +} + +}; diff --git a/src/LLBM/smagorinsky_collide.h b/src/LLBM/smagorinsky_collide.h new file mode 100644 index 0000000..1a90598 --- /dev/null +++ b/src/LLBM/smagorinsky_collide.h @@ -0,0 +1,129 @@ +#pragma once + +#include "concepts.h" + +struct SmagorinskyBgkCollideO { + +template +static void apply(V f_curr[19], V f_next[19], T tau, T smagorinsky) { + V rho; + V u[3]; + V x0 = f_curr[10] + f_curr[13] + f_curr[17]; + V x1 = f_curr[14] + f_curr[5] + f_curr[6]; + V x2 = f_curr[1] + f_curr[2] + f_curr[4]; + V x3 = x0 + x1 + x2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9]; + V x4 = -f_curr[11] + f_curr[7]; + V x5 = -f_curr[15] + f_curr[3]; + V x6 = T{1}/x3; + V x7 = f_curr[0] - f_curr[18]; + V x8 = T{2.00000000000000}*tau + T{-1.00000000000000}; + V x9 = T{72.0000000000000}*f_curr[5]; + V x26 = T{72.0000000000000}*f_curr[13]; + V x33 = T{72.0000000000000}*f_curr[11]; + V x44 = T{72.0000000000000}*f_curr[1]; + V x50 = T{72.0000000000000}*f_curr[17]; + V x56 = T{72.0000000000000}*f_curr[15]; + V x64 = T{72.0000000000000}*f_curr[4]; + V x68 = T{72.0000000000000}*f_curr[14]; + V x72 = T{72.0000000000000}*f_curr[18]; + rho = x3; + V x78 = T{2.00000000000000}*rho; + u[0] = x6*(x0 + x4 + x5 - f_curr[1] - f_curr[5] - f_curr[8]); + V x12 = -u[0]; + V x14 = T{6.00000000000000}*u[0]; + V x17 = u[0]*u[0]; + V x18 = T{3.00000000000000}*x17; + V x27 = -x14; + V x38 = T{2.00000000000000} - x18; + V x79 = T{6.00000000000000}*x17; + u[1] = x6*(x1 + x4 + x7 - f_curr[12] - f_curr[13] - f_curr[4]); + V x10 = T{6.00000000000000}*u[1]; + V x11 = -x10; + V x13 = x12 + u[1]; + V x19 = u[1]*u[1]; + V x20 = T{3.00000000000000}*x19; + V x21 = x18 + x20 + T{-2.00000000000000}; + V x28 = -u[1]; + V x29 = x28 + u[0]; + V x34 = u[0] + u[1]; + V x35 = T{9.00000000000000}*(x34*x34); + V x37 = -x20; + V x39 = x37 + x38; + V x85 = T{6.00000000000000}*x19; + u[2] = x6*(x2 + x5 + x7 - f_curr[14] - f_curr[16] - f_curr[17]); + V x15 = u[2]*u[2]; + V x16 = T{3.00000000000000}*x15; + V x22 = x16 + x21; + V x23 = x14 + x22; + V x24 = rho*(x11 + x23 - T{9.00000000000000}*x13*x13); + V x25 = x24 + x9; + V x30 = x10 + x22; + V x31 = rho*(x27 + x30 - T{9.00000000000000}*x29*x29); + V x32 = x26 + x31; + V x36 = rho*(x14 + x30 - x35); + V x40 = -x16; + V x41 = x10 + x40; + V x42 = rho*(x14 + x35 + x39 + x41) - T{72.0000000000000}*f_curr[7]; + V x43 = -x33 - x36 + x42; + V x45 = T{6.00000000000000}*u[2]; + V x46 = -x45; + V x47 = x12 + u[2]; + V x48 = rho*(x23 + x46 - T{9.00000000000000}*x47*x47); + V x49 = x44 + x48; + V x51 = -u[2]; + V x52 = x51 + u[0]; + V x53 = x22 + x45; + V x54 = rho*(x27 + x53 - T{9.00000000000000}*x52*x52); + V x55 = x50 + x54; + V x57 = u[0] + u[2]; + V x58 = T{9.00000000000000}*(x57*x57); + V x59 = rho*(x14 + x53 - x58); + V x60 = x39 + x45; + V x61 = x14 + x40; + V x62 = rho*(x58 + x60 + x61) - T{72.0000000000000}*f_curr[3]; + V x63 = -x56 - x59 + x62; + V x65 = x28 + u[2]; + V x66 = rho*(x30 + x46 - T{9.00000000000000}*x65*x65); + V x67 = x64 + x66; + V x69 = x51 + u[1]; + V x70 = rho*(x11 + x53 - T{9.00000000000000}*x69*x69); + V x71 = x68 + x70; + V x73 = u[1] + u[2]; + V x74 = T{9.00000000000000}*(x73*x73); + V x75 = rho*(x30 + x45 - x74); + V x76 = rho*(x41 + x60 + x74) - T{72.0000000000000}*f_curr[0]; + V x77 = -x72 - x75 + x76; + V x80 = x16 + T{-2.00000000000000}; + V x81 = x14 + x20 - x79 + x80; + V x82 = x37 + x61 + x79 + T{2.00000000000000}; + V x83 = -x44 - x48 - x50 - x54 + x63; + V x84 = -x24 - x26 - x31 + x43 - x9; + V x86 = x10 + x18 + x80 - x85; + V x87 = x38 + x41 + x85; + V x88 = -x64 - x66 - x68 - x70 + x77; + V x89 = T{6.00000000000000}*x15; + V x90 = x21 + x45 - x89; + V x91 = x60 + x89; + V x92 = T{1}/(T{20.0000000000000}*tau + T{2.00000000000000} + T{5.04537849152229}*simd::sqrt((smagorinsky*smagorinsky)*simd::sqrt((x25 + x32 + x43)*(x25 + x32 + x43) + (x49 + x55 + x63)*(x49 + x55 + x63) + (x67 + x71 + x77)*(x67 + x71 + x77) + T{0.500000000000000}*((-x78*x81 + x78*x82 + x83 + x84 - T{72}*f_curr[10] - T{72}*f_curr[8])*(-x78*x81 + x78*x82 + x83 + x84 - T{72}*f_curr[10] - T{72}*f_curr[8])) + T{0.500000000000000}*((-x78*x86 + x78*x87 + x84 + x88 - T{72}*f_curr[12] - T{72}*f_curr[6])*(-x78*x86 + x78*x87 + x84 + x88 - T{72}*f_curr[12] - T{72}*f_curr[6])) + T{0.500000000000000}*((-x78*x90 + x78*x91 + x83 + x88 - T{72}*f_curr[16] - T{72}*f_curr[2])*(-x78*x90 + x78*x91 + x83 + x88 - T{72}*f_curr[16] - T{72}*f_curr[2]))) + T{0.157134840263677}*(x8*x8))); + f_next[0] = T{0.333333333333333}*x76*x92 + f_curr[0]; + f_next[1] = -T{0.333333333333333}*x49*x92 + f_curr[1]; + f_next[2] = T{0.666666666666667}*x92*(rho*x91 - T{36.0000000000000}*f_curr[2]) + f_curr[2]; + f_next[3] = T{0.333333333333333}*x62*x92 + f_curr[3]; + f_next[4] = -T{0.333333333333333}*x67*x92 + f_curr[4]; + f_next[5] = -T{0.333333333333333}*x25*x92 + f_curr[5]; + f_next[6] = T{0.666666666666667}*x92*(rho*x87 - T{36.0000000000000}*f_curr[6]) + f_curr[6]; + f_next[7] = T{0.333333333333333}*x42*x92 + f_curr[7]; + f_next[8] = -T{0.666666666666667}*x92*(rho*x81 + T{36.0000000000000}*f_curr[8]) + f_curr[8]; + f_next[9] = -T{4.00000000000000}*x92*(rho*x22 + T{6.00000000000000}*f_curr[9]) + f_curr[9]; + f_next[10] = T{0.666666666666667}*x92*(rho*x82 - T{36.0000000000000}*f_curr[10]) + f_curr[10]; + f_next[11] = -T{0.333333333333333}*x92*(x33 + x36) + f_curr[11]; + f_next[12] = -T{0.666666666666667}*x92*(rho*x86 + T{36.0000000000000}*f_curr[12]) + f_curr[12]; + f_next[13] = -T{0.333333333333333}*x32*x92 + f_curr[13]; + f_next[14] = -T{0.333333333333333}*x71*x92 + f_curr[14]; + f_next[15] = -T{0.333333333333333}*x92*(x56 + x59) + f_curr[15]; + f_next[16] = -T{0.666666666666667}*x92*(rho*x90 + T{36.0000000000000}*f_curr[16]) + f_curr[16]; + f_next[17] = -T{0.333333333333333}*x55*x92 + f_curr[17]; + f_next[18] = -T{0.333333333333333}*x92*(x72 + x75) + f_curr[18]; +} + +}; diff --git a/src/LLBM/wall.h b/src/LLBM/wall.h new file mode 100644 index 0000000..26534e4 --- /dev/null +++ b/src/LLBM/wall.h @@ -0,0 +1,4 @@ +#pragma once + +template +struct WallNormal { }; diff --git a/src/concepts.h b/src/concepts.h new file mode 100644 index 0000000..61f7cbf --- /dev/null +++ b/src/concepts.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +namespace concepts { + +template +concept Arithmetic = requires(V a, V b) { + a + b; + a += b; + + a - b; + a -= b; + + a * b; + a *= b; + + a / b; + a /= b; +}; + +} diff --git a/src/cuboid.h b/src/cuboid.h new file mode 100644 index 0000000..c4ff803 --- /dev/null +++ b/src/cuboid.h @@ -0,0 +1,37 @@ +#pragma once + +#include + +class Cuboid { +private: + const std::array _size; + const std::size_t _volume; + +public: + Cuboid(std::size_t nX, std::size_t nY, std::size_t nZ): + _size{nX, nY, nZ}, + _volume(nX*nY*nZ) { } + + std::size_t volume() const { + return _volume; + } + + std::size_t index(unsigned iX, unsigned iY, unsigned iZ) const { + return iX*_size[1]*_size[2] + iY*_size[2] + iZ; + } + + int operator[](unsigned i) const { + return _size[i]; + } + + template + void traverse(F f) { + for (int iX = 0; iX < _size[0]; ++iX) { + for (int iY = 0; iY < _size[1]; ++iY) { + for (int iZ = 0; iZ < _size[2]; ++iZ) { + f(iX, iY, iZ, index(iX,iY,iZ)); + } + } + } + } +}; diff --git a/src/export.h b/src/export.h new file mode 100644 index 0000000..a3d38ec --- /dev/null +++ b/src/export.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include + +namespace export_format { + +template +void vtk(Cuboid c, std::span rho, std::span u, std::string path) { + std::ofstream fout; + fout.open(path.c_str()); + + fout << "# vtk DataFile Version 3.0\n"; + fout << "lbm_output\n"; + fout << "ASCII\n"; + fout << "DATASET RECTILINEAR_GRID\n"; + fout << "DIMENSIONS " << c[0] << " " << c[1] << " " << c[2] << "\n"; + + fout << "X_COORDINATES " << c[0] << " float\n"; + for (std::size_t iX=0; iX < c[0]; ++iX) { + fout << iX << " "; + } + + fout << "\nY_COORDINATES " << c[1] << " float\n"; + for (std::size_t iY=0; iY < c[1]; ++iY) { + fout << iY << " "; + } + + fout << "\nZ_COORDINATES " << c[2] << " float\n"; + for (std::size_t iZ=0; iZ < c[2]; ++iZ) { + fout << iZ << " "; + } + fout << "\nPOINT_DATA " << c.volume() << "\n"; + + fout << "VECTORS velocity float\n"; + for (std::size_t iZ=0; iZ < c[2]; ++iZ) { + for (std::size_t iY=0; iY < c[1]; ++iY) { + for (std::size_t iX=0; iX < c[0]; ++iX) { + const std::size_t iCell = c.index(iX,iY,iZ); + fout << u[iCell] << " " << u[c.volume() + iCell] << " " << u[2*c.volume() + iCell] << "\n"; + } + } + } + + fout << "SCALARS density float 1\n"; + fout << "LOOKUP_TABLE default\n"; + for (std::size_t iZ=0; iZ < c[2]; ++iZ) { + for (std::size_t iY=0; iY < c[1]; ++iY) { + for (std::size_t iX=0; iX < c[0]; ++iX) { + const std::size_t iCell = c.index(iX,iY,iZ); + fout << rho[iCell] << "\n"; + } + } + } + + fout.close(); +} + +} diff --git a/src/lattice.h b/src/lattice.h new file mode 100644 index 0000000..bbebab4 --- /dev/null +++ b/src/lattice.h @@ -0,0 +1,267 @@ +#pragma once + +#include +#include + +#ifdef ENABLE_AVX512 +#include "simd/512.h" +#else +#include "simd/256.h" +#endif + +#include + +#include "cuboid.h" +#include "population.h" +#include "propagation.h" +#include "operator.h" +#include "export.h" + +#include "LLBM/initialize.h" +#include "LLBM/collect_moments.h" + +template +class LatticeMask { +private: + const std::size_t _size; + + std::unique_ptr::storage_t[]> _simd_mask; + std::unique_ptr _base_mask; + +public: + LatticeMask(std::size_t nCells): + _size(((nCells + simd::Pack::size - 1) / simd::Mask::storage_size + 1) * simd::Mask::storage_size), + _simd_mask(new typename simd::Mask::storage_t[_size / simd::Mask::storage_size] { }), + _base_mask(new bool[_size] { }) + { } + + void set(std::size_t iCell, bool state) { + _base_mask[iCell] = state; + } + + bool get(std::size_t iCell) const { + return _base_mask[iCell]; + } + + void serialize() { + for (std::size_t i=0; i < _size / simd::Mask::storage_size; ++i) { + _simd_mask[i] = simd::Mask::encode(_base_mask.get() + i*simd::Mask::storage_size); + } + } + + typename simd::Mask::storage_t* data() { + return _simd_mask.get(); + } + +}; + +template +class Lattice { +private: + using value_t = typename Buffer::value_t; + using pack_t = simd::Pack; + using mask_t = simd::Mask; + + Cuboid _cuboid; + Buffer _population; + + template + void apply(COP cop, + std::size_t iCell, + pack_t f_curr[population::q], + pack_t f_next[population::q]) { + if (mask_t m = {cop.mask.data(), iCell}) { + cop(f_curr, f_next); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + simd::maskstore(get(iPop, stage::post_collision()) + iCell, m, f_next[iPop]); + } + } + } + +public: + Lattice(Cuboid cuboid): + _cuboid(cuboid), + _population(cuboid) + { + apply(); + } + + template + value_t* get(unsigned iPop, STAGE stage) { + return _population.get(iPop, stage); + } + + std::size_t volume() const { + return _cuboid.volume(); + } + + void stream() { + _population.stream(); + } + + template + requires concepts::Operator + void apply(std::size_t iCell, ARGS&&... args) { + value_t f_curr[population::q]; + value_t f_next[population::q]; + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = get(iPop, stage::pre_collision())[iCell]; + } + + F::apply(f_curr, f_next, std::forward(args)...); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + get(iPop, stage::post_collision())[iCell] = f_next[iPop]; + } + } + + template + requires concepts::Operator + void apply() { + pack_t f_curr[population::q]; + pack_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t i=0; i < volume() / pack_t::size; ++i) { + const std::size_t iCell = pack_t::size * i; + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell); + } + + F::apply(f_curr, f_next); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + simd::store(get(iPop, stage::post_collision()) + iCell, f_next[iPop]); + } + } + + for (std::size_t iCell = pack_t::size * (volume() / pack_t::size); + iCell < volume(); + ++iCell) { + apply(iCell); + } + } + + template + requires concepts::Operator + void apply(LatticeMask& mask, ARGS&&... args) { + pack_t f_curr[population::q]; + pack_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t iCell=0; iCell < volume(); iCell += pack_t::size) { + mask_t m(mask.data(), iCell); + if (m) { + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell); + } + + F::apply(f_curr, f_next, std::forward(args)...); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + simd::maskstore(get(iPop, stage::post_collision()) + iCell, m, f_next[iPop]); + } + } + } + } + + template + void apply(COPs&&... cops) { + pack_t f_curr[population::q]; + pack_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t iCell=0; iCell < volume(); iCell += pack_t::size) { + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell); + } + + (apply(cops, iCell, f_curr, f_next), ...); + } + } + + template + void apply(std::vector& cells, ARGS&&... args) { + #ifdef SIMD_CELL_LIST + pack_t f_curr[population::q]; + pack_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t i=0; i < cells.size() / pack_t::size; ++i) { + const std::size_t iIdx = pack_t::size * i; + pack_t::index_t* locs = cells.data() + iIdx; + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()), locs); + } + + F::apply(f_curr, f_next, std::forward(args)...); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + simd::store(get(iPop, stage::post_collision()), f_next[iPop], locs); + } + } + + for (std::size_t i = pack_t::size * (cells.size() / pack_t::size); + i < cells.size(); + ++i) { + apply(cells[i], std::forward(args)...); + } + #else + value_t f_curr[population::q]; + value_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t i=0; i < cells.size(); ++i) { + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = get(iPop, stage::pre_collision())[cells[i]]; + } + + F::apply(f_curr, f_next, std::forward(args)...); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + get(iPop, stage::post_collision())[cells[i]] = f_next[iPop]; + } + } + #endif + } + + template + requires concepts::Functor + void observe(LatticeMask& mask, std::span buffer) { + pack_t f[population::q]; + pack_t output[F::size]; + #pragma omp parallel for private(f,output) schedule(static) + for (std::size_t iCell=0; iCell < volume(); iCell += pack_t::size) { + mask_t m(mask.data(), iCell); + if (m) { + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell); + } + + CollectMomentsF::observe(f, output); + + for (unsigned i=0; i < F::size; ++i) { + simd::maskstore(buffer.data() + i*volume() + iCell, m, output[i]); + } + } + } + } + + void write_momenta(LatticeMask& mask, std::string path) { + std::vector buffer(CollectMomentsF::size*volume()); + observe(mask, buffer); + export_format::vtk(_cuboid, + { buffer.data(), 1*volume() }, + { buffer.data() + volume(), 3*volume() }, + path); + } +}; diff --git a/src/operator.h b/src/operator.h new file mode 100644 index 0000000..b4830eb --- /dev/null +++ b/src/operator.h @@ -0,0 +1,41 @@ +#pragma once + +#include + +#include "concepts.h" + +namespace concepts { + +template +concept Operator = Arithmetic + && requires(F op, T* f_curr, T* f_next, ARGS... args) { + F::apply(f_curr, f_next, args...); +}; + +template +concept Functor = Arithmetic + && requires(F f, T* f_curr, T output[F::size]) { + F::observe(f_curr, output); +}; + +} + +template +struct Operator { + M& mask; + std::tuple args; + + Operator(F, M& m, ARGS&&... args): + mask(m), + args(std::forward(args)...) { } + + template + requires concepts::Operator + void operator()(U f_curr[], U f_next[]) { + std::apply([](auto... args) { F::apply(args...); }, + std::tuple_cat(std::make_tuple(f_curr, f_next), args)); + } +}; + +template +Operator(F, M&, ARGS&&... args) -> Operator; diff --git a/src/pattern/all.h b/src/pattern/all.h new file mode 100644 index 0000000..6496e02 --- /dev/null +++ b/src/pattern/all.h @@ -0,0 +1,5 @@ +#pragma once + +#include "pattern/ps.h" +#include "pattern/sss.h" +#include "pattern/none.h" diff --git a/src/pattern/none.h b/src/pattern/none.h new file mode 100644 index 0000000..8abc419 --- /dev/null +++ b/src/pattern/none.h @@ -0,0 +1,48 @@ +#pragma once + +#include "population.h" +#include "propagation.h" + +#include + +namespace pattern { + +template +class NONE { +private: + Cuboid _cuboid; + + T* _base; + T* _f[population::q]; + +public: + using value_t = T; + + NONE(Cuboid cuboid): + _cuboid(cuboid) + { + const std::size_t pagesize = sysconf(_SC_PAGESIZE); + const std::size_t padding = _cuboid.index(1,1,1); + _base = static_cast( + std::aligned_alloc(pagesize, population::q * (_cuboid.volume() + 2*padding) * sizeof(T))); + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _f[iPop] = _base + iPop * (_cuboid.volume() + 2*padding) + padding; + } + } + + ~NONE() { + delete [] _base; + } + + T* get(unsigned iPop, stage::pre_collision) { + return _f[iPop]; + } + + T* get(unsigned iPop, stage::post_collision) { + return _f[iPop]; + } + + void stream() { } +}; + +} diff --git a/src/pattern/ps.h b/src/pattern/ps.h new file mode 100644 index 0000000..667b09f --- /dev/null +++ b/src/pattern/ps.h @@ -0,0 +1,123 @@ +#pragma once + +#include "population.h" +#include "propagation.h" + +#include +#include +#include + +const int PROT_RW = PROT_READ | PROT_WRITE; + +constexpr std::uint32_t log2(std::uint32_t x) { + return 31 - __builtin_clz(x); +} + +namespace pattern { + +template +class PS { +private: + Cuboid _cuboid; + std::ptrdiff_t _volume; + + std::string _shm_path; + int _shm_name; + int _shm_file; + + std::uint8_t* _base_buffer; + std::uint8_t* _buffer[population::q]; + + T* _base[population::q]; + T* _f[population::q]; + +public: + using value_t = T; + + PS(Cuboid cuboid): + _cuboid(cuboid) + { + const std::size_t page_size = sysconf(_SC_PAGESIZE); + const std::size_t line_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + const std::size_t size = ((_cuboid.volume() * sizeof(T) - 1) / page_size + 1) * page_size; + _volume = size / sizeof(T); + + if (size % page_size != 0) { + throw std::invalid_argument("Array size must be multiple of PAGE_SIZE"); + } + + _shm_path = "/lbm_XXXXXX"; + _shm_name = mkstemp(const_cast(_shm_path.data())); + if (_shm_name != -1) { + throw std::runtime_error("Could not generate unique shared memory object name"); + } + // Open shared memory object as physical lattice memory + _shm_file = shm_open(_shm_path.c_str(), O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (_shm_file == -1) { + throw std::runtime_error("Failed to create shared memory object"); + } + // Resize to fit lattice populations + if (ftruncate(_shm_file, population::q * size) == -1) { + throw std::runtime_error("Failed to resize shared memory object"); + } + + // Allocate virtual address space for q times two consecutive lattices + _base_buffer = static_cast( + mmap(NULL, population::q * 2 * size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _buffer[iPop] = _base_buffer + iPop * 2 * size; + // Map single physical lattice into virtual address space + mmap(_buffer[iPop] , size, PROT_RW, MAP_SHARED | MAP_FIXED, _shm_file, iPop * size); + mmap(_buffer[iPop] + size, size, PROT_RW, MAP_SHARED | MAP_FIXED, _shm_file, iPop * size); + + // Store base pointer for reference + _base[iPop] = reinterpret_cast(_buffer[iPop]); + // Initialize shiftable f pointer to be used for lattice access + _f[iPop] = _base[iPop]; + } + + // Pre-shift to reduce cache and TLB conflict misses + // due to alignment of start address and page-boundaries every `page_size/sizeof(T)`-steps + for (unsigned iPop=0; iPop < population::q; ++iPop) { + std::ptrdiff_t shift = iPop * (1 << log2(line_size)) + + iPop * (1 << log2(page_size)); + shift /= sizeof(T); + if (shift < _volume-1) { + _f[iPop] += shift; + } + } + } + + ~PS() { + munmap(_base_buffer, population::q * 2 * _volume * sizeof(T)); + shm_unlink(_shm_path.c_str()); + close(_shm_name); + unlink(_shm_path.c_str()); + } + + T* get(unsigned iPop, stage::pre_collision) { + return _f[iPop]; + } + + T* get(unsigned iPop, stage::post_collision) { + return _f[iPop]; + } + + void stream() { + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _f[iPop] -= population::offset(_cuboid, iPop); + } + + for (unsigned iPop=0; iPop < population::q; ++iPop) { + if (_f[iPop] - _base[iPop] >= _volume) { + _f[iPop] -= _volume; + } else if (_f[iPop] < _base[iPop]) { + _f[iPop] += _volume; + } + } + } + +}; + +} diff --git a/src/pattern/sss.h b/src/pattern/sss.h new file mode 100644 index 0000000..1f1985d --- /dev/null +++ b/src/pattern/sss.h @@ -0,0 +1,63 @@ +#pragma once + +#include "population.h" +#include "propagation.h" + +#include "cuboid.h" +#include "concepts.h" +#include "population.h" + +#include + +namespace pattern { + +template +class SSS { +private: + Cuboid _cuboid; + + T* _base; + T* _buffer[population::q]; + T* _f[population::q]; + +public: + using value_t = T; + + SSS(Cuboid cuboid): + _cuboid(cuboid) + { + const std::size_t pagesize = sysconf(_SC_PAGESIZE); + const std::ptrdiff_t padding = population::max_offset(_cuboid); + _base = static_cast( + std::aligned_alloc(pagesize, population::q * (_cuboid.volume() + 2*padding) * sizeof(T))); + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _buffer[iPop] = _base + iPop * (_cuboid.volume() + 2*padding); + _f[iPop] = _buffer[iPop] + padding; + } + } + + ~SSS() { + delete [] _base; + } + + T* get(unsigned iPop, stage::pre_collision) { + return _f[iPop]; + } + + T* get(unsigned iPop, stage::post_collision) { + return _f[population::opposite(iPop)]; + } + + void stream() { + T* f_old[population::q]; + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_old[iPop] = _f[iPop]; + } + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _f[iPop] = f_old[population::opposite(iPop)] - population::offset(_cuboid, iPop); + } + } + +}; + +} diff --git a/src/population.h b/src/population.h new file mode 100644 index 0000000..14c9b77 --- /dev/null +++ b/src/population.h @@ -0,0 +1,41 @@ +#pragma once + +namespace population { + +constexpr unsigned d = 3; +constexpr unsigned q = 19; + +constexpr int velocity[q][d] = { + { 0, 1, 1}, {-1, 0, 1}, { 0, 0, 1}, { 1, 0, 1}, { 0,-1, 1}, + {-1, 1, 0}, { 0, 1, 0}, { 1, 1, 0}, {-1, 0, 0}, { 0, 0, 0}, { 1, 0, 0}, {-1,-1, 0}, { 0,-1, 0}, { 1,-1, 0}, + { 0, 1,-1}, {-1, 0,-1}, { 0, 0,-1}, { 1, 0,-1}, { 0,-1,-1} +}; + +unsigned opposite(unsigned iPop) { + return q - 1 - iPop; +} + +std::ptrdiff_t offset(const Cuboid& c, int iX, int iY, int iZ) { + return iX*c[1]*c[2] + iY*c[2] + iZ; +} + +std::ptrdiff_t offset(const Cuboid& c, unsigned iPop) { + return offset(c, velocity[iPop][0], velocity[iPop][1], velocity[iPop][2]); +} + +std::ptrdiff_t max_offset(const Cuboid& c) { + std::ptrdiff_t padding = 0; + for (unsigned iPop=0; iPop < q; ++iPop) { + padding = std::max(padding, std::abs(offset(c, iPop))); + } + return padding; +} + +} + +namespace stage { + +struct pre_collision { }; +struct post_collision { }; + +} diff --git a/src/propagation.h b/src/propagation.h new file mode 100644 index 0000000..0bcb435 --- /dev/null +++ b/src/propagation.h @@ -0,0 +1,28 @@ +#pragma once + +#include "cuboid.h" +#include "concepts.h" + +namespace concepts { + +template +concept CuboidConstructible = std::constructible_from; + +template +concept PopulationAccess = Arithmetic + && requires(BUFFER& b) { + { b.get(unsigned{}, stage::pre_collision()) } -> std::same_as; + { b.get(unsigned{}, stage::post_collision()) } -> std::same_as; +}; + +template +concept Propagatable = requires(BUFFER& b) { + b.stream(); +}; + +template +concept PropagatablePopulationBuffer = CuboidConstructible + && PopulationAccess + && Propagatable; + +} diff --git a/src/simd/256.h b/src/simd/256.h new file mode 100644 index 0000000..a3f419d --- /dev/null +++ b/src/simd/256.h @@ -0,0 +1,379 @@ +#pragma once + +#include + +#include +#include + +namespace simd { + +template +class Mask; + +template <> +class Mask { +private: + __m256i _reg; + +public: + using storage_t = std::uint64_t; + static constexpr unsigned storage_size = 1; + + static constexpr storage_t true_v = 1l << 63; + static constexpr storage_t false_v = 0l; + + static storage_t encode(bool value) { + return value ? true_v : false_v; + } + + static storage_t encode(bool* value) { + return encode(*value); + } + + Mask(bool a, bool b, bool c, bool d): + _reg(_mm256_set_epi64x(encode(d),encode(c),encode(b),encode(a))) { } + + Mask(std::uint64_t a, std::uint64_t b, std::uint64_t c, std::uint64_t d): + _reg(_mm256_set_epi64x(d,c,b,a)) { } + + Mask(std::uint64_t* ptr): + _reg(_mm256_loadu_si256(reinterpret_cast<__m256i*>(ptr))) { } + + Mask(storage_t* ptr, std::size_t iCell): + Mask(ptr + iCell) { } + + Mask(__m256i reg): + _reg(reg) { } + + operator __m256i() { + return _reg; + } + + __m256i neg() const { + return _mm256_sub_epi64(_mm256_set1_epi64x(true_v), _reg); + } + + operator bool() const { + const std::uint64_t* values = reinterpret_cast(&_reg); + return values[0] == true_v + || values[1] == true_v + || values[2] == true_v + || values[3] == true_v; + } +}; + +template <> +class Mask { +private: + __m256i _reg; + +public: + using storage_t = std::uint32_t; + static constexpr unsigned storage_size = 1; + + static constexpr storage_t true_v = 1 << 31; + static constexpr storage_t false_v = 0; + + static storage_t encode(bool value) { + return value ? true_v : false_v; + } + + static storage_t encode(bool* value) { + return encode(*value); + } + + Mask(storage_t* ptr): + _reg(_mm256_loadu_si256(reinterpret_cast<__m256i*>(ptr))) { } + + Mask(storage_t* ptr, std::size_t iCell): + Mask(ptr + iCell) { } + + Mask(__m256i reg): + _reg(reg) { } + + operator __m256i() { + return _reg; + } + + __m256i neg() const { + return _mm256_sub_epi32(_mm256_set1_epi32(true_v), _reg); + } + + operator bool() const { + const std::uint32_t* values = reinterpret_cast(&_reg); + return values[0] == true_v + || values[1] == true_v + || values[2] == true_v + || values[3] == true_v + || values[4] == true_v + || values[5] == true_v + || values[6] == true_v + || values[7] == true_v; + } +}; + + +template +class Pack; + +template <> +class Pack { +private: + __m256d _reg; + +public: + using mask_t = Mask; + using index_t = std::uint32_t; + + static constexpr std::size_t size = 4; + + Pack() = default; + + Pack(__m256d reg): + _reg(reg) { } + + Pack(double val): + Pack(_mm256_set1_pd(val)) { } + + Pack(double a, double b, double c, double d): + Pack(_mm256_set_pd(d,c,b,a)) { } + + Pack(double* ptr): + Pack(_mm256_loadu_pd(ptr)) { + _mm_prefetch(ptr + size, _MM_HINT_T2); + } + + Pack(double* ptr, index_t* idx): + Pack(_mm256_i32gather_pd(ptr, _mm_loadu_si128(reinterpret_cast<__m128i*>(idx)), sizeof(double))) { } + + operator __m256d() { + return _reg; + } + + Pack operator+(Pack rhs) { + return Pack(_mm256_add_pd(_reg, rhs)); + } + + Pack& operator+=(Pack rhs) { + _reg = _mm256_add_pd(_reg, rhs); + return *this; + } + + Pack operator-(Pack rhs) { + return Pack(_mm256_sub_pd(_reg, rhs)); + } + + Pack& operator-=(Pack rhs) { + _reg = _mm256_sub_pd(_reg, rhs); + return *this; + } + + Pack operator*(Pack rhs) { + return Pack(_mm256_mul_pd(_reg, rhs)); + } + + Pack& operator*=(Pack rhs) { + _reg = _mm256_mul_pd(_reg, rhs); + return *this; + } + + Pack operator/(Pack rhs) { + return Pack(_mm256_div_pd(_reg, rhs)); + } + + Pack& operator/=(Pack rhs) { + _reg = _mm256_div_pd(_reg, rhs); + return *this; + } + + Pack operator-() { + return *this * Pack(-1.); + } + + __m256d sqrt() { + return _mm256_sqrt_pd(_reg); + } +}; + +template <> +class Pack { +private: + __m256 _reg; + +public: + using mask_t = Mask; + using index_t = std::uint32_t; + + static constexpr std::size_t size = 8; + + Pack() = default; + + Pack(__m256 reg): + _reg(reg) { } + + Pack(float val): + Pack(_mm256_set1_ps(val)) { } + + Pack(float a, float b, float c, float d, float e, float f, float g, float h): + Pack(_mm256_set_ps(h,g,f,e,d,c,b,a)) { } + + Pack(float* ptr): + Pack(_mm256_loadu_ps(ptr)) { + _mm_prefetch(ptr + size, _MM_HINT_T2); + } + + Pack(float* ptr, index_t* idx): + Pack(_mm256_i32gather_ps(ptr, _mm256_loadu_si256(reinterpret_cast<__m256i*>(idx)), sizeof(float))) { } + + operator __m256() { + return _reg; + } + + Pack operator+(Pack rhs) { + return Pack(_mm256_add_ps(_reg, rhs)); + } + + Pack& operator+=(Pack rhs) { + _reg = _mm256_add_ps(_reg, rhs); + return *this; + } + + Pack operator-(Pack rhs) { + return Pack(_mm256_sub_ps(_reg, rhs)); + } + + Pack& operator-=(Pack rhs) { + _reg = _mm256_sub_ps(_reg, rhs); + return *this; + } + + Pack operator*(Pack rhs) { + return Pack(_mm256_mul_ps(_reg, rhs)); + } + + Pack& operator*=(Pack rhs) { + _reg = _mm256_mul_ps(_reg, rhs); + return *this; + } + + Pack operator/(Pack rhs) { + return Pack(_mm256_div_ps(_reg, rhs)); + } + + Pack& operator/=(Pack rhs) { + _reg = _mm256_div_ps(_reg, rhs); + return *this; + } + + Pack operator-() { + return *this * Pack(-1.); + } + + __m256 sqrt() { + return _mm256_sqrt_ps(_reg); + } +}; + + +template +Pack operator+(T lhs, Pack rhs) { + return Pack(lhs) + rhs; +} + +template +Pack operator+(Pack lhs, T rhs) { + return lhs + Pack(rhs); +} + +template +Pack operator-(T lhs, Pack