From a92271176a19e06611099c0eccc4e6a6887f4915 Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Mon, 17 May 2021 00:30:13 +0200 Subject: Extract public version of SweepLB --- src/LLBM/bounce_back.h | 30 +++ src/LLBM/bounce_back_moving_wall.h | 30 +++ src/LLBM/collect_moments.h | 25 +++ src/LLBM/collide.h | 89 ++++++++ src/LLBM/equilibrium_density_wall.h | 144 +++++++++++++ src/LLBM/equilibrium_velocity_wall.h | 144 +++++++++++++ src/LLBM/initialize.h | 53 +++++ src/LLBM/smagorinsky_collide.h | 129 ++++++++++++ src/LLBM/wall.h | 4 + src/concepts.h | 22 ++ src/cuboid.h | 37 ++++ src/export.h | 59 ++++++ src/lattice.h | 267 ++++++++++++++++++++++++ src/operator.h | 41 ++++ src/pattern/all.h | 5 + src/pattern/none.h | 48 +++++ src/pattern/ps.h | 123 ++++++++++++ src/pattern/sss.h | 63 ++++++ src/population.h | 41 ++++ src/propagation.h | 28 +++ src/simd/256.h | 379 +++++++++++++++++++++++++++++++++++ src/simd/512.h | 339 +++++++++++++++++++++++++++++++ 22 files changed, 2100 insertions(+) create mode 100644 src/LLBM/bounce_back.h create mode 100644 src/LLBM/bounce_back_moving_wall.h create mode 100644 src/LLBM/collect_moments.h create mode 100644 src/LLBM/collide.h create mode 100644 src/LLBM/equilibrium_density_wall.h create mode 100644 src/LLBM/equilibrium_velocity_wall.h create mode 100644 src/LLBM/initialize.h create mode 100644 src/LLBM/smagorinsky_collide.h create mode 100644 src/LLBM/wall.h create mode 100644 src/concepts.h create mode 100644 src/cuboid.h create mode 100644 src/export.h create mode 100644 src/lattice.h create mode 100644 src/operator.h create mode 100644 src/pattern/all.h create mode 100644 src/pattern/none.h create mode 100644 src/pattern/ps.h create mode 100644 src/pattern/sss.h create mode 100644 src/population.h create mode 100644 src/propagation.h create mode 100644 src/simd/256.h create mode 100644 src/simd/512.h (limited to 'src') diff --git a/src/LLBM/bounce_back.h b/src/LLBM/bounce_back.h new file mode 100644 index 0000000..b710ee4 --- /dev/null +++ b/src/LLBM/bounce_back.h @@ -0,0 +1,30 @@ +#pragma once + +#include "concepts.h" + +struct BounceBackO { + +template +static void apply(V f_curr[19], V f_next[19]) { + f_next[0] = f_curr[18]; + f_next[1] = f_curr[17]; + f_next[2] = f_curr[16]; + f_next[3] = f_curr[15]; + f_next[4] = f_curr[14]; + f_next[5] = f_curr[13]; + f_next[6] = f_curr[12]; + f_next[7] = f_curr[11]; + f_next[8] = f_curr[10]; + f_next[9] = f_curr[9]; + f_next[10] = f_curr[8]; + f_next[11] = f_curr[7]; + f_next[12] = f_curr[6]; + f_next[13] = f_curr[5]; + f_next[14] = f_curr[4]; + f_next[15] = f_curr[3]; + f_next[16] = f_curr[2]; + f_next[17] = f_curr[1]; + f_next[18] = f_curr[0]; +} + +}; diff --git a/src/LLBM/bounce_back_moving_wall.h b/src/LLBM/bounce_back_moving_wall.h new file mode 100644 index 0000000..c00a17e --- /dev/null +++ b/src/LLBM/bounce_back_moving_wall.h @@ -0,0 +1,30 @@ +#pragma once + +#include "concepts.h" + +struct BounceBackMovingWallO { + +template +static void apply(V f_curr[19], V f_next[19], T u[3]) { + f_next[0] = f_curr[18] + T{0.166666666666667}*u[1] + T{0.166666666666667}*u[2]; + f_next[1] = f_curr[17] - T{0.166666666666667}*u[0] + T{0.166666666666667}*u[2]; + f_next[2] = f_curr[16] + T{0.333333333333333}*u[2]; + f_next[3] = f_curr[15] + T{0.166666666666667}*u[0] + T{0.166666666666667}*u[2]; + f_next[4] = f_curr[14] - T{0.166666666666667}*u[1] + T{0.166666666666667}*u[2]; + f_next[5] = f_curr[13] - T{0.166666666666667}*u[0] + T{0.166666666666667}*u[1]; + f_next[6] = f_curr[12] + T{0.333333333333333}*u[1]; + 
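+  // The remaining directions follow the same pattern: bounce back the
+  // opposite population and add the moving-wall correction
+  // 2*w_i*(c_i . u)/c_s^2, which for D3Q19 (c_s^2 = 1/3) yields the
+  // prefactors 1/3 (axis) and 1/6 (diagonal) seen here; the wall density
+  // is taken as unity.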
f_next[7] = f_curr[11] + T{0.166666666666667}*u[0] + T{0.166666666666667}*u[1]; + f_next[8] = f_curr[10] - T{0.333333333333333}*u[0]; + f_next[9] = f_curr[9]; + f_next[10] = f_curr[8] + T{0.333333333333333}*u[0]; + f_next[11] = f_curr[7] - T{0.166666666666667}*u[0] - T{0.166666666666667}*u[1]; + f_next[12] = f_curr[6] - T{0.333333333333333}*u[1]; + f_next[13] = f_curr[5] + T{0.166666666666667}*u[0] - T{0.166666666666667}*u[1]; + f_next[14] = f_curr[4] + T{0.166666666666667}*u[1] - T{0.166666666666667}*u[2]; + f_next[15] = f_curr[3] - T{0.166666666666667}*u[0] - T{0.166666666666667}*u[2]; + f_next[16] = f_curr[2] - T{0.333333333333333}*u[2]; + f_next[17] = f_curr[1] + T{0.166666666666667}*u[0] - T{0.166666666666667}*u[2]; + f_next[18] = f_curr[0] - T{0.166666666666667}*u[1] - T{0.166666666666667}*u[2]; +} + +}; diff --git a/src/LLBM/collect_moments.h b/src/LLBM/collect_moments.h new file mode 100644 index 0000000..afd5d59 --- /dev/null +++ b/src/LLBM/collect_moments.h @@ -0,0 +1,25 @@ +#pragma once + +#include "concepts.h" + +struct CollectMomentsF { + +static constexpr unsigned size = 4; + +template +static void observe(V f_curr[19], V* momenta) { + V m0 = f_curr[10] + f_curr[13] + f_curr[17]; + V m1 = f_curr[14] + f_curr[5] + f_curr[6]; + V m2 = f_curr[1] + f_curr[2] + f_curr[4]; + V m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9]; + V m4 = -f_curr[11] + f_curr[7]; + V m5 = -f_curr[15] + f_curr[3]; + V m6 = V{1}/m3; + V m7 = f_curr[0] - f_curr[18]; + momenta[0] = m3; + momenta[1] = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]); + momenta[2] = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]); + momenta[3] = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]); +} + +}; diff --git a/src/LLBM/collide.h b/src/LLBM/collide.h new file mode 100644 index 0000000..c08aade --- /dev/null +++ b/src/LLBM/collide.h @@ -0,0 +1,89 @@ +#pragma once + +#include "concepts.h" + +struct BgkCollideO { + +template +static void apply(V f_curr[19], V f_next[19], T tau) { + V rho; + V u[3]; + V m0 = f_curr[10] + f_curr[13] + f_curr[17]; + V m1 = f_curr[14] + f_curr[5] + f_curr[6]; + V m2 = f_curr[1] + f_curr[2] + f_curr[4]; + V m3 = m0 + m1 + m2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9]; + V m4 = -f_curr[11] + f_curr[7]; + V m5 = -f_curr[15] + f_curr[3]; + V m6 = T{1}/m3; + V m7 = f_curr[0] - f_curr[18]; + rho = m3; + u[0] = m6*(m0 + m4 + m5 - f_curr[1] - f_curr[5] - f_curr[8]); + u[1] = m6*(m1 + m4 + m7 - f_curr[12] - f_curr[13] - f_curr[4]); + u[2] = m6*(m2 + m5 + m7 - f_curr[14] - f_curr[16] - f_curr[17]); + V x0 = T{1}/tau; + V x1 = T{0.0138888888888889}*x0; + V x2 = u[1] + u[2]; + V x3 = T{9.00000000000000}*(x2*x2); + V x4 = T{6.00000000000000}*u[2]; + V x5 = u[1]*u[1]; + V x6 = T{3.00000000000000}*x5; + V x7 = -x6; + V x8 = u[0]*u[0]; + V x9 = T{3.00000000000000}*x8; + V x10 = T{2.00000000000000} - x9; + V x11 = x10 + x7; + V x12 = x11 + x4; + V x13 = u[2]*u[2]; + V x14 = T{3.00000000000000}*x13; + V x15 = -x14; + V x16 = T{6.00000000000000}*u[1]; + V x17 = x15 + x16; + V x18 = T{6.00000000000000}*u[0]; + V x19 = -u[0]; + V x20 = x19 + u[2]; + V x21 = x14 + x6 + T{-2.00000000000000}; + V x22 = x21 + x9; + V x23 = x22 - x4; + V x24 = T{0.0277777777777778}*x0; + V x25 = T{6.00000000000000}*x13; + V x26 = u[0] + u[2]; + V x27 = T{9.00000000000000}*(x26*x26); + V x28 = x15 + x18; + V x29 = -u[1]; + V x30 = x29 + u[2]; 
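+  // x0..x44 are common subexpressions of the D3Q19 BGK update
+  // f_next[i] = f_curr[i] + (f_i^eq - f_curr[i])/tau with
+  // f_i^eq = w_i*rho*(1 + 3*(c_i . u) + 4.5*(c_i . u)^2 - 1.5*(u . u));
+  // x1 = 1/(72*tau) and x24 = 1/(36*tau) absorb 1/tau together with the
+  // lattice weights (the equilibrium polynomials are scaled to match).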
+ V x31 = x19 + u[1]; + V x32 = -x16; + V x33 = x18 + x22; + V x34 = T{6.00000000000000}*x5; + V x35 = u[0] + u[1]; + V x36 = T{9.00000000000000}*(x35*x35); + V x37 = T{6.00000000000000}*x8; + V x38 = x9 + T{-2.00000000000000}; + V x39 = x29 + u[0]; + V x40 = -x18 + x22; + V x41 = -u[2]; + V x42 = x41 + u[1]; + V x43 = x22 + x4; + V x44 = x41 + u[0]; + f_next[0] = x1*(rho*(x12 + x17 + x3) - T{72.0000000000000}*f_curr[0]) + f_curr[0]; + f_next[1] = -x1*(rho*(x18 + x23 - T{9.00000000000000}*x20*x20) + T{72.0000000000000}*f_curr[1]) + f_curr[1]; + f_next[2] = x24*(rho*(x12 + x25) - T{36.0000000000000}*f_curr[2]) + f_curr[2]; + f_next[3] = x1*(rho*(x12 + x27 + x28) - T{72.0000000000000}*f_curr[3]) + f_curr[3]; + f_next[4] = -x1*(rho*(x16 + x23 - T{9.00000000000000}*x30*x30) + T{72.0000000000000}*f_curr[4]) + f_curr[4]; + f_next[5] = -x1*(rho*(x32 + x33 - T{9.00000000000000}*x31*x31) + T{72.0000000000000}*f_curr[5]) + f_curr[5]; + f_next[6] = x24*(rho*(x10 + x17 + x34) - T{36.0000000000000}*f_curr[6]) + f_curr[6]; + f_next[7] = x1*(rho*(x11 + x17 + x18 + x36) - T{72.0000000000000}*f_curr[7]) + f_curr[7]; + f_next[8] = -x24*(rho*(x18 + x21 - x37) + T{36.0000000000000}*f_curr[8]) + f_curr[8]; + f_next[9] = -T{0.166666666666667}*x0*(rho*x22 + T{6.00000000000000}*f_curr[9]) + f_curr[9]; + f_next[10] = x24*(rho*(x28 + x37 + x7 + T{2.00000000000000}) - T{36.0000000000000}*f_curr[10]) + f_curr[10]; + f_next[11] = -x1*(rho*(x16 + x33 - x36) + T{72.0000000000000}*f_curr[11]) + f_curr[11]; + f_next[12] = -x24*(rho*(x14 + x16 - x34 + x38) + T{36.0000000000000}*f_curr[12]) + f_curr[12]; + f_next[13] = -x1*(rho*(x16 + x40 - T{9.00000000000000}*x39*x39) + T{72.0000000000000}*f_curr[13]) + f_curr[13]; + f_next[14] = -x1*(rho*(x32 + x43 - T{9.00000000000000}*x42*x42) + T{72.0000000000000}*f_curr[14]) + f_curr[14]; + f_next[15] = -x1*(rho*(-x27 + x33 + x4) + T{72.0000000000000}*f_curr[15]) + f_curr[15]; + f_next[16] = -x24*(rho*(-x25 + x38 + x4 + x6) + T{36.0000000000000}*f_curr[16]) + f_curr[16]; + f_next[17] = -x1*(rho*(x4 + x40 - T{9.00000000000000}*x44*x44) + T{72.0000000000000}*f_curr[17]) + f_curr[17]; + f_next[18] = -x1*(rho*(x16 - x3 + x43) + T{72.0000000000000}*f_curr[18]) + f_curr[18]; +} + +}; diff --git a/src/LLBM/equilibrium_density_wall.h b/src/LLBM/equilibrium_density_wall.h new file mode 100644 index 0000000..598d59d --- /dev/null +++ b/src/LLBM/equilibrium_density_wall.h @@ -0,0 +1,144 @@ +#pragma once + +#include "wall.h" +#include "concepts.h" + +struct EquilibriumDensityWallO { + +template +static void apply(V f_curr[19], V f_next[19], T rho_w, WallNormal<1,0,0>) { + V u_w = (rho_w - f_curr[0] - T{2.00000000000000}*f_curr[11] - f_curr[12] - f_curr[14] - T{2.00000000000000}*f_curr[15] - f_curr[16] - f_curr[18] - T{2.00000000000000}*f_curr[1] - f_curr[2] - f_curr[4] - T{2.00000000000000}*f_curr[5] - f_curr[6] - T{2.00000000000000}*f_curr[8] - f_curr[9])/rho_w; + V rho = rho_w; + V u[3] { u_w, 0., 0. 
}; + V e0 = T{0.0277777777777778}*rho; + V e1 = u[1] + u[2]; + V e2 = T{4.50000000000000}*(e1*e1); + V e3 = T{3.00000000000000}*u[2]; + V e4 = u[1]*u[1]; + V e5 = T{1.50000000000000}*e4; + V e6 = -e5; + V e7 = u[0]*u[0]; + V e8 = T{1.50000000000000}*e7; + V e9 = T{1.00000000000000} - e8; + V e10 = e6 + e9; + V e11 = e10 + e3; + V e12 = T{3.00000000000000}*u[1]; + V e13 = u[2]*u[2]; + V e14 = T{1.50000000000000}*e13; + V e15 = -e14; + V e16 = e12 + e15; + V e17 = T{3.00000000000000}*u[0]; + V e18 = -u[2]; + V e19 = e18 + u[0]; + V e20 = -T{4.50000000000000}*e19*e19; + V e21 = e14 + e5 + T{-1.00000000000000}; + V e22 = e21 + e8; + V e23 = e22 - e3; + V e24 = T{0.0555555555555556}*rho; + V e25 = T{3.00000000000000}*e13; + V e26 = u[0] + u[2]; + V e27 = T{4.50000000000000}*(e26*e26); + V e28 = e15 + e17; + V e29 = e18 + u[1]; + V e30 = -T{4.50000000000000}*e29*e29; + V e31 = -e12; + V e32 = u[0] - u[1]; + V e33 = -T{4.50000000000000}*e32*e32; + V e34 = e17 + e22; + V e35 = T{3.00000000000000}*e4; + V e36 = u[0] + u[1]; + V e37 = T{4.50000000000000}*(e36*e36); + V e38 = T{3.00000000000000}*e7; + V e39 = e8 + T{-1.00000000000000}; + V e40 = -e17 + e22; + V e41 = e22 + e3; + f_next[0] = e0*(e11 + e16 + e2); + f_next[1] = -e0*(e17 + e20 + e23); + f_next[2] = e24*(e11 + e25); + f_next[3] = e0*(e11 + e27 + e28); + f_next[4] = -e0*(e12 + e23 + e30); + f_next[5] = -e0*(e31 + e33 + e34); + f_next[6] = e24*(e16 + e35 + e9); + f_next[7] = e0*(e10 + e16 + e17 + e37); + f_next[8] = -e24*(e17 + e21 - e38); + f_next[9] = -T{0.333333333333333}*e22*rho; + f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000}); + f_next[11] = -e0*(e12 + e34 - e37); + f_next[12] = -e24*(e12 + e14 - e35 + e39); + f_next[13] = -e0*(e12 + e33 + e40); + f_next[14] = -e0*(e30 + e31 + e41); + f_next[15] = -e0*(-e27 + e3 + e34); + f_next[16] = -e24*(-e25 + e3 + e39 + e5); + f_next[17] = -e0*(e20 + e3 + e40); + f_next[18] = -e0*(e12 - e2 + e41); +} + +template +static void apply(V f_curr[19], V f_next[19], T rho_w, WallNormal<-1,0,0>) { + V u_w = (-rho_w + f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/rho_w; + V rho = rho_w; + V u[3] { u_w, 0., 0. 
}; + V e0 = T{0.0277777777777778}*rho; + V e1 = u[1] + u[2]; + V e2 = T{4.50000000000000}*(e1*e1); + V e3 = T{3.00000000000000}*u[2]; + V e4 = u[1]*u[1]; + V e5 = T{1.50000000000000}*e4; + V e6 = -e5; + V e7 = u[0]*u[0]; + V e8 = T{1.50000000000000}*e7; + V e9 = T{1.00000000000000} - e8; + V e10 = e6 + e9; + V e11 = e10 + e3; + V e12 = T{3.00000000000000}*u[1]; + V e13 = u[2]*u[2]; + V e14 = T{1.50000000000000}*e13; + V e15 = -e14; + V e16 = e12 + e15; + V e17 = T{3.00000000000000}*u[0]; + V e18 = -u[2]; + V e19 = e18 + u[0]; + V e20 = -T{4.50000000000000}*e19*e19; + V e21 = e14 + e5 + T{-1.00000000000000}; + V e22 = e21 + e8; + V e23 = e22 - e3; + V e24 = T{0.0555555555555556}*rho; + V e25 = T{3.00000000000000}*e13; + V e26 = u[0] + u[2]; + V e27 = T{4.50000000000000}*(e26*e26); + V e28 = e15 + e17; + V e29 = e18 + u[1]; + V e30 = -T{4.50000000000000}*e29*e29; + V e31 = -e12; + V e32 = u[0] - u[1]; + V e33 = -T{4.50000000000000}*e32*e32; + V e34 = e17 + e22; + V e35 = T{3.00000000000000}*e4; + V e36 = u[0] + u[1]; + V e37 = T{4.50000000000000}*(e36*e36); + V e38 = T{3.00000000000000}*e7; + V e39 = e8 + T{-1.00000000000000}; + V e40 = -e17 + e22; + V e41 = e22 + e3; + f_next[0] = e0*(e11 + e16 + e2); + f_next[1] = -e0*(e17 + e20 + e23); + f_next[2] = e24*(e11 + e25); + f_next[3] = e0*(e11 + e27 + e28); + f_next[4] = -e0*(e12 + e23 + e30); + f_next[5] = -e0*(e31 + e33 + e34); + f_next[6] = e24*(e16 + e35 + e9); + f_next[7] = e0*(e10 + e16 + e17 + e37); + f_next[8] = -e24*(e17 + e21 - e38); + f_next[9] = -T{0.333333333333333}*e22*rho; + f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000}); + f_next[11] = -e0*(e12 + e34 - e37); + f_next[12] = -e24*(e12 + e14 - e35 + e39); + f_next[13] = -e0*(e12 + e33 + e40); + f_next[14] = -e0*(e30 + e31 + e41); + f_next[15] = -e0*(-e27 + e3 + e34); + f_next[16] = -e24*(-e25 + e3 + e39 + e5); + f_next[17] = -e0*(e20 + e3 + e40); + f_next[18] = -e0*(e12 - e2 + e41); +} + +}; diff --git a/src/LLBM/equilibrium_velocity_wall.h b/src/LLBM/equilibrium_velocity_wall.h new file mode 100644 index 0000000..2d68e34 --- /dev/null +++ b/src/LLBM/equilibrium_velocity_wall.h @@ -0,0 +1,144 @@ +#pragma once + +#include "wall.h" +#include "concepts.h" + +struct EquilibriumVelocityWallO { + +template +static void apply(V f_curr[19], V f_next[19], T u_w, WallNormal<1,0,0>) { + V rho; + rho = -(f_curr[0] + T{2.00000000000000}*f_curr[11] + f_curr[12] + f_curr[14] + T{2.00000000000000}*f_curr[15] + f_curr[16] + f_curr[18] + T{2.00000000000000}*f_curr[1] + f_curr[2] + f_curr[4] + T{2.00000000000000}*f_curr[5] + f_curr[6] + T{2.00000000000000}*f_curr[8] + f_curr[9])/(u_w + T{-1.00000000000000}); + V u[3] { u_w, 0., 0. 
}; + V e0 = T{0.0277777777777778}*rho; + V e1 = u[1] + u[2]; + V e2 = T{4.50000000000000}*(e1*e1); + V e3 = T{3.00000000000000}*u[2]; + V e4 = u[1]*u[1]; + V e5 = T{1.50000000000000}*e4; + V e6 = -e5; + V e7 = u[0]*u[0]; + V e8 = T{1.50000000000000}*e7; + V e9 = T{1.00000000000000} - e8; + V e10 = e6 + e9; + V e11 = e10 + e3; + V e12 = T{3.00000000000000}*u[1]; + V e13 = u[2]*u[2]; + V e14 = T{1.50000000000000}*e13; + V e15 = -e14; + V e16 = e12 + e15; + V e17 = T{3.00000000000000}*u[0]; + V e18 = -u[2]; + V e19 = e18 + u[0]; + V e20 = -T{4.50000000000000}*e19*e19; + V e21 = e14 + e5 + T{-1.00000000000000}; + V e22 = e21 + e8; + V e23 = e22 - e3; + V e24 = T{0.0555555555555556}*rho; + V e25 = T{3.00000000000000}*e13; + V e26 = u[0] + u[2]; + V e27 = T{4.50000000000000}*(e26*e26); + V e28 = e15 + e17; + V e29 = e18 + u[1]; + V e30 = -T{4.50000000000000}*e29*e29; + V e31 = -e12; + V e32 = u[0] - u[1]; + V e33 = -T{4.50000000000000}*e32*e32; + V e34 = e17 + e22; + V e35 = T{3.00000000000000}*e4; + V e36 = u[0] + u[1]; + V e37 = T{4.50000000000000}*(e36*e36); + V e38 = T{3.00000000000000}*e7; + V e39 = e8 + T{-1.00000000000000}; + V e40 = -e17 + e22; + V e41 = e22 + e3; + f_next[0] = e0*(e11 + e16 + e2); + f_next[1] = -e0*(e17 + e20 + e23); + f_next[2] = e24*(e11 + e25); + f_next[3] = e0*(e11 + e27 + e28); + f_next[4] = -e0*(e12 + e23 + e30); + f_next[5] = -e0*(e31 + e33 + e34); + f_next[6] = e24*(e16 + e35 + e9); + f_next[7] = e0*(e10 + e16 + e17 + e37); + f_next[8] = -e24*(e17 + e21 - e38); + f_next[9] = -T{0.333333333333333}*e22*rho; + f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000}); + f_next[11] = -e0*(e12 + e34 - e37); + f_next[12] = -e24*(e12 + e14 - e35 + e39); + f_next[13] = -e0*(e12 + e33 + e40); + f_next[14] = -e0*(e30 + e31 + e41); + f_next[15] = -e0*(-e27 + e3 + e34); + f_next[16] = -e24*(-e25 + e3 + e39 + e5); + f_next[17] = -e0*(e20 + e3 + e40); + f_next[18] = -e0*(e12 - e2 + e41); +} + +template +static void apply(V f_curr[19], V f_next[19], T u_w, WallNormal<-1,0,0>) { + V rho; + rho = (f_curr[0] + T{2.00000000000000}*f_curr[10] + f_curr[12] + T{2.00000000000000}*f_curr[13] + f_curr[14] + f_curr[16] + T{2.00000000000000}*f_curr[17] + f_curr[18] + f_curr[2] + T{2.00000000000000}*f_curr[3] + f_curr[4] + f_curr[6] + T{2.00000000000000}*f_curr[7] + f_curr[9])/(u_w + T{1.00000000000000}); + V u[3] { u_w, 0., 0. 
}; + V e0 = T{0.0277777777777778}*rho; + V e1 = u[1] + u[2]; + V e2 = T{4.50000000000000}*(e1*e1); + V e3 = T{3.00000000000000}*u[2]; + V e4 = u[1]*u[1]; + V e5 = T{1.50000000000000}*e4; + V e6 = -e5; + V e7 = u[0]*u[0]; + V e8 = T{1.50000000000000}*e7; + V e9 = T{1.00000000000000} - e8; + V e10 = e6 + e9; + V e11 = e10 + e3; + V e12 = T{3.00000000000000}*u[1]; + V e13 = u[2]*u[2]; + V e14 = T{1.50000000000000}*e13; + V e15 = -e14; + V e16 = e12 + e15; + V e17 = T{3.00000000000000}*u[0]; + V e18 = -u[2]; + V e19 = e18 + u[0]; + V e20 = -T{4.50000000000000}*e19*e19; + V e21 = e14 + e5 + T{-1.00000000000000}; + V e22 = e21 + e8; + V e23 = e22 - e3; + V e24 = T{0.0555555555555556}*rho; + V e25 = T{3.00000000000000}*e13; + V e26 = u[0] + u[2]; + V e27 = T{4.50000000000000}*(e26*e26); + V e28 = e15 + e17; + V e29 = e18 + u[1]; + V e30 = -T{4.50000000000000}*e29*e29; + V e31 = -e12; + V e32 = u[0] - u[1]; + V e33 = -T{4.50000000000000}*e32*e32; + V e34 = e17 + e22; + V e35 = T{3.00000000000000}*e4; + V e36 = u[0] + u[1]; + V e37 = T{4.50000000000000}*(e36*e36); + V e38 = T{3.00000000000000}*e7; + V e39 = e8 + T{-1.00000000000000}; + V e40 = -e17 + e22; + V e41 = e22 + e3; + f_next[0] = e0*(e11 + e16 + e2); + f_next[1] = -e0*(e17 + e20 + e23); + f_next[2] = e24*(e11 + e25); + f_next[3] = e0*(e11 + e27 + e28); + f_next[4] = -e0*(e12 + e23 + e30); + f_next[5] = -e0*(e31 + e33 + e34); + f_next[6] = e24*(e16 + e35 + e9); + f_next[7] = e0*(e10 + e16 + e17 + e37); + f_next[8] = -e24*(e17 + e21 - e38); + f_next[9] = -T{0.333333333333333}*e22*rho; + f_next[10] = e24*(e28 + e38 + e6 + T{1.00000000000000}); + f_next[11] = -e0*(e12 + e34 - e37); + f_next[12] = -e24*(e12 + e14 - e35 + e39); + f_next[13] = -e0*(e12 + e33 + e40); + f_next[14] = -e0*(e30 + e31 + e41); + f_next[15] = -e0*(-e27 + e3 + e34); + f_next[16] = -e24*(-e25 + e3 + e39 + e5); + f_next[17] = -e0*(e20 + e3 + e40); + f_next[18] = -e0*(e12 - e2 + e41); +} + +}; diff --git a/src/LLBM/initialize.h b/src/LLBM/initialize.h new file mode 100644 index 0000000..b361217 --- /dev/null +++ b/src/LLBM/initialize.h @@ -0,0 +1,53 @@ +#pragma once + +#include "concepts.h" + +struct InitializeO { + +template +static void apply(V f_curr[19], V f_next[19]) { + f_next[0] = T{0.0277777777777778}; + f_next[1] = T{0.0277777777777778}; + f_next[2] = T{0.0555555555555556}; + f_next[3] = T{0.0277777777777778}; + f_next[4] = T{0.0277777777777778}; + f_next[5] = T{0.0277777777777778}; + f_next[6] = T{0.0555555555555556}; + f_next[7] = T{0.0277777777777778}; + f_next[8] = T{0.0555555555555556}; + f_next[9] = T{0.333333333333333}; + f_next[10] = T{0.0555555555555556}; + f_next[11] = T{0.0277777777777778}; + f_next[12] = T{0.0555555555555556}; + f_next[13] = T{0.0277777777777778}; + f_next[14] = T{0.0277777777777778}; + f_next[15] = T{0.0277777777777778}; + f_next[16] = T{0.0555555555555556}; + f_next[17] = T{0.0277777777777778}; + f_next[18] = T{0.0277777777777778}; +} + +template +static void apply(V f_curr[19], V f_next[19], T scale) { + f_next[0] = T{scale * 0.0277777777777778}; + f_next[1] = T{scale * 0.0277777777777778}; + f_next[2] = T{scale * 0.0555555555555556}; + f_next[3] = T{scale * 0.0277777777777778}; + f_next[4] = T{scale * 0.0277777777777778}; + f_next[5] = T{scale * 0.0277777777777778}; + f_next[6] = T{scale * 0.0555555555555556}; + f_next[7] = T{scale * 0.0277777777777778}; + f_next[8] = T{scale * 0.0555555555555556}; + f_next[9] = T{scale * 0.333333333333333}; + f_next[10] = T{scale * 0.0555555555555556}; + f_next[11] = T{scale * 0.0277777777777778}; + 
f_next[12] = T{scale * 0.0555555555555556}; + f_next[13] = T{scale * 0.0277777777777778}; + f_next[14] = T{scale * 0.0277777777777778}; + f_next[15] = T{scale * 0.0277777777777778}; + f_next[16] = T{scale * 0.0555555555555556}; + f_next[17] = T{scale * 0.0277777777777778}; + f_next[18] = T{scale * 0.0277777777777778}; +} + +}; diff --git a/src/LLBM/smagorinsky_collide.h b/src/LLBM/smagorinsky_collide.h new file mode 100644 index 0000000..1a90598 --- /dev/null +++ b/src/LLBM/smagorinsky_collide.h @@ -0,0 +1,129 @@ +#pragma once + +#include "concepts.h" + +struct SmagorinskyBgkCollideO { + +template +static void apply(V f_curr[19], V f_next[19], T tau, T smagorinsky) { + V rho; + V u[3]; + V x0 = f_curr[10] + f_curr[13] + f_curr[17]; + V x1 = f_curr[14] + f_curr[5] + f_curr[6]; + V x2 = f_curr[1] + f_curr[2] + f_curr[4]; + V x3 = x0 + x1 + x2 + f_curr[0] + f_curr[11] + f_curr[12] + f_curr[15] + f_curr[16] + f_curr[18] + f_curr[3] + f_curr[7] + f_curr[8] + f_curr[9]; + V x4 = -f_curr[11] + f_curr[7]; + V x5 = -f_curr[15] + f_curr[3]; + V x6 = T{1}/x3; + V x7 = f_curr[0] - f_curr[18]; + V x8 = T{2.00000000000000}*tau + T{-1.00000000000000}; + V x9 = T{72.0000000000000}*f_curr[5]; + V x26 = T{72.0000000000000}*f_curr[13]; + V x33 = T{72.0000000000000}*f_curr[11]; + V x44 = T{72.0000000000000}*f_curr[1]; + V x50 = T{72.0000000000000}*f_curr[17]; + V x56 = T{72.0000000000000}*f_curr[15]; + V x64 = T{72.0000000000000}*f_curr[4]; + V x68 = T{72.0000000000000}*f_curr[14]; + V x72 = T{72.0000000000000}*f_curr[18]; + rho = x3; + V x78 = T{2.00000000000000}*rho; + u[0] = x6*(x0 + x4 + x5 - f_curr[1] - f_curr[5] - f_curr[8]); + V x12 = -u[0]; + V x14 = T{6.00000000000000}*u[0]; + V x17 = u[0]*u[0]; + V x18 = T{3.00000000000000}*x17; + V x27 = -x14; + V x38 = T{2.00000000000000} - x18; + V x79 = T{6.00000000000000}*x17; + u[1] = x6*(x1 + x4 + x7 - f_curr[12] - f_curr[13] - f_curr[4]); + V x10 = T{6.00000000000000}*u[1]; + V x11 = -x10; + V x13 = x12 + u[1]; + V x19 = u[1]*u[1]; + V x20 = T{3.00000000000000}*x19; + V x21 = x18 + x20 + T{-2.00000000000000}; + V x28 = -u[1]; + V x29 = x28 + u[0]; + V x34 = u[0] + u[1]; + V x35 = T{9.00000000000000}*(x34*x34); + V x37 = -x20; + V x39 = x37 + x38; + V x85 = T{6.00000000000000}*x19; + u[2] = x6*(x2 + x5 + x7 - f_curr[14] - f_curr[16] - f_curr[17]); + V x15 = u[2]*u[2]; + V x16 = T{3.00000000000000}*x15; + V x22 = x16 + x21; + V x23 = x14 + x22; + V x24 = rho*(x11 + x23 - T{9.00000000000000}*x13*x13); + V x25 = x24 + x9; + V x30 = x10 + x22; + V x31 = rho*(x27 + x30 - T{9.00000000000000}*x29*x29); + V x32 = x26 + x31; + V x36 = rho*(x14 + x30 - x35); + V x40 = -x16; + V x41 = x10 + x40; + V x42 = rho*(x14 + x35 + x39 + x41) - T{72.0000000000000}*f_curr[7]; + V x43 = -x33 - x36 + x42; + V x45 = T{6.00000000000000}*u[2]; + V x46 = -x45; + V x47 = x12 + u[2]; + V x48 = rho*(x23 + x46 - T{9.00000000000000}*x47*x47); + V x49 = x44 + x48; + V x51 = -u[2]; + V x52 = x51 + u[0]; + V x53 = x22 + x45; + V x54 = rho*(x27 + x53 - T{9.00000000000000}*x52*x52); + V x55 = x50 + x54; + V x57 = u[0] + u[2]; + V x58 = T{9.00000000000000}*(x57*x57); + V x59 = rho*(x14 + x53 - x58); + V x60 = x39 + x45; + V x61 = x14 + x40; + V x62 = rho*(x58 + x60 + x61) - T{72.0000000000000}*f_curr[3]; + V x63 = -x56 - x59 + x62; + V x65 = x28 + u[2]; + V x66 = rho*(x30 + x46 - T{9.00000000000000}*x65*x65); + V x67 = x64 + x66; + V x69 = x51 + u[1]; + V x70 = rho*(x11 + x53 - T{9.00000000000000}*x69*x69); + V x71 = x68 + x70; + V x73 = u[1] + u[2]; + V x74 = T{9.00000000000000}*(x73*x73); + V 
x75 = rho*(x30 + x45 - x74); + V x76 = rho*(x41 + x60 + x74) - T{72.0000000000000}*f_curr[0]; + V x77 = -x72 - x75 + x76; + V x80 = x16 + T{-2.00000000000000}; + V x81 = x14 + x20 - x79 + x80; + V x82 = x37 + x61 + x79 + T{2.00000000000000}; + V x83 = -x44 - x48 - x50 - x54 + x63; + V x84 = -x24 - x26 - x31 + x43 - x9; + V x86 = x10 + x18 + x80 - x85; + V x87 = x38 + x41 + x85; + V x88 = -x64 - x66 - x68 - x70 + x77; + V x89 = T{6.00000000000000}*x15; + V x90 = x21 + x45 - x89; + V x91 = x60 + x89; + V x92 = T{1}/(T{20.0000000000000}*tau + T{2.00000000000000} + T{5.04537849152229}*simd::sqrt((smagorinsky*smagorinsky)*simd::sqrt((x25 + x32 + x43)*(x25 + x32 + x43) + (x49 + x55 + x63)*(x49 + x55 + x63) + (x67 + x71 + x77)*(x67 + x71 + x77) + T{0.500000000000000}*((-x78*x81 + x78*x82 + x83 + x84 - T{72}*f_curr[10] - T{72}*f_curr[8])*(-x78*x81 + x78*x82 + x83 + x84 - T{72}*f_curr[10] - T{72}*f_curr[8])) + T{0.500000000000000}*((-x78*x86 + x78*x87 + x84 + x88 - T{72}*f_curr[12] - T{72}*f_curr[6])*(-x78*x86 + x78*x87 + x84 + x88 - T{72}*f_curr[12] - T{72}*f_curr[6])) + T{0.500000000000000}*((-x78*x90 + x78*x91 + x83 + x88 - T{72}*f_curr[16] - T{72}*f_curr[2])*(-x78*x90 + x78*x91 + x83 + x88 - T{72}*f_curr[16] - T{72}*f_curr[2]))) + T{0.157134840263677}*(x8*x8))); + f_next[0] = T{0.333333333333333}*x76*x92 + f_curr[0]; + f_next[1] = -T{0.333333333333333}*x49*x92 + f_curr[1]; + f_next[2] = T{0.666666666666667}*x92*(rho*x91 - T{36.0000000000000}*f_curr[2]) + f_curr[2]; + f_next[3] = T{0.333333333333333}*x62*x92 + f_curr[3]; + f_next[4] = -T{0.333333333333333}*x67*x92 + f_curr[4]; + f_next[5] = -T{0.333333333333333}*x25*x92 + f_curr[5]; + f_next[6] = T{0.666666666666667}*x92*(rho*x87 - T{36.0000000000000}*f_curr[6]) + f_curr[6]; + f_next[7] = T{0.333333333333333}*x42*x92 + f_curr[7]; + f_next[8] = -T{0.666666666666667}*x92*(rho*x81 + T{36.0000000000000}*f_curr[8]) + f_curr[8]; + f_next[9] = -T{4.00000000000000}*x92*(rho*x22 + T{6.00000000000000}*f_curr[9]) + f_curr[9]; + f_next[10] = T{0.666666666666667}*x92*(rho*x82 - T{36.0000000000000}*f_curr[10]) + f_curr[10]; + f_next[11] = -T{0.333333333333333}*x92*(x33 + x36) + f_curr[11]; + f_next[12] = -T{0.666666666666667}*x92*(rho*x86 + T{36.0000000000000}*f_curr[12]) + f_curr[12]; + f_next[13] = -T{0.333333333333333}*x32*x92 + f_curr[13]; + f_next[14] = -T{0.333333333333333}*x71*x92 + f_curr[14]; + f_next[15] = -T{0.333333333333333}*x92*(x56 + x59) + f_curr[15]; + f_next[16] = -T{0.666666666666667}*x92*(rho*x90 + T{36.0000000000000}*f_curr[16]) + f_curr[16]; + f_next[17] = -T{0.333333333333333}*x55*x92 + f_curr[17]; + f_next[18] = -T{0.333333333333333}*x92*(x72 + x75) + f_curr[18]; +} + +}; diff --git a/src/LLBM/wall.h b/src/LLBM/wall.h new file mode 100644 index 0000000..26534e4 --- /dev/null +++ b/src/LLBM/wall.h @@ -0,0 +1,4 @@ +#pragma once + +template +struct WallNormal { }; diff --git a/src/concepts.h b/src/concepts.h new file mode 100644 index 0000000..61f7cbf --- /dev/null +++ b/src/concepts.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +namespace concepts { + +template +concept Arithmetic = requires(V a, V b) { + a + b; + a += b; + + a - b; + a -= b; + + a * b; + a *= b; + + a / b; + a /= b; +}; + +} diff --git a/src/cuboid.h b/src/cuboid.h new file mode 100644 index 0000000..c4ff803 --- /dev/null +++ b/src/cuboid.h @@ -0,0 +1,37 @@ +#pragma once + +#include + +class Cuboid { +private: + const std::array _size; + const std::size_t _volume; + +public: + Cuboid(std::size_t nX, std::size_t nY, std::size_t nZ): + _size{nX, nY, nZ}, + 
_volume(nX*nY*nZ) { } + + std::size_t volume() const { + return _volume; + } + + std::size_t index(unsigned iX, unsigned iY, unsigned iZ) const { + return iX*_size[1]*_size[2] + iY*_size[2] + iZ; + } + + int operator[](unsigned i) const { + return _size[i]; + } + + template + void traverse(F f) { + for (int iX = 0; iX < _size[0]; ++iX) { + for (int iY = 0; iY < _size[1]; ++iY) { + for (int iZ = 0; iZ < _size[2]; ++iZ) { + f(iX, iY, iZ, index(iX,iY,iZ)); + } + } + } + } +}; diff --git a/src/export.h b/src/export.h new file mode 100644 index 0000000..a3d38ec --- /dev/null +++ b/src/export.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include + +namespace export_format { + +template +void vtk(Cuboid c, std::span rho, std::span u, std::string path) { + std::ofstream fout; + fout.open(path.c_str()); + + fout << "# vtk DataFile Version 3.0\n"; + fout << "lbm_output\n"; + fout << "ASCII\n"; + fout << "DATASET RECTILINEAR_GRID\n"; + fout << "DIMENSIONS " << c[0] << " " << c[1] << " " << c[2] << "\n"; + + fout << "X_COORDINATES " << c[0] << " float\n"; + for (std::size_t iX=0; iX < c[0]; ++iX) { + fout << iX << " "; + } + + fout << "\nY_COORDINATES " << c[1] << " float\n"; + for (std::size_t iY=0; iY < c[1]; ++iY) { + fout << iY << " "; + } + + fout << "\nZ_COORDINATES " << c[2] << " float\n"; + for (std::size_t iZ=0; iZ < c[2]; ++iZ) { + fout << iZ << " "; + } + fout << "\nPOINT_DATA " << c.volume() << "\n"; + + fout << "VECTORS velocity float\n"; + for (std::size_t iZ=0; iZ < c[2]; ++iZ) { + for (std::size_t iY=0; iY < c[1]; ++iY) { + for (std::size_t iX=0; iX < c[0]; ++iX) { + const std::size_t iCell = c.index(iX,iY,iZ); + fout << u[iCell] << " " << u[c.volume() + iCell] << " " << u[2*c.volume() + iCell] << "\n"; + } + } + } + + fout << "SCALARS density float 1\n"; + fout << "LOOKUP_TABLE default\n"; + for (std::size_t iZ=0; iZ < c[2]; ++iZ) { + for (std::size_t iY=0; iY < c[1]; ++iY) { + for (std::size_t iX=0; iX < c[0]; ++iX) { + const std::size_t iCell = c.index(iX,iY,iZ); + fout << rho[iCell] << "\n"; + } + } + } + + fout.close(); +} + +} diff --git a/src/lattice.h b/src/lattice.h new file mode 100644 index 0000000..bbebab4 --- /dev/null +++ b/src/lattice.h @@ -0,0 +1,267 @@ +#pragma once + +#include +#include + +#ifdef ENABLE_AVX512 +#include "simd/512.h" +#else +#include "simd/256.h" +#endif + +#include + +#include "cuboid.h" +#include "population.h" +#include "propagation.h" +#include "operator.h" +#include "export.h" + +#include "LLBM/initialize.h" +#include "LLBM/collect_moments.h" + +template +class LatticeMask { +private: + const std::size_t _size; + + std::unique_ptr::storage_t[]> _simd_mask; + std::unique_ptr _base_mask; + +public: + LatticeMask(std::size_t nCells): + _size(((nCells + simd::Pack::size - 1) / simd::Mask::storage_size + 1) * simd::Mask::storage_size), + _simd_mask(new typename simd::Mask::storage_t[_size / simd::Mask::storage_size] { }), + _base_mask(new bool[_size] { }) + { } + + void set(std::size_t iCell, bool state) { + _base_mask[iCell] = state; + } + + bool get(std::size_t iCell) const { + return _base_mask[iCell]; + } + + void serialize() { + for (std::size_t i=0; i < _size / simd::Mask::storage_size; ++i) { + _simd_mask[i] = simd::Mask::encode(_base_mask.get() + i*simd::Mask::storage_size); + } + } + + typename simd::Mask::storage_t* data() { + return _simd_mask.get(); + } + +}; + +template +class Lattice { +private: + using value_t = typename Buffer::value_t; + using pack_t = simd::Pack; + using mask_t = simd::Mask; + + Cuboid _cuboid; + Buffer _population; 
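+  // _population is the propagation-pattern buffer (pattern::NONE,
+  // pattern::SSS or pattern::PS) providing the pre- and post-collision views.
+
+  // Helper for the fused apply(COPs&&...) overload below: evaluates one
+  // masked cell operator and mask-stores its result into the post-collision
+  // view, leaving cells outside the mask untouched.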
+ + template + void apply(COP cop, + std::size_t iCell, + pack_t f_curr[population::q], + pack_t f_next[population::q]) { + if (mask_t m = {cop.mask.data(), iCell}) { + cop(f_curr, f_next); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + simd::maskstore(get(iPop, stage::post_collision()) + iCell, m, f_next[iPop]); + } + } + } + +public: + Lattice(Cuboid cuboid): + _cuboid(cuboid), + _population(cuboid) + { + apply(); + } + + template + value_t* get(unsigned iPop, STAGE stage) { + return _population.get(iPop, stage); + } + + std::size_t volume() const { + return _cuboid.volume(); + } + + void stream() { + _population.stream(); + } + + template + requires concepts::Operator + void apply(std::size_t iCell, ARGS&&... args) { + value_t f_curr[population::q]; + value_t f_next[population::q]; + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = get(iPop, stage::pre_collision())[iCell]; + } + + F::apply(f_curr, f_next, std::forward(args)...); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + get(iPop, stage::post_collision())[iCell] = f_next[iPop]; + } + } + + template + requires concepts::Operator + void apply() { + pack_t f_curr[population::q]; + pack_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t i=0; i < volume() / pack_t::size; ++i) { + const std::size_t iCell = pack_t::size * i; + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell); + } + + F::apply(f_curr, f_next); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + simd::store(get(iPop, stage::post_collision()) + iCell, f_next[iPop]); + } + } + + for (std::size_t iCell = pack_t::size * (volume() / pack_t::size); + iCell < volume(); + ++iCell) { + apply(iCell); + } + } + + template + requires concepts::Operator + void apply(LatticeMask& mask, ARGS&&... args) { + pack_t f_curr[population::q]; + pack_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t iCell=0; iCell < volume(); iCell += pack_t::size) { + mask_t m(mask.data(), iCell); + if (m) { + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell); + } + + F::apply(f_curr, f_next, std::forward(args)...); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + simd::maskstore(get(iPop, stage::post_collision()) + iCell, m, f_next[iPop]); + } + } + } + } + + template + void apply(COPs&&... cops) { + pack_t f_curr[population::q]; + pack_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t iCell=0; iCell < volume(); iCell += pack_t::size) { + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell); + } + + (apply(cops, iCell, f_curr, f_next), ...); + } + } + + template + void apply(std::vector& cells, ARGS&&... 
args) { + #ifdef SIMD_CELL_LIST + pack_t f_curr[population::q]; + pack_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t i=0; i < cells.size() / pack_t::size; ++i) { + const std::size_t iIdx = pack_t::size * i; + pack_t::index_t* locs = cells.data() + iIdx; + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = pack_t(get(iPop, stage::pre_collision()), locs); + } + + F::apply(f_curr, f_next, std::forward(args)...); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + simd::store(get(iPop, stage::post_collision()), f_next[iPop], locs); + } + } + + for (std::size_t i = pack_t::size * (cells.size() / pack_t::size); + i < cells.size(); + ++i) { + apply(cells[i], std::forward(args)...); + } + #else + value_t f_curr[population::q]; + value_t f_next[population::q]; + #pragma omp parallel for private(f_curr, f_next) schedule(static) + for (std::size_t i=0; i < cells.size(); ++i) { + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_curr[iPop] = get(iPop, stage::pre_collision())[cells[i]]; + } + + F::apply(f_curr, f_next, std::forward(args)...); + + #pragma GCC unroll population::q + for (unsigned iPop=0; iPop < population::q; ++iPop) { + get(iPop, stage::post_collision())[cells[i]] = f_next[iPop]; + } + } + #endif + } + + template + requires concepts::Functor + void observe(LatticeMask& mask, std::span buffer) { + pack_t f[population::q]; + pack_t output[F::size]; + #pragma omp parallel for private(f,output) schedule(static) + for (std::size_t iCell=0; iCell < volume(); iCell += pack_t::size) { + mask_t m(mask.data(), iCell); + if (m) { + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f[iPop] = pack_t(get(iPop, stage::pre_collision()) + iCell); + } + + CollectMomentsF::observe(f, output); + + for (unsigned i=0; i < F::size; ++i) { + simd::maskstore(buffer.data() + i*volume() + iCell, m, output[i]); + } + } + } + } + + void write_momenta(LatticeMask& mask, std::string path) { + std::vector buffer(CollectMomentsF::size*volume()); + observe(mask, buffer); + export_format::vtk(_cuboid, + { buffer.data(), 1*volume() }, + { buffer.data() + volume(), 3*volume() }, + path); + } +}; diff --git a/src/operator.h b/src/operator.h new file mode 100644 index 0000000..b4830eb --- /dev/null +++ b/src/operator.h @@ -0,0 +1,41 @@ +#pragma once + +#include + +#include "concepts.h" + +namespace concepts { + +template +concept Operator = Arithmetic + && requires(F op, T* f_curr, T* f_next, ARGS... args) { + F::apply(f_curr, f_next, args...); +}; + +template +concept Functor = Arithmetic + && requires(F f, T* f_curr, T output[F::size]) { + F::observe(f_curr, output); +}; + +} + +template +struct Operator { + M& mask; + std::tuple args; + + Operator(F, M& m, ARGS&&... args): + mask(m), + args(std::forward(args)...) { } + + template + requires concepts::Operator + void operator()(U f_curr[], U f_next[]) { + std::apply([](auto... args) { F::apply(args...); }, + std::tuple_cat(std::make_tuple(f_curr, f_next), args)); + } +}; + +template +Operator(F, M&, ARGS&&... 
args) -> Operator; diff --git a/src/pattern/all.h b/src/pattern/all.h new file mode 100644 index 0000000..6496e02 --- /dev/null +++ b/src/pattern/all.h @@ -0,0 +1,5 @@ +#pragma once + +#include "pattern/ps.h" +#include "pattern/sss.h" +#include "pattern/none.h" diff --git a/src/pattern/none.h b/src/pattern/none.h new file mode 100644 index 0000000..8abc419 --- /dev/null +++ b/src/pattern/none.h @@ -0,0 +1,48 @@ +#pragma once + +#include "population.h" +#include "propagation.h" + +#include + +namespace pattern { + +template +class NONE { +private: + Cuboid _cuboid; + + T* _base; + T* _f[population::q]; + +public: + using value_t = T; + + NONE(Cuboid cuboid): + _cuboid(cuboid) + { + const std::size_t pagesize = sysconf(_SC_PAGESIZE); + const std::size_t padding = _cuboid.index(1,1,1); + _base = static_cast( + std::aligned_alloc(pagesize, population::q * (_cuboid.volume() + 2*padding) * sizeof(T))); + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _f[iPop] = _base + iPop * (_cuboid.volume() + 2*padding) + padding; + } + } + + ~NONE() { + delete [] _base; + } + + T* get(unsigned iPop, stage::pre_collision) { + return _f[iPop]; + } + + T* get(unsigned iPop, stage::post_collision) { + return _f[iPop]; + } + + void stream() { } +}; + +} diff --git a/src/pattern/ps.h b/src/pattern/ps.h new file mode 100644 index 0000000..667b09f --- /dev/null +++ b/src/pattern/ps.h @@ -0,0 +1,123 @@ +#pragma once + +#include "population.h" +#include "propagation.h" + +#include +#include +#include + +const int PROT_RW = PROT_READ | PROT_WRITE; + +constexpr std::uint32_t log2(std::uint32_t x) { + return 31 - __builtin_clz(x); +} + +namespace pattern { + +template +class PS { +private: + Cuboid _cuboid; + std::ptrdiff_t _volume; + + std::string _shm_path; + int _shm_name; + int _shm_file; + + std::uint8_t* _base_buffer; + std::uint8_t* _buffer[population::q]; + + T* _base[population::q]; + T* _f[population::q]; + +public: + using value_t = T; + + PS(Cuboid cuboid): + _cuboid(cuboid) + { + const std::size_t page_size = sysconf(_SC_PAGESIZE); + const std::size_t line_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + const std::size_t size = ((_cuboid.volume() * sizeof(T) - 1) / page_size + 1) * page_size; + _volume = size / sizeof(T); + + if (size % page_size != 0) { + throw std::invalid_argument("Array size must be multiple of PAGE_SIZE"); + } + + _shm_path = "/lbm_XXXXXX"; + _shm_name = mkstemp(const_cast(_shm_path.data())); + if (_shm_name != -1) { + throw std::runtime_error("Could not generate unique shared memory object name"); + } + // Open shared memory object as physical lattice memory + _shm_file = shm_open(_shm_path.c_str(), O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (_shm_file == -1) { + throw std::runtime_error("Failed to create shared memory object"); + } + // Resize to fit lattice populations + if (ftruncate(_shm_file, population::q * size) == -1) { + throw std::runtime_error("Failed to resize shared memory object"); + } + + // Allocate virtual address space for q times two consecutive lattices + _base_buffer = static_cast( + mmap(NULL, population::q * 2 * size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _buffer[iPop] = _base_buffer + iPop * 2 * size; + // Map single physical lattice into virtual address space + mmap(_buffer[iPop] , size, PROT_RW, MAP_SHARED | MAP_FIXED, _shm_file, iPop * size); + mmap(_buffer[iPop] + size, size, PROT_RW, MAP_SHARED | MAP_FIXED, _shm_file, iPop * size); + + // Store base pointer for reference 
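+      // (both mmap calls above map the same physical lattice at two
+      // consecutive virtual addresses, so a pre-shifted _f pointer can run
+      // past the end of the first copy and still hit the same physical pages)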
+ _base[iPop] = reinterpret_cast(_buffer[iPop]); + // Initialize shiftable f pointer to be used for lattice access + _f[iPop] = _base[iPop]; + } + + // Pre-shift to reduce cache and TLB conflict misses + // due to alignment of start address and page-boundaries every `page_size/sizeof(T)`-steps + for (unsigned iPop=0; iPop < population::q; ++iPop) { + std::ptrdiff_t shift = iPop * (1 << log2(line_size)) + + iPop * (1 << log2(page_size)); + shift /= sizeof(T); + if (shift < _volume-1) { + _f[iPop] += shift; + } + } + } + + ~PS() { + munmap(_base_buffer, population::q * 2 * _volume * sizeof(T)); + shm_unlink(_shm_path.c_str()); + close(_shm_name); + unlink(_shm_path.c_str()); + } + + T* get(unsigned iPop, stage::pre_collision) { + return _f[iPop]; + } + + T* get(unsigned iPop, stage::post_collision) { + return _f[iPop]; + } + + void stream() { + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _f[iPop] -= population::offset(_cuboid, iPop); + } + + for (unsigned iPop=0; iPop < population::q; ++iPop) { + if (_f[iPop] - _base[iPop] >= _volume) { + _f[iPop] -= _volume; + } else if (_f[iPop] < _base[iPop]) { + _f[iPop] += _volume; + } + } + } + +}; + +} diff --git a/src/pattern/sss.h b/src/pattern/sss.h new file mode 100644 index 0000000..1f1985d --- /dev/null +++ b/src/pattern/sss.h @@ -0,0 +1,63 @@ +#pragma once + +#include "population.h" +#include "propagation.h" + +#include "cuboid.h" +#include "concepts.h" +#include "population.h" + +#include + +namespace pattern { + +template +class SSS { +private: + Cuboid _cuboid; + + T* _base; + T* _buffer[population::q]; + T* _f[population::q]; + +public: + using value_t = T; + + SSS(Cuboid cuboid): + _cuboid(cuboid) + { + const std::size_t pagesize = sysconf(_SC_PAGESIZE); + const std::ptrdiff_t padding = population::max_offset(_cuboid); + _base = static_cast( + std::aligned_alloc(pagesize, population::q * (_cuboid.volume() + 2*padding) * sizeof(T))); + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _buffer[iPop] = _base + iPop * (_cuboid.volume() + 2*padding); + _f[iPop] = _buffer[iPop] + padding; + } + } + + ~SSS() { + delete [] _base; + } + + T* get(unsigned iPop, stage::pre_collision) { + return _f[iPop]; + } + + T* get(unsigned iPop, stage::post_collision) { + return _f[population::opposite(iPop)]; + } + + void stream() { + T* f_old[population::q]; + for (unsigned iPop=0; iPop < population::q; ++iPop) { + f_old[iPop] = _f[iPop]; + } + for (unsigned iPop=0; iPop < population::q; ++iPop) { + _f[iPop] = f_old[population::opposite(iPop)] - population::offset(_cuboid, iPop); + } + } + +}; + +} diff --git a/src/population.h b/src/population.h new file mode 100644 index 0000000..14c9b77 --- /dev/null +++ b/src/population.h @@ -0,0 +1,41 @@ +#pragma once + +namespace population { + +constexpr unsigned d = 3; +constexpr unsigned q = 19; + +constexpr int velocity[q][d] = { + { 0, 1, 1}, {-1, 0, 1}, { 0, 0, 1}, { 1, 0, 1}, { 0,-1, 1}, + {-1, 1, 0}, { 0, 1, 0}, { 1, 1, 0}, {-1, 0, 0}, { 0, 0, 0}, { 1, 0, 0}, {-1,-1, 0}, { 0,-1, 0}, { 1,-1, 0}, + { 0, 1,-1}, {-1, 0,-1}, { 0, 0,-1}, { 1, 0,-1}, { 0,-1,-1} +}; + +unsigned opposite(unsigned iPop) { + return q - 1 - iPop; +} + +std::ptrdiff_t offset(const Cuboid& c, int iX, int iY, int iZ) { + return iX*c[1]*c[2] + iY*c[2] + iZ; +} + +std::ptrdiff_t offset(const Cuboid& c, unsigned iPop) { + return offset(c, velocity[iPop][0], velocity[iPop][1], velocity[iPop][2]); +} + +std::ptrdiff_t max_offset(const Cuboid& c) { + std::ptrdiff_t padding = 0; + for (unsigned iPop=0; iPop < q; ++iPop) { + padding = 
std::max(padding, std::abs(offset(c, iPop))); + } + return padding; +} + +} + +namespace stage { + +struct pre_collision { }; +struct post_collision { }; + +} diff --git a/src/propagation.h b/src/propagation.h new file mode 100644 index 0000000..0bcb435 --- /dev/null +++ b/src/propagation.h @@ -0,0 +1,28 @@ +#pragma once + +#include "cuboid.h" +#include "concepts.h" + +namespace concepts { + +template +concept CuboidConstructible = std::constructible_from; + +template +concept PopulationAccess = Arithmetic + && requires(BUFFER& b) { + { b.get(unsigned{}, stage::pre_collision()) } -> std::same_as; + { b.get(unsigned{}, stage::post_collision()) } -> std::same_as; +}; + +template +concept Propagatable = requires(BUFFER& b) { + b.stream(); +}; + +template +concept PropagatablePopulationBuffer = CuboidConstructible + && PopulationAccess + && Propagatable; + +} diff --git a/src/simd/256.h b/src/simd/256.h new file mode 100644 index 0000000..a3f419d --- /dev/null +++ b/src/simd/256.h @@ -0,0 +1,379 @@ +#pragma once + +#include + +#include +#include + +namespace simd { + +template +class Mask; + +template <> +class Mask { +private: + __m256i _reg; + +public: + using storage_t = std::uint64_t; + static constexpr unsigned storage_size = 1; + + static constexpr storage_t true_v = 1l << 63; + static constexpr storage_t false_v = 0l; + + static storage_t encode(bool value) { + return value ? true_v : false_v; + } + + static storage_t encode(bool* value) { + return encode(*value); + } + + Mask(bool a, bool b, bool c, bool d): + _reg(_mm256_set_epi64x(encode(d),encode(c),encode(b),encode(a))) { } + + Mask(std::uint64_t a, std::uint64_t b, std::uint64_t c, std::uint64_t d): + _reg(_mm256_set_epi64x(d,c,b,a)) { } + + Mask(std::uint64_t* ptr): + _reg(_mm256_loadu_si256(reinterpret_cast<__m256i*>(ptr))) { } + + Mask(storage_t* ptr, std::size_t iCell): + Mask(ptr + iCell) { } + + Mask(__m256i reg): + _reg(reg) { } + + operator __m256i() { + return _reg; + } + + __m256i neg() const { + return _mm256_sub_epi64(_mm256_set1_epi64x(true_v), _reg); + } + + operator bool() const { + const std::uint64_t* values = reinterpret_cast(&_reg); + return values[0] == true_v + || values[1] == true_v + || values[2] == true_v + || values[3] == true_v; + } +}; + +template <> +class Mask { +private: + __m256i _reg; + +public: + using storage_t = std::uint32_t; + static constexpr unsigned storage_size = 1; + + static constexpr storage_t true_v = 1 << 31; + static constexpr storage_t false_v = 0; + + static storage_t encode(bool value) { + return value ? 
true_v : false_v; + } + + static storage_t encode(bool* value) { + return encode(*value); + } + + Mask(storage_t* ptr): + _reg(_mm256_loadu_si256(reinterpret_cast<__m256i*>(ptr))) { } + + Mask(storage_t* ptr, std::size_t iCell): + Mask(ptr + iCell) { } + + Mask(__m256i reg): + _reg(reg) { } + + operator __m256i() { + return _reg; + } + + __m256i neg() const { + return _mm256_sub_epi32(_mm256_set1_epi32(true_v), _reg); + } + + operator bool() const { + const std::uint32_t* values = reinterpret_cast(&_reg); + return values[0] == true_v + || values[1] == true_v + || values[2] == true_v + || values[3] == true_v + || values[4] == true_v + || values[5] == true_v + || values[6] == true_v + || values[7] == true_v; + } +}; + + +template +class Pack; + +template <> +class Pack { +private: + __m256d _reg; + +public: + using mask_t = Mask; + using index_t = std::uint32_t; + + static constexpr std::size_t size = 4; + + Pack() = default; + + Pack(__m256d reg): + _reg(reg) { } + + Pack(double val): + Pack(_mm256_set1_pd(val)) { } + + Pack(double a, double b, double c, double d): + Pack(_mm256_set_pd(d,c,b,a)) { } + + Pack(double* ptr): + Pack(_mm256_loadu_pd(ptr)) { + _mm_prefetch(ptr + size, _MM_HINT_T2); + } + + Pack(double* ptr, index_t* idx): + Pack(_mm256_i32gather_pd(ptr, _mm_loadu_si128(reinterpret_cast<__m128i*>(idx)), sizeof(double))) { } + + operator __m256d() { + return _reg; + } + + Pack operator+(Pack rhs) { + return Pack(_mm256_add_pd(_reg, rhs)); + } + + Pack& operator+=(Pack rhs) { + _reg = _mm256_add_pd(_reg, rhs); + return *this; + } + + Pack operator-(Pack rhs) { + return Pack(_mm256_sub_pd(_reg, rhs)); + } + + Pack& operator-=(Pack rhs) { + _reg = _mm256_sub_pd(_reg, rhs); + return *this; + } + + Pack operator*(Pack rhs) { + return Pack(_mm256_mul_pd(_reg, rhs)); + } + + Pack& operator*=(Pack rhs) { + _reg = _mm256_mul_pd(_reg, rhs); + return *this; + } + + Pack operator/(Pack rhs) { + return Pack(_mm256_div_pd(_reg, rhs)); + } + + Pack& operator/=(Pack rhs) { + _reg = _mm256_div_pd(_reg, rhs); + return *this; + } + + Pack operator-() { + return *this * Pack(-1.); + } + + __m256d sqrt() { + return _mm256_sqrt_pd(_reg); + } +}; + +template <> +class Pack { +private: + __m256 _reg; + +public: + using mask_t = Mask; + using index_t = std::uint32_t; + + static constexpr std::size_t size = 8; + + Pack() = default; + + Pack(__m256 reg): + _reg(reg) { } + + Pack(float val): + Pack(_mm256_set1_ps(val)) { } + + Pack(float a, float b, float c, float d, float e, float f, float g, float h): + Pack(_mm256_set_ps(h,g,f,e,d,c,b,a)) { } + + Pack(float* ptr): + Pack(_mm256_loadu_ps(ptr)) { + _mm_prefetch(ptr + size, _MM_HINT_T2); + } + + Pack(float* ptr, index_t* idx): + Pack(_mm256_i32gather_ps(ptr, _mm256_loadu_si256(reinterpret_cast<__m256i*>(idx)), sizeof(float))) { } + + operator __m256() { + return _reg; + } + + Pack operator+(Pack rhs) { + return Pack(_mm256_add_ps(_reg, rhs)); + } + + Pack& operator+=(Pack rhs) { + _reg = _mm256_add_ps(_reg, rhs); + return *this; + } + + Pack operator-(Pack rhs) { + return Pack(_mm256_sub_ps(_reg, rhs)); + } + + Pack& operator-=(Pack rhs) { + _reg = _mm256_sub_ps(_reg, rhs); + return *this; + } + + Pack operator*(Pack rhs) { + return Pack(_mm256_mul_ps(_reg, rhs)); + } + + Pack& operator*=(Pack rhs) { + _reg = _mm256_mul_ps(_reg, rhs); + return *this; + } + + Pack operator/(Pack rhs) { + return Pack(_mm256_div_ps(_reg, rhs)); + } + + Pack& operator/=(Pack rhs) { + _reg = _mm256_div_ps(_reg, rhs); + return *this; + } + + Pack operator-() { + return *this * 
Pack(-1.); + } + + __m256 sqrt() { + return _mm256_sqrt_ps(_reg); + } +}; + + +template +Pack operator+(T lhs, Pack rhs) { + return Pack(lhs) + rhs; +} + +template +Pack operator+(Pack lhs, T rhs) { + return lhs + Pack(rhs); +} + +template +Pack operator-(T lhs, Pack rhs) { + return Pack(lhs) - rhs; +} + +template +Pack operator-(Pack lhs, T rhs) { + return lhs - Pack(rhs); +} + +template +Pack operator*(Pack lhs, T rhs) { + return lhs * Pack(rhs); +} + +template +Pack operator*(T lhs, Pack rhs) { + return Pack(lhs) * rhs; +} + +template +Pack operator/(Pack lhs, T rhs) { + return lhs / Pack(rhs); +} + +template +Pack operator/(T lhs, Pack rhs) { + return Pack(lhs) / rhs; +} + +template +Pack sqrt(Pack x) { + return x.sqrt(); +} + +template +void maskstore(T* target, Mask mask, Pack value); + +template <> +void maskstore(double* target, Mask mask, Pack value) { + _mm256_maskstore_pd(target, mask, value); +} + +template <> +void maskstore(float* target, Mask mask, Pack value) { + _mm256_maskstore_ps(target, mask, value); +} + + +template +void store(T* target, Pack value); + +template <> +void store(double* target, Pack value) { + _mm256_storeu_pd(target, value); +} + +template <> +void store(float* target, Pack value) { + _mm256_storeu_ps(target, value); +} + +template +void store(T* target, Pack value, typename Pack::index_t* indices); + +template <> +void store(double* target, Pack value, Pack::index_t* indices) { +#ifdef __AVX512F__ + _mm256_i32scatter_pd(target, _mm_loadu_si128(reinterpret_cast<__m128i*>(indices)), value, sizeof(double)); +#else + __m256d reg = value; + #pragma GCC unroll 4 + for (unsigned i=0; i < simd::Pack::size; ++i) { + target[indices[i]] = reg[i]; + } +#endif +} + +template <> +void store(float* target, Pack value, Pack::index_t* indices) { +#ifdef __AVX512F__ + _mm256_i32scatter_ps(target, _mm256_loadu_si256(reinterpret_cast<__m256i*>(indices)), value, sizeof(float)); +#else + __m256 reg = value; + #pragma GCC unroll 8 + for (unsigned i=0; i < simd::Pack::size; ++i) { + target[indices[i]] = reg[i]; + } +#endif +} + +} diff --git a/src/simd/512.h b/src/simd/512.h new file mode 100644 index 0000000..2cc0a44 --- /dev/null +++ b/src/simd/512.h @@ -0,0 +1,339 @@ +#pragma once + +#include + +#include +#include + +namespace simd { + + +template +class Mask; + +template <> +class Mask { +private: + __mmask8 _reg; + +public: + using storage_t = std::uint8_t; + static constexpr unsigned storage_size = 8; + + static storage_t encode(bool* value) { + storage_t mask = value[0]; + for (unsigned j=1; j < storage_size; ++j) { + mask |= value[j] << j; + } + return mask; + } + + Mask(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7): + _reg(std::uint16_t(b0 | b1<<1 | b2<<2 | b3<<3 | b4<<4 | b5<<5 | b6<<6 | b7<<7)) { } + + Mask(std::uint8_t* ptr): + _reg(_load_mask16(reinterpret_cast(ptr))) { } + + Mask(storage_t* ptr, std::size_t iCell): + Mask(ptr + iCell / storage_size) { } + + Mask(__mmask8 reg): + _reg(reg) { } + + operator __mmask8() { + return _reg; + } + + __mmask8 neg() const { + return _knot_mask8(_reg); + } + + operator bool() const { + const std::uint8_t* value = reinterpret_cast(&_reg); + return value[0] != 0; + } +}; + +template <> +class Mask { +private: + __mmask16 _reg; + +public: + using storage_t = std::uint16_t; + static constexpr unsigned storage_size = 16; + + static storage_t encode(bool* value) { + storage_t mask = value[0]; + for (unsigned j=1; j < storage_size; ++j) { + mask |= value[j] << j; + } + return mask; + } + + 
Mask(std::uint16_t* ptr): + _reg(_load_mask16(ptr)) { } + + Mask(storage_t* ptr, std::size_t iCell): + Mask(ptr + iCell / storage_size) { } + + Mask(__mmask16 reg): + _reg(reg) { } + + operator __mmask16() { + return _reg; + } + + __mmask16 neg() const { + return _knot_mask16(_reg); + } + + operator bool() const { + const std::uint16_t* value = reinterpret_cast(&_reg); + return value[0] != 0; + } +}; + + +template +class Pack; + +template <> +class Pack { +private: + __m512d _reg; + +public: + using mask_t = Mask; + using index_t = std::uint32_t; + + static constexpr std::size_t size = 8; + + Pack() = default; + + Pack(__m512d reg): + _reg(reg) { } + + Pack(double val): + Pack(_mm512_set1_pd(val)) { } + + Pack(double a, double b, double c, double d, double e, double f, double g, double h): + Pack(_mm512_set_pd(h,g,f,e,d,c,b,a)) { } + + Pack(double* ptr): + Pack(_mm512_loadu_pd(ptr)) { } + + Pack(double* ptr, index_t* idx): + Pack(_mm512_i32gather_pd(_mm256_loadu_si256(reinterpret_cast<__m256i*>(idx)), ptr, sizeof(double))) { } + + operator __m512d() { + return _reg; + } + + Pack operator+(Pack rhs) { + return Pack(_mm512_add_pd(_reg, rhs)); + } + + Pack& operator+=(Pack rhs) { + _reg = _mm512_add_pd(_reg, rhs); + return *this; + } + + Pack operator-(Pack rhs) { + return Pack(_mm512_sub_pd(_reg, rhs)); + } + + Pack& operator-=(Pack rhs) { + _reg = _mm512_sub_pd(_reg, rhs); + return *this; + } + + Pack operator*(Pack rhs) { + return Pack(_mm512_mul_pd(_reg, rhs)); + } + + Pack& operator*=(Pack rhs) { + _reg = _mm512_mul_pd(_reg, rhs); + return *this; + } + + Pack operator/(Pack rhs) { + return Pack(_mm512_div_pd(_reg, rhs)); + } + + Pack& operator/=(Pack rhs) { + _reg = _mm512_div_pd(_reg, rhs); + return *this; + } + + Pack operator-() { + return *this * Pack(-1.); + } + + __m512d sqrt() { + return _mm512_sqrt_pd(_reg); + } +}; + +template <> +class Pack { +private: + __m512 _reg; + +public: + using mask_t = Mask; + using index_t = std::uint32_t; + + static constexpr std::size_t size = 16; + + Pack() = default; + + Pack(__m512 reg): + _reg(reg) { } + + Pack(float val): + Pack(_mm512_set1_ps(val)) { } + + Pack(float a, float b, float c, float d, float e, float f, float g, float h, float i, float j, float k, float l, float m, float n, float o, float p): + Pack(_mm512_set_ps(p,o,n,m,l,k,j,i,h,g,f,e,d,c,b,a)) { } + + Pack(float* ptr): + Pack(_mm512_loadu_ps(ptr)) { } + + Pack(float* ptr, index_t* idx): + Pack(_mm512_i32gather_ps(_mm512_loadu_si512(reinterpret_cast<__m512i*>(idx)), ptr, sizeof(float))) { } + + operator __m512() { + return _reg; + } + + Pack operator+(Pack rhs) { + return Pack(_mm512_add_ps(_reg, rhs)); + } + + Pack& operator+=(Pack rhs) { + _reg = _mm512_add_ps(_reg, rhs); + return *this; + } + + Pack operator-(Pack rhs) { + return Pack(_mm512_sub_ps(_reg, rhs)); + } + + Pack& operator-=(Pack rhs) { + _reg = _mm512_sub_ps(_reg, rhs); + return *this; + } + + Pack operator*(Pack rhs) { + return Pack(_mm512_mul_ps(_reg, rhs)); + } + + Pack& operator*=(Pack rhs) { + _reg = _mm512_mul_ps(_reg, rhs); + return *this; + } + + Pack operator/(Pack rhs) { + return Pack(_mm512_div_ps(_reg, rhs)); + } + + Pack& operator/=(Pack rhs) { + _reg = _mm512_div_ps(_reg, rhs); + return *this; + } + + Pack operator-() { + return *this * Pack(-1.); + } + + __m512 sqrt() { + return _mm512_sqrt_ps(_reg); + } +}; + + +template +Pack operator+(T lhs, Pack rhs) { + return Pack(lhs) + rhs; +} + +template +Pack operator+(Pack lhs, T rhs) { + return lhs + Pack(rhs); +} + +template +Pack operator-(T lhs, Pack 
rhs) { + return Pack(lhs) - rhs; +} + +template +Pack operator-(Pack lhs, T rhs) { + return lhs - Pack(rhs); +} + +template +Pack operator*(Pack lhs, T rhs) { + return lhs * Pack(rhs); +} + +template +Pack operator*(T lhs, Pack rhs) { + return Pack(lhs) * rhs; +} + +template +Pack operator/(Pack lhs, T rhs) { + return lhs / Pack(rhs); +} + +template +Pack operator/(T lhs, Pack rhs) { + return Pack(lhs) / rhs; +} + + +template +void maskstore(T* target, Mask mask, Pack value); + +template <> +void maskstore(double* target, Mask mask, Pack value) { + _mm512_mask_storeu_pd(target, mask, value); +} + +template <> +void maskstore(float* target, Mask mask, Pack value) { + _mm512_mask_storeu_ps(target, mask, value); +} + + +template +void store(T* target, Pack value); + +template <> +void store(double* target, Pack value) { + _mm512_storeu_pd(target, value); +} + +template <> +void store(float* target, Pack value) { + _mm512_storeu_ps(target, value); +} + + +template +void store(T* target, Pack value, typename Pack::index_t* indices); + +template <> +void store(double* target, Pack value, Pack::index_t* indices) { + _mm512_i32scatter_pd(target, _mm256_loadu_si256(reinterpret_cast<__m256i*>(indices)), value, sizeof(double)); +} + + +template <> +void store(float* target, Pack value, Pack::index_t* indices) { + _mm512_i32scatter_ps(target, _mm512_loadu_si512(reinterpret_cast<__m512i*>(indices)), value, sizeof(float)); +} + +} -- cgit v1.2.3
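Usage sketch (not part of the patch): a hypothetical driver wiring the extracted
pieces together. The pattern::SSS<double> buffer choice, the cavity-style
geometry, the relaxation time, the lid velocity and the output path are
illustrative assumptions, not values taken from SweepLB.

// Hypothetical lid-driven-cavity-style driver; all parameters are assumptions.
#include "lattice.h"
#include "pattern/all.h"
#include "LLBM/collide.h"
#include "LLBM/bounce_back.h"
#include "LLBM/bounce_back_moving_wall.h"

int main() {
  Cuboid cuboid(64, 64, 64);
  // Populations are set to the rest equilibrium by the Lattice constructor
  // via InitializeO.
  Lattice<pattern::SSS<double>> lattice(cuboid);

  LatticeMask<double> bulk(lattice.volume());  // fluid cells: BGK collision
  LatticeMask<double> wall(lattice.volume());  // resting walls: bounce-back
  LatticeMask<double> lid(lattice.volume());   // moving lid: bounce-back with wall velocity

  cuboid.traverse([&](int iX, int iY, int iZ, std::size_t iCell) {
    const bool top      = iZ == cuboid[2]-1;
    const bool boundary = iX == 0 || iY == 0 || iZ == 0
                       || iX == cuboid[0]-1 || iY == cuboid[1]-1 || top;
    lid.set(iCell, top);
    wall.set(iCell, boundary && !top);
    bulk.set(iCell, !boundary);
  });
  bulk.serialize();
  wall.serialize();
  lid.serialize();

  const double tau = 0.56;              // assumed relaxation time
  double u_lid[3] { 0.05, 0.0, 0.0 };   // assumed lid velocity

  for (std::size_t iStep = 0; iStep < 1000; ++iStep) {
    lattice.apply<BgkCollideO>(bulk, tau);
    lattice.apply<BounceBackO>(wall);
    lattice.apply<BounceBackMovingWallO>(lid, u_lid);
    lattice.stream();
  }

  // Write density and velocity moments of the fluid region as VTK.
  lattice.write_momenta(bulk, "cavity.vtk");
}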