diff options
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | lbm.org | 104 | ||||
-rw-r--r-- | tangle/LLBM/kernel/collect_q_criterion.h | 72 | ||||
-rw-r--r-- | tangle/LLBM/kernel/collect_shear_layer_normal.h | 147 | ||||
-rw-r--r-- | tangle/LLBM/kernel/smagorinsky_collide.h | 140 |
5 files changed, 242 insertions, 223 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index abe1087..ba28533 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream per-thread --extended-lambda") -set(CMAKE_CUDA_ARCHITECTURES 50) +set(CMAKE_CUDA_ARCHITECTURES 62) include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/tangle @@ -270,6 +270,7 @@ a custom =ReplaceOptim= structure during the CSE optimization step that conditio #+BEGIN_SRC python :session :results none from sympy.codegen.rewriting import ReplaceOptim +from sympy.simplify import cse_main expand_pos_square = ReplaceOptim( lambda e: e.is_Pow and e.exp.is_integer and e.exp == 2, @@ -3666,44 +3667,42 @@ T x1 = f_curr[3] + f_curr[6]; T x2 = x0 + x1 + f_curr[0] + f_curr[4] + f_curr[5] + f_curr[7] + f_curr[8]; T x3 = f_curr[0] - f_curr[8]; T x4 = T{1} / (x2); -T x10 = T{72.0000000000000}*f_curr[2]; -T x11 = T{72.0000000000000}*f_curr[6]; +T x9 = T{72.0000000000000}*f_curr[2]; +T x10 = T{72.0000000000000}*f_curr[6]; T rho = x2; -T x31 = T{4.00000000000000}*rho; +T x29 = T{4.00000000000000}*rho; T u_0 = -x4*(x0 + x3 - f_curr[6] - f_curr[7]); -T x6 = u_0*u_0; -T x13 = -T{3.00000000000000}*x6; -T x16 = T{6.00000000000000}*u_0; -T x17 = -x16; -T x30 = T{0.0277777777777778}*u_0; +T x5 = u_0*u_0; +T x12 = -T{3.00000000000000}*x5; +T x15 = T{6.00000000000000}*u_0; +T x16 = -x15; T u_1 = -x4*(x1 + x3 - f_curr[2] - f_curr[5]); -T x5 = T{0.0277777777777778}*u_1; -T x7 = u_1*u_1; -T x8 = x6 + x7; -T x9 = pow(x8, T{-0.500000000000000}); -T x12 = -u_0 + u_1; -T x14 = T{6.00000000000000}*u_1; -T x15 = x13 + x14; -T x18 = T{2.00000000000000} - T{3.00000000000000}*x7; -T x19 = x17 + x18; -T x20 = rho*(x15 + x19 + T{9.00000000000000}*(x12*x12)); -T x21 = u_0 - u_1; -T x22 = x13 - x14; -T x23 = x16 + x18; -T x24 = rho*(x22 + x23 + T{9.00000000000000}*(x21*x21)); -T x25 = u_0 + u_1; -T x26 = T{9.00000000000000}*(x25*x25); -T x27 = rho*(x15 + x23 + x26) + rho*(x19 + x22 + x26) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[8]; -T x28 = x10 + x11 - x20 - x24 + x27; -T x29 = x28*x9; -T x32 = x18 + T{6.00000000000000}*x6; -T x33 = -x10 - x11 + x20 + x24 + x27; -T x34 = x31*(x16 + x32) + x31*(x17 + x32) + x33 - T{72.0000000000000}*f_curr[1] - T{72.0000000000000}*f_curr[7]; -T x35 = T{6.00000000000000}*x7 + T{2.00000000000000}; -T x36 = x31*(x15 + x35) + x31*(x22 + x35) + x33 - T{72.0000000000000}*f_curr[3] - T{72.0000000000000}*f_curr[5]; -T x37 = ((x28*u_0 + x36*u_1)*u_1 + (x28*u_1 + x34*u_0)*u_0)/x8; -T n_0 = -x29*x5 - x30*x34*x9 + x30*x37; -T n_1 = -x29*x30 - x36*x5*x9 + x37*x5; +T x6 = u_1*u_1; +T x7 = x5 + x6; +T x8 = pow(x7, T{-0.500000000000000}); +T x11 = -u_0 + u_1; +T x13 = T{6.00000000000000}*u_1; +T x14 = x12 + x13; +T x17 = T{2.00000000000000} - T{3.00000000000000}*x6; +T x18 = x16 + x17; +T x19 = rho*(x14 + x18 + T{9.00000000000000}*(x11*x11)); +T x20 = u_0 - u_1; +T x21 = x12 - x13; +T x22 = x15 + x17; +T x23 = rho*(x21 + x22 + T{9.00000000000000}*(x20*x20)); +T x24 = u_0 + u_1; +T x25 = T{9.00000000000000}*(x24*x24); +T x26 = rho*(x14 + x22 + x25) + rho*(x18 + x21 + x25) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[8]; +T x27 = x10 - x19 - x23 + x26 + x9; +T x28 = x27*x8; +T x30 = x17 + T{6.00000000000000}*x5; +T x31 = -x10 + x19 + x23 + x26 - x9; +T x32 = x29*(x15 + x30) + x29*(x16 + x30) + x31 - T{72.0000000000000}*f_curr[1] - T{72.0000000000000}*f_curr[7]; +T x33 = T{6.00000000000000}*x6 + T{2.00000000000000}; +T x34 = x29*(x14 + x33) + x29*(x21 + x33) + x31 - T{72.0000000000000}*f_curr[3] - T{72.0000000000000}*f_curr[5]; +T x35 = ((x27*u_0 + x34*u_1)*u_1 + (x27*u_1 + x32*u_0)*u_0)/x7; +T n_0 = -T{0.0277777777777778}*x28*u_1 - T{0.0277777777777778}*x32*x8*u_0 + T{0.0277777777777778}*x35*u_0; +T n_1 = -T{0.0277777777777778}*x28*u_0 - T{0.0277777777777778}*x34*x8*u_1 + T{0.0277777777777778}*x35*u_1; #+end_example *** Determine shear layer visibility @@ -3877,11 +3876,14 @@ T x17 = u_0 + u_1; T x18 = T{9.00000000000000}*(x17*x17); T x19 = x11 + x9 + T{-2.00000000000000}; T x20 = rho*(x14 + x18 + x2) - rho*(-x18 + x19 + x2 + x7) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[8]; -T x21 = T{4.00000000000000}*rho; -T x22 = T{6.00000000000000}*x10 + x8; -T x23 = -x0 - x1 + x15 + x16 + x20; -T x24 = T{6.00000000000000}*x6; -T strain = T{0.0277777777777778}*sqrt((x0 + x1 - x15 - x16 + x20)*(x0 + x1 - x15 - x16 + x20) + T{0.500000000000000}*((-x21*(x19 - x24) + x21*(x13 + x24 + 2) + x23 - 72*f_curr[3] - 72*f_curr[5])*(-x21*(x19 - x24) + x21*(x13 + x24 + 2) + x23 - 72*f_curr[3] - 72*f_curr[5])) + T{0.500000000000000}*((x21*(x2 + x22) + x21*(x22 + x3) + x23 - 72*f_curr[1] - 72*f_curr[7])*(x21*(x2 + x22) + x21*(x22 + x3) + x23 - 72*f_curr[1] - 72*f_curr[7]))); +T x21 = x0 + x1 - x15 - x16 + x20; +T x22 = T{4.00000000000000}*rho; +T x23 = T{6.00000000000000}*x10 + x8; +T x24 = -x0 - x1 + x15 + x16 + x20; +T x25 = x22*(x2 + x23) + x22*(x23 + x3) + x24 - T{72.0000000000000}*f_curr[1] - T{72.0000000000000}*f_curr[7]; +T x26 = T{6.00000000000000}*x6; +T x27 = -x22*(x19 - x26) + x22*(x13 + x26 + T{2.00000000000000}) + x24 - T{72.0000000000000}*f_curr[3] - T{72.0000000000000}*f_curr[5]; +T strain = T{0.0277777777777778}*sqrt(x21*x21 + T{0.500000000000000}*(x25*x25) + T{0.500000000000000}*(x27*x27)); #+end_example #+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/collect_q_criterion.h @@ -5788,21 +5790,29 @@ nvidia-smi --query-gpu=name --format=csv,noheader #+END_SRC #+RESULTS: -: GeForce RTX 2070 +: GeForce RTX 3070 #+NAME: benchmark-ldc -#+BEGIN_SRC bash :dir build :eval query :var min=64 :var max=128 :var step=16 :var nSteps=1000 :async t +#+BEGIN_SRC bash :dir build :eval query :var min=64 :var max=256 :var step=16 :var nSteps=1000 :async t for n in $(seq $min $step $max); do ./benchmark-ldc $n $nSteps done #+END_SRC #+RESULTS: benchmark-ldc -| 4 | 64 | 1000 | 2299.85 | -| 4 | 80 | 1000 | 2321.52 | -| 4 | 96 | 1000 | 2456.87 | -| 4 | 112 | 1000 | 2456.11 | -| 4 | 128 | 1000 | 2462.46 | +| 4 | 64 | 1000 | 2416.56 | +| 4 | 80 | 1000 | 2471.92 | +| 4 | 96 | 1000 | 2534.33 | +| 4 | 112 | 1000 | 2512.18 | +| 4 | 128 | 1000 | 2569.58 | +| 4 | 144 | 1000 | 2541.29 | +| 4 | 160 | 1000 | 2599.92 | +| 4 | 176 | 1000 | 2499.82 | +| 4 | 192 | 1000 | 2513.63 | +| 4 | 208 | 1000 | 2492.54 | +| 4 | 224 | 1000 | 2533.04 | +| 4 | 240 | 1000 | 2561.12 | +| 4 | 256 | 1000 | 2511.97 | * Open tasks :properties: diff --git a/tangle/LLBM/kernel/collect_q_criterion.h b/tangle/LLBM/kernel/collect_q_criterion.h index 19b7f68..fa19dc7 100644 --- a/tangle/LLBM/kernel/collect_q_criterion.h +++ b/tangle/LLBM/kernel/collect_q_criterion.h @@ -56,39 +56,45 @@ __device__ static void apply( T x28 = -x13; T x29 = x2 + x28; T x30 = -rho*(x14 + x21 - x24) + rho*(x14 + x24 + x27 + x29) - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7]; - T x31 = T{72.0000000000000}*f_curr[1]; - T x32 = T{72.0000000000000}*f_curr[17]; - T x33 = x4 + u_2; - T x34 = T{6.00000000000000}*u_2; - T x35 = x11 - x34; - T x36 = rho*(x15 + x35 - T{9.00000000000000}*x33*x33); - T x37 = -u_2; - T x38 = x37 + u_0; - T x39 = x11 + x34; - T x40 = x13 + x39; - T x41 = rho*(x17 + x40 - T{9.00000000000000}*x38*x38); - T x42 = u_0 + u_2; - T x43 = T{9.00000000000000}*(x42*x42); - T x44 = x27 + x34; - T x45 = x14 + x28; - T x46 = -rho*(x15 + x39 - x43) + rho*(x43 + x44 + x45) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3]; - T x47 = T{72.0000000000000}*f_curr[4]; - T x48 = T{72.0000000000000}*f_curr[14]; - T x49 = x18 + u_2; - T x50 = rho*(x20 + x35 - T{9.00000000000000}*x49*x49); - T x51 = x37 + u_1; - T x52 = rho*(x3 + x40 - T{9.00000000000000}*x51*x51); - T x53 = u_1 + u_2; - T x54 = T{9.00000000000000}*(x53*x53); - T x55 = -rho*(x20 + x39 - x54) + rho*(x29 + x44 + x54) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18]; - T x56 = T{2.00000000000000}*rho; - T x57 = T{6.00000000000000}*x8; - T x58 = -x31 - x32 - x36 - x41 + x46; - T x59 = -x0 - x1 - x16 - x22 + x30; - T x60 = T{6.00000000000000}*x6; - T x61 = -x47 - x48 - x50 - x52 + x55; - T x62 = T{6.00000000000000}*x12; - T strain = T{0.0277777777777778}*sqrt((x0 + x1 + x16 + x22 + x30)*(x0 + x1 + x16 + x22 + x30) + (x31 + x32 + x36 + x41 + x46)*(x31 + x32 + x36 + x41 + x46) + (x47 + x48 + x50 + x52 + x55)*(x47 + x48 + x50 + x52 + x55) + T{0.500000000000000}*((-x56*(x39 - x62) + x56*(x44 + x62) + x58 + x61 - 72*f_curr[16] - 72*f_curr[2])*(-x56*(x39 - x62) + x56*(x44 + x62) + x58 + x61 - 72*f_curr[16] - 72*f_curr[2])) + T{0.500000000000000}*((-x56*(x10 + x20 - x60) + x56*(x26 + x29 + x60) + x59 + x61 - 72*f_curr[12] - 72*f_curr[6])*(-x56*(x10 + x20 - x60) + x56*(x26 + x29 + x60) + x59 + x61 - 72*f_curr[12] - 72*f_curr[6])) + T{0.500000000000000}*((-x56*(x15 - x57 + x7 - 2) + x56*(x25 + x45 + x57 + 2) + x58 + x59 - 72*f_curr[10] - 72*f_curr[8])*(-x56*(x15 - x57 + x7 - 2) + x56*(x25 + x45 + x57 + 2) + x58 + x59 - 72*f_curr[10] - 72*f_curr[8]))); + T x31 = x0 + x1 + x16 + x22 + x30; + T x32 = T{72.0000000000000}*f_curr[1]; + T x33 = T{72.0000000000000}*f_curr[17]; + T x34 = x4 + u_2; + T x35 = T{6.00000000000000}*u_2; + T x36 = x11 - x35; + T x37 = rho*(x15 + x36 - T{9.00000000000000}*x34*x34); + T x38 = -u_2; + T x39 = x38 + u_0; + T x40 = x11 + x35; + T x41 = x13 + x40; + T x42 = rho*(x17 + x41 - T{9.00000000000000}*x39*x39); + T x43 = u_0 + u_2; + T x44 = T{9.00000000000000}*(x43*x43); + T x45 = x27 + x35; + T x46 = x14 + x28; + T x47 = -rho*(x15 + x40 - x44) + rho*(x44 + x45 + x46) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3]; + T x48 = x32 + x33 + x37 + x42 + x47; + T x49 = T{72.0000000000000}*f_curr[4]; + T x50 = T{72.0000000000000}*f_curr[14]; + T x51 = x18 + u_2; + T x52 = rho*(x20 + x36 - T{9.00000000000000}*x51*x51); + T x53 = x38 + u_1; + T x54 = rho*(x3 + x41 - T{9.00000000000000}*x53*x53); + T x55 = u_1 + u_2; + T x56 = T{9.00000000000000}*(x55*x55); + T x57 = -rho*(x20 + x40 - x56) + rho*(x29 + x45 + x56) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18]; + T x58 = x49 + x50 + x52 + x54 + x57; + T x59 = T{2.00000000000000}*rho; + T x60 = T{6.00000000000000}*x8; + T x61 = -x32 - x33 - x37 - x42 + x47; + T x62 = -x0 - x1 - x16 - x22 + x30; + T x63 = -x59*(x15 - x60 + x7 + T{-2.00000000000000}) + x59*(x25 + x46 + x60 + T{2.00000000000000}) + x61 + x62 - T{72.0000000000000}*f_curr[10] - T{72.0000000000000}*f_curr[8]; + T x64 = T{6.00000000000000}*x6; + T x65 = -x49 - x50 - x52 - x54 + x57; + T x66 = -x59*(x10 + x20 - x64) + x59*(x26 + x29 + x64) + x62 + x65 - T{72.0000000000000}*f_curr[12] - T{72.0000000000000}*f_curr[6]; + T x67 = T{6.00000000000000}*x12; + T x68 = -x59*(x40 - x67) + x59*(x45 + x67) + x61 + x65 - T{72.0000000000000}*f_curr[16] - T{72.0000000000000}*f_curr[2]; + T strain = T{0.0277777777777778}*sqrt(x31*x31 + x48*x48 + x58*x58 + T{0.500000000000000}*(x63*x63) + T{0.500000000000000}*(x66*x66) + T{0.500000000000000}*(x68*x68)); float vorticity = cell_curl_norm[gid]; float q = vorticity*vorticity - strain*strain; diff --git a/tangle/LLBM/kernel/collect_shear_layer_normal.h b/tangle/LLBM/kernel/collect_shear_layer_normal.h index 7bf6eff..13e839a 100644 --- a/tangle/LLBM/kernel/collect_shear_layer_normal.h +++ b/tangle/LLBM/kernel/collect_shear_layer_normal.h @@ -22,85 +22,82 @@ __device__ static void apply( T x5 = -f_curr[15] + f_curr[3]; T x6 = T{1} / (x3); T x7 = f_curr[0] - f_curr[18]; - T x14 = T{72.0000000000000}*f_curr[5]; - T x15 = T{72.0000000000000}*f_curr[13]; - T x39 = T{72.0000000000000}*f_curr[1]; - T x40 = T{72.0000000000000}*f_curr[17]; - T x61 = T{72.0000000000000}*f_curr[4]; - T x62 = T{72.0000000000000}*f_curr[14]; + T x13 = T{72.0000000000000}*f_curr[5]; + T x14 = T{72.0000000000000}*f_curr[13]; + T x37 = T{72.0000000000000}*f_curr[1]; + T x38 = T{72.0000000000000}*f_curr[17]; + T x58 = T{72.0000000000000}*f_curr[4]; + T x59 = T{72.0000000000000}*f_curr[14]; T rho = x3; - T x56 = T{2.00000000000000}*rho; + T x53 = T{2.00000000000000}*rho; T u_0 = x6*(x0 + x4 + x5 - f_curr[1] - f_curr[5] - f_curr[8]); - T x9 = u_0*u_0; - T x16 = -u_0; - T x18 = -T{3.00000000000000}*x9; - T x21 = T{6.00000000000000}*u_0; - T x22 = -x21; - T x55 = T{0.0277777777777778}*u_0; + T x8 = u_0*u_0; + T x15 = -u_0; + T x17 = -T{3.00000000000000}*x8; + T x20 = T{6.00000000000000}*u_0; + T x21 = -x20; T u_1 = x6*(x1 + x4 + x7 - f_curr[12] - f_curr[13] - f_curr[4]); - T x8 = T{0.0277777777777778}*u_1; - T x10 = u_1*u_1; - T x17 = x16 + u_1; - T x19 = T{6.00000000000000}*u_1; - T x20 = x18 + x19; - T x23 = -T{3.00000000000000}*x10; - T x28 = -u_1; - T x29 = x28 + u_0; - T x30 = x18 - x19; - T x33 = u_0 + u_1; - T x34 = T{9.00000000000000}*(x33*x33); + T x9 = u_1*u_1; + T x16 = x15 + u_1; + T x18 = T{6.00000000000000}*u_1; + T x19 = x17 + x18; + T x22 = -T{3.00000000000000}*x9; + T x27 = -u_1; + T x28 = x27 + u_0; + T x29 = x17 - x18; + T x32 = u_0 + u_1; + T x33 = T{9.00000000000000}*(x32*x32); T u_2 = x6*(x2 + x5 + x7 - f_curr[14] - f_curr[16] - f_curr[17]); - T x11 = u_2*u_2; - T x12 = x10 + x11 + x9; - T x13 = pow(x12, T{-0.500000000000000}); - T x24 = T{2.00000000000000} - T{3.00000000000000}*x11; - T x25 = x23 + x24; - T x26 = x22 + x25; - T x27 = rho*(x20 + x26 + T{9.00000000000000}*(x17*x17)); - T x31 = x21 + x25; - T x32 = rho*(x30 + x31 + T{9.00000000000000}*(x29*x29)); - T x35 = rho*(x20 + x31 + x34) + rho*(x26 + x30 + x34) - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7]; - T x36 = x14 + x15 - x27 - x32 + x35; - T x37 = x13*x36; - T x38 = T{0.0277777777777778}*u_2; - T x41 = x16 + u_2; - T x42 = T{6.00000000000000}*u_2; - T x43 = x18 + x42; - T x44 = rho*(x26 + x43 + T{9.00000000000000}*(x41*x41)); - T x45 = -u_2; - T x46 = x45 + u_0; - T x47 = -x42; - T x48 = x18 + x47; - T x49 = rho*(x31 + x48 + T{9.00000000000000}*(x46*x46)); - T x50 = u_0 + u_2; - T x51 = T{9.00000000000000}*(x50*x50); - T x52 = rho*(x26 + x48 + x51) + rho*(x31 + x43 + x51) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3]; - T x53 = x39 + x40 - x44 - x49 + x52; - T x54 = x13*x53; - T x57 = x25 + T{6.00000000000000}*x9; - T x58 = -x14 - x15 + x27 + x32 + x35; - T x59 = -x39 - x40 + x44 + x49 + x52; - T x60 = x56*(x21 + x57) + x56*(x22 + x57) + x58 + x59 - T{72.0000000000000}*f_curr[10] - T{72.0000000000000}*f_curr[8]; - T x63 = x28 + u_2; - T x64 = x25 + x30; - T x65 = rho*(x42 + x64 + T{9.00000000000000}*(x63*x63)); - T x66 = x45 + u_1; - T x67 = x20 + x25; - T x68 = rho*(x47 + x67 + T{9.00000000000000}*(x66*x66)); - T x69 = u_1 + u_2; - T x70 = T{9.00000000000000}*(x69*x69); - T x71 = rho*(x42 + x67 + x70) + rho*(x47 + x64 + x70) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18]; - T x72 = x61 + x62 - x65 - x68 + x71; - T x73 = T{6.00000000000000}*x10 + x24; - T x74 = -x61 - x62 + x65 + x68 + x71; - T x75 = x56*(x20 + x73) + x56*(x30 + x73) + x58 + x74 - T{72.0000000000000}*f_curr[12] - T{72.0000000000000}*f_curr[6]; - T x76 = T{6.00000000000000}*x11 + x23 + T{2.00000000000000}; - T x77 = x56*(x43 + x76) + x56*(x48 + x76) + x59 + x74 - T{72.0000000000000}*f_curr[16] - T{72.0000000000000}*f_curr[2]; - T x78 = ((x36*u_0 + x72*u_2 + x75*u_1)*u_1 + (x36*u_1 + x53*u_2 + x60*u_0)*u_0 + (x53*u_0 + x72*u_1 + x77*u_2)*u_2)/x12; - T x79 = x13*x72; - T n_0 = -x13*x55*x60 - x37*x8 - x38*x54 + x55*x78; - T n_1 = -x13*x75*x8 - x37*x55 - x38*x79 + x78*x8; - T n_2 = -x13*x38*x77 + x38*x78 - x54*x55 - x79*x8; + T x10 = u_2*u_2; + T x11 = x10 + x8 + x9; + T x12 = pow(x11, T{-0.500000000000000}); + T x23 = T{2.00000000000000} - T{3.00000000000000}*x10; + T x24 = x22 + x23; + T x25 = x21 + x24; + T x26 = rho*(x19 + x25 + T{9.00000000000000}*(x16*x16)); + T x30 = x20 + x24; + T x31 = rho*(x29 + x30 + T{9.00000000000000}*(x28*x28)); + T x34 = rho*(x19 + x30 + x33) + rho*(x25 + x29 + x33) - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7]; + T x35 = x13 + x14 - x26 - x31 + x34; + T x36 = x12*x35; + T x39 = x15 + u_2; + T x40 = T{6.00000000000000}*u_2; + T x41 = x17 + x40; + T x42 = rho*(x25 + x41 + T{9.00000000000000}*(x39*x39)); + T x43 = -u_2; + T x44 = x43 + u_0; + T x45 = -x40; + T x46 = x17 + x45; + T x47 = rho*(x30 + x46 + T{9.00000000000000}*(x44*x44)); + T x48 = u_0 + u_2; + T x49 = T{9.00000000000000}*(x48*x48); + T x50 = rho*(x25 + x46 + x49) + rho*(x30 + x41 + x49) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3]; + T x51 = x37 + x38 - x42 - x47 + x50; + T x52 = x12*x51; + T x54 = x24 + T{6.00000000000000}*x8; + T x55 = -x13 - x14 + x26 + x31 + x34; + T x56 = -x37 - x38 + x42 + x47 + x50; + T x57 = x53*(x20 + x54) + x53*(x21 + x54) + x55 + x56 - T{72.0000000000000}*f_curr[10] - T{72.0000000000000}*f_curr[8]; + T x60 = x27 + u_2; + T x61 = x24 + x29; + T x62 = rho*(x40 + x61 + T{9.00000000000000}*(x60*x60)); + T x63 = x43 + u_1; + T x64 = x19 + x24; + T x65 = rho*(x45 + x64 + T{9.00000000000000}*(x63*x63)); + T x66 = u_1 + u_2; + T x67 = T{9.00000000000000}*(x66*x66); + T x68 = rho*(x40 + x64 + x67) + rho*(x45 + x61 + x67) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18]; + T x69 = x58 + x59 - x62 - x65 + x68; + T x70 = x23 + T{6.00000000000000}*x9; + T x71 = -x58 - x59 + x62 + x65 + x68; + T x72 = x53*(x19 + x70) + x53*(x29 + x70) + x55 + x71 - T{72.0000000000000}*f_curr[12] - T{72.0000000000000}*f_curr[6]; + T x73 = T{6.00000000000000}*x10 + x22 + T{2.00000000000000}; + T x74 = x53*(x41 + x73) + x53*(x46 + x73) + x56 + x71 - T{72.0000000000000}*f_curr[16] - T{72.0000000000000}*f_curr[2]; + T x75 = ((x35*u_0 + x69*u_2 + x72*u_1)*u_1 + (x35*u_1 + x51*u_2 + x57*u_0)*u_0 + (x51*u_0 + x69*u_1 + x74*u_2)*u_2)/x11; + T x76 = x12*x69; + T n_0 = -T{0.0277777777777778}*x12*x57*u_0 - T{0.0277777777777778}*x36*u_1 - T{0.0277777777777778}*x52*u_2 + T{0.0277777777777778}*x75*u_0; + T n_1 = -T{0.0277777777777778}*x12*x72*u_1 - T{0.0277777777777778}*x36*u_0 + T{0.0277777777777778}*x75*u_1 - T{0.0277777777777778}*x76*u_2; + T n_2 = -T{0.0277777777777778}*x12*x74*u_2 - T{0.0277777777777778}*x52*u_0 + T{0.0277777777777778}*x75*u_2 - T{0.0277777777777778}*x76*u_1; cell_rho[gid] = rho; diff --git a/tangle/LLBM/kernel/smagorinsky_collide.h b/tangle/LLBM/kernel/smagorinsky_collide.h index 3489479..a7355c5 100644 --- a/tangle/LLBM/kernel/smagorinsky_collide.h +++ b/tangle/LLBM/kernel/smagorinsky_collide.h @@ -75,13 +75,13 @@ __device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std: T x7 = f_curr[0] - f_curr[18]; T x8 = T{72.0000000000000}*f_curr[5]; T x9 = T{72.0000000000000}*f_curr[13]; - T x42 = T{72.0000000000000}*f_curr[1]; - T x43 = T{72.0000000000000}*f_curr[17]; - T x61 = T{72.0000000000000}*f_curr[4]; - T x62 = T{72.0000000000000}*f_curr[14]; + T x43 = T{72.0000000000000}*f_curr[1]; + T x44 = T{72.0000000000000}*f_curr[17]; + T x63 = T{72.0000000000000}*f_curr[4]; + T x64 = T{72.0000000000000}*f_curr[14]; T rho = x3; - T x74 = T{2.00000000000000}*rho; - T x89 = T{2.00000000000000}*rho; + T x77 = T{2.00000000000000}*rho; + T x95 = T{2.00000000000000}*rho; T u_0 = x6*(x0 + x4 + x5 - f_curr[1] - f_curr[5] - f_curr[8]); T x12 = -u_0; T x14 = T{6.00000000000000}*u_0; @@ -89,7 +89,7 @@ __device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std: T x18 = T{3.00000000000000}*x17; T x26 = -x14; T x35 = T{2.00000000000000} - x18; - T x75 = T{6.00000000000000}*x17; + T x78 = T{6.00000000000000}*x17; T u_1 = x6*(x1 + x4 + x7 - f_curr[12] - f_curr[13] - f_curr[4]); T x10 = T{6.00000000000000}*u_1; T x11 = -x10; @@ -103,7 +103,7 @@ __device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std: T x33 = T{9.00000000000000}*(x32*x32); T x34 = -x20; T x36 = x34 + x35; - T x81 = T{6.00000000000000}*x19; + T x85 = T{6.00000000000000}*x19; T u_2 = x6*(x2 + x5 + x7 - f_curr[14] - f_curr[16] - f_curr[17]); T x15 = u_2*u_2; T x16 = T{3.00000000000000}*x15; @@ -119,65 +119,71 @@ __device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std: T x39 = x14 + x33 + x36 + x38; T x40 = x14 + x29 - x33; T x41 = rho*x39 - rho*x40 - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7]; - T x44 = T{6.00000000000000}*u_2; - T x45 = -x44; - T x46 = x12 + u_2; - T x47 = x23 + x45 - T{9.00000000000000}*x46*x46; - T x48 = rho*x47; - T x49 = -u_2; - T x50 = x49 + u_0; - T x51 = x22 + x44; - T x52 = x26 + x51 - T{9.00000000000000}*x50*x50; - T x53 = rho*x52; - T x54 = u_0 + u_2; - T x55 = T{9.00000000000000}*(x54*x54); - T x56 = x36 + x44; - T x57 = x14 + x37; - T x58 = x55 + x56 + x57; - T x59 = x14 + x51 - x55; - T x60 = rho*x58 - rho*x59 - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3]; - T x63 = x27 + u_2; - T x64 = x29 + x45 - T{9.00000000000000}*x63*x63; - T x65 = rho*x64; - T x66 = x49 + u_1; - T x67 = x11 + x51 - T{9.00000000000000}*x66*x66; - T x68 = rho*x67; - T x69 = u_1 + u_2; - T x70 = T{9.00000000000000}*(x69*x69); - T x71 = x38 + x56 + x70; - T x72 = x29 + x44 - x70; - T x73 = rho*x71 - rho*x72 - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18]; - T x76 = x16 + T{-2.00000000000000}; - T x77 = x14 + x20 - x75 + x76; - T x78 = x34 + x57 + x75 + T{2.00000000000000}; - T x79 = -x42 - x43 - x48 - x53 + x60; - T x80 = -x25 - x31 + x41 - x8 - x9; - T x82 = x10 + x18 + x76 - x81; - T x83 = x35 + x38 + x81; - T x84 = -x61 - x62 - x65 - x68 + x73; - T x85 = T{6.00000000000000}*x15; - T x86 = x21 + x44 - x85; - T x87 = x56 + x85; - T x88 = T{1} / (tau + sqrt(T{0.707106781186548}*(smagorinsky*smagorinsky)*sqrt((x25 + x31 + x41 + x8 + x9)*(x25 + x31 + x41 + x8 + x9) + (x42 + x43 + x48 + x53 + x60)*(x42 + x43 + x48 + x53 + x60) + (x61 + x62 + x65 + x68 + x73)*(x61 + x62 + x65 + x68 + x73) + T{0.500000000000000}*((-x74*x77 + x74*x78 + x79 + x80 - 72*f_curr[10] - 72*f_curr[8])*(-x74*x77 + x74*x78 + x79 + x80 - 72*f_curr[10] - 72*f_curr[8])) + T{0.500000000000000}*((-x74*x82 + x74*x83 + x80 + x84 - 72*f_curr[12] - 72*f_curr[6])*(-x74*x82 + x74*x83 + x80 + x84 - 72*f_curr[12] - 72*f_curr[6])) + T{0.500000000000000}*((-x74*x86 + x74*x87 + x79 + x84 - 72*f_curr[16] - 72*f_curr[2])*(-x74*x86 + x74*x87 + x79 + x84 - 72*f_curr[16] - 72*f_curr[2]))) + tau*tau)); - f_next[0] = T{0.0138888888888889}*x88*(x71*x89 - T{144.000000000000}*f_curr[0]) + f_curr[0]; - f_next[1] = -T{0.0138888888888889}*x88*(x47*x89 + T{144.000000000000}*f_curr[1]) + f_curr[1]; - f_next[2] = T{0.0277777777777778}*x88*(x87*x89 - T{72.0000000000000}*f_curr[2]) + f_curr[2]; - f_next[3] = T{0.0138888888888889}*x88*(x58*x89 - T{144.000000000000}*f_curr[3]) + f_curr[3]; - f_next[4] = -T{0.0138888888888889}*x88*(x64*x89 + T{144.000000000000}*f_curr[4]) + f_curr[4]; - f_next[5] = -T{0.0138888888888889}*x88*(x24*x89 + T{144.000000000000}*f_curr[5]) + f_curr[5]; - f_next[6] = T{0.0277777777777778}*x88*(x83*x89 - T{72.0000000000000}*f_curr[6]) + f_curr[6]; - f_next[7] = T{0.0138888888888889}*x88*(x39*x89 - T{144.000000000000}*f_curr[7]) + f_curr[7]; - f_next[8] = -T{0.0277777777777778}*x88*(x77*x89 + T{72.0000000000000}*f_curr[8]) + f_curr[8]; - f_next[9] = -T{0.166666666666667}*x88*(x22*x89 + T{12.0000000000000}*f_curr[9]) + f_curr[9]; - f_next[10] = T{0.0277777777777778}*x88*(x78*x89 - T{72.0000000000000}*f_curr[10]) + f_curr[10]; - f_next[11] = -T{0.0138888888888889}*x88*(x40*x89 + T{144.000000000000}*f_curr[11]) + f_curr[11]; - f_next[12] = -T{0.0277777777777778}*x88*(x82*x89 + T{72.0000000000000}*f_curr[12]) + f_curr[12]; - f_next[13] = -T{0.0138888888888889}*x88*(x30*x89 + T{144.000000000000}*f_curr[13]) + f_curr[13]; - f_next[14] = -T{0.0138888888888889}*x88*(x67*x89 + T{144.000000000000}*f_curr[14]) + f_curr[14]; - f_next[15] = -T{0.0138888888888889}*x88*(x59*x89 + T{144.000000000000}*f_curr[15]) + f_curr[15]; - f_next[16] = -T{0.0277777777777778}*x88*(x86*x89 + T{72.0000000000000}*f_curr[16]) + f_curr[16]; - f_next[17] = -T{0.0138888888888889}*x88*(x52*x89 + T{144.000000000000}*f_curr[17]) + f_curr[17]; - f_next[18] = -T{0.0138888888888889}*x88*(x72*x89 + T{144.000000000000}*f_curr[18]) + f_curr[18]; + T x42 = x25 + x31 + x41 + x8 + x9; + T x45 = T{6.00000000000000}*u_2; + T x46 = -x45; + T x47 = x12 + u_2; + T x48 = x23 + x46 - T{9.00000000000000}*x47*x47; + T x49 = rho*x48; + T x50 = -u_2; + T x51 = x50 + u_0; + T x52 = x22 + x45; + T x53 = x26 + x52 - T{9.00000000000000}*x51*x51; + T x54 = rho*x53; + T x55 = u_0 + u_2; + T x56 = T{9.00000000000000}*(x55*x55); + T x57 = x36 + x45; + T x58 = x14 + x37; + T x59 = x56 + x57 + x58; + T x60 = x14 + x52 - x56; + T x61 = rho*x59 - rho*x60 - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3]; + T x62 = x43 + x44 + x49 + x54 + x61; + T x65 = x27 + u_2; + T x66 = x29 + x46 - T{9.00000000000000}*x65*x65; + T x67 = rho*x66; + T x68 = x50 + u_1; + T x69 = x11 + x52 - T{9.00000000000000}*x68*x68; + T x70 = rho*x69; + T x71 = u_1 + u_2; + T x72 = T{9.00000000000000}*(x71*x71); + T x73 = x38 + x57 + x72; + T x74 = x29 + x45 - x72; + T x75 = rho*x73 - rho*x74 - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18]; + T x76 = x63 + x64 + x67 + x70 + x75; + T x79 = x16 + T{-2.00000000000000}; + T x80 = x14 + x20 - x78 + x79; + T x81 = x34 + x58 + x78 + T{2.00000000000000}; + T x82 = -x43 - x44 - x49 - x54 + x61; + T x83 = -x25 - x31 + x41 - x8 - x9; + T x84 = -x77*x80 + x77*x81 + x82 + x83 - T{72.0000000000000}*f_curr[10] - T{72.0000000000000}*f_curr[8]; + T x86 = x10 + x18 + x79 - x85; + T x87 = x35 + x38 + x85; + T x88 = -x63 - x64 - x67 - x70 + x75; + T x89 = -x77*x86 + x77*x87 + x83 + x88 - T{72.0000000000000}*f_curr[12] - T{72.0000000000000}*f_curr[6]; + T x90 = T{6.00000000000000}*x15; + T x91 = x21 + x45 - x90; + T x92 = x57 + x90; + T x93 = -x77*x91 + x77*x92 + x82 + x88 - T{72.0000000000000}*f_curr[16] - T{72.0000000000000}*f_curr[2]; + T x94 = T{1} / (tau + sqrt(T{0.707106781186548}*(smagorinsky*smagorinsky)*sqrt(x42*x42 + x62*x62 + x76*x76 + T{0.500000000000000}*(x84*x84) + T{0.500000000000000}*(x89*x89) + T{0.500000000000000}*(x93*x93)) + tau*tau)); + f_next[0] = T{0.0138888888888889}*x94*(x73*x95 - T{144.000000000000}*f_curr[0]) + f_curr[0]; + f_next[1] = -T{0.0138888888888889}*x94*(x48*x95 + T{144.000000000000}*f_curr[1]) + f_curr[1]; + f_next[2] = T{0.0277777777777778}*x94*(x92*x95 - T{72.0000000000000}*f_curr[2]) + f_curr[2]; + f_next[3] = T{0.0138888888888889}*x94*(x59*x95 - T{144.000000000000}*f_curr[3]) + f_curr[3]; + f_next[4] = -T{0.0138888888888889}*x94*(x66*x95 + T{144.000000000000}*f_curr[4]) + f_curr[4]; + f_next[5] = -T{0.0138888888888889}*x94*(x24*x95 + T{144.000000000000}*f_curr[5]) + f_curr[5]; + f_next[6] = T{0.0277777777777778}*x94*(x87*x95 - T{72.0000000000000}*f_curr[6]) + f_curr[6]; + f_next[7] = T{0.0138888888888889}*x94*(x39*x95 - T{144.000000000000}*f_curr[7]) + f_curr[7]; + f_next[8] = -T{0.0277777777777778}*x94*(x80*x95 + T{72.0000000000000}*f_curr[8]) + f_curr[8]; + f_next[9] = -T{0.166666666666667}*x94*(x22*x95 + T{12.0000000000000}*f_curr[9]) + f_curr[9]; + f_next[10] = T{0.0277777777777778}*x94*(x81*x95 - T{72.0000000000000}*f_curr[10]) + f_curr[10]; + f_next[11] = -T{0.0138888888888889}*x94*(x40*x95 + T{144.000000000000}*f_curr[11]) + f_curr[11]; + f_next[12] = -T{0.0277777777777778}*x94*(x86*x95 + T{72.0000000000000}*f_curr[12]) + f_curr[12]; + f_next[13] = -T{0.0138888888888889}*x94*(x30*x95 + T{144.000000000000}*f_curr[13]) + f_curr[13]; + f_next[14] = -T{0.0138888888888889}*x94*(x69*x95 + T{144.000000000000}*f_curr[14]) + f_curr[14]; + f_next[15] = -T{0.0138888888888889}*x94*(x60*x95 + T{144.000000000000}*f_curr[15]) + f_curr[15]; + f_next[16] = -T{0.0277777777777778}*x94*(x91*x95 + T{72.0000000000000}*f_curr[16]) + f_curr[16]; + f_next[17] = -T{0.0138888888888889}*x94*(x53*x95 + T{144.000000000000}*f_curr[17]) + f_curr[17]; + f_next[18] = -T{0.0138888888888889}*x94*(x74*x95 + T{144.000000000000}*f_curr[18]) + f_curr[18]; } }; |