summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Adrian Kummerlaender, 2021-06-10 18:47:13 +0200
committer: Adrian Kummerlaender, 2021-06-10 18:47:13 +0200
commit: ad2f632fb355c9bc91246552d97c7c1a4304ec99 (patch)
tree: b499810b669b18dd07c4ed4451df81adb0bce120
parent: 4ec94c97879aafef15f7663135745e4ba61e62cf (diff)
download: LiterateLB-ad2f632fb355c9bc91246552d97c7c1a4304ec99.tar
LiterateLB-ad2f632fb355c9bc91246552d97c7c1a4304ec99.tar.gz
LiterateLB-ad2f632fb355c9bc91246552d97c7c1a4304ec99.tar.bz2
LiterateLB-ad2f632fb355c9bc91246552d97c7c1a4304ec99.tar.lz
LiterateLB-ad2f632fb355c9bc91246552d97c7c1a4304ec99.tar.xz
LiterateLB-ad2f632fb355c9bc91246552d97c7c1a4304ec99.tar.zst
LiterateLB-ad2f632fb355c9bc91246552d97c7c1a4304ec99.zip
Update GPU results of new system
-rw-r--r--  CMakeLists.txt  2
-rw-r--r--  lbm.org  104
-rw-r--r--  tangle/LLBM/kernel/collect_q_criterion.h  72
-rw-r--r--  tangle/LLBM/kernel/collect_shear_layer_normal.h  147
-rw-r--r--  tangle/LLBM/kernel/smagorinsky_collide.h  140
5 files changed, 242 insertions, 223 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index abe1087..ba28533 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,7 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream per-thread --extended-lambda")
-set(CMAKE_CUDA_ARCHITECTURES 50)
+set(CMAKE_CUDA_ARCHITECTURES 62)
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/tangle
diff --git a/lbm.org b/lbm.org
index 6df9a50..7732c3a 100644
--- a/lbm.org
+++ b/lbm.org
@@ -270,6 +270,7 @@ a custom =ReplaceOptim= structure during the CSE optimization step that conditio
#+BEGIN_SRC python :session :results none
from sympy.codegen.rewriting import ReplaceOptim
+from sympy.simplify import cse_main
expand_pos_square = ReplaceOptim(
lambda e: e.is_Pow and e.exp.is_integer and e.exp == 2,
@@ -3666,44 +3667,42 @@ T x1 = f_curr[3] + f_curr[6];
T x2 = x0 + x1 + f_curr[0] + f_curr[4] + f_curr[5] + f_curr[7] + f_curr[8];
T x3 = f_curr[0] - f_curr[8];
T x4 = T{1} / (x2);
-T x10 = T{72.0000000000000}*f_curr[2];
-T x11 = T{72.0000000000000}*f_curr[6];
+T x9 = T{72.0000000000000}*f_curr[2];
+T x10 = T{72.0000000000000}*f_curr[6];
T rho = x2;
-T x31 = T{4.00000000000000}*rho;
+T x29 = T{4.00000000000000}*rho;
T u_0 = -x4*(x0 + x3 - f_curr[6] - f_curr[7]);
-T x6 = u_0*u_0;
-T x13 = -T{3.00000000000000}*x6;
-T x16 = T{6.00000000000000}*u_0;
-T x17 = -x16;
-T x30 = T{0.0277777777777778}*u_0;
+T x5 = u_0*u_0;
+T x12 = -T{3.00000000000000}*x5;
+T x15 = T{6.00000000000000}*u_0;
+T x16 = -x15;
T u_1 = -x4*(x1 + x3 - f_curr[2] - f_curr[5]);
-T x5 = T{0.0277777777777778}*u_1;
-T x7 = u_1*u_1;
-T x8 = x6 + x7;
-T x9 = pow(x8, T{-0.500000000000000});
-T x12 = -u_0 + u_1;
-T x14 = T{6.00000000000000}*u_1;
-T x15 = x13 + x14;
-T x18 = T{2.00000000000000} - T{3.00000000000000}*x7;
-T x19 = x17 + x18;
-T x20 = rho*(x15 + x19 + T{9.00000000000000}*(x12*x12));
-T x21 = u_0 - u_1;
-T x22 = x13 - x14;
-T x23 = x16 + x18;
-T x24 = rho*(x22 + x23 + T{9.00000000000000}*(x21*x21));
-T x25 = u_0 + u_1;
-T x26 = T{9.00000000000000}*(x25*x25);
-T x27 = rho*(x15 + x23 + x26) + rho*(x19 + x22 + x26) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[8];
-T x28 = x10 + x11 - x20 - x24 + x27;
-T x29 = x28*x9;
-T x32 = x18 + T{6.00000000000000}*x6;
-T x33 = -x10 - x11 + x20 + x24 + x27;
-T x34 = x31*(x16 + x32) + x31*(x17 + x32) + x33 - T{72.0000000000000}*f_curr[1] - T{72.0000000000000}*f_curr[7];
-T x35 = T{6.00000000000000}*x7 + T{2.00000000000000};
-T x36 = x31*(x15 + x35) + x31*(x22 + x35) + x33 - T{72.0000000000000}*f_curr[3] - T{72.0000000000000}*f_curr[5];
-T x37 = ((x28*u_0 + x36*u_1)*u_1 + (x28*u_1 + x34*u_0)*u_0)/x8;
-T n_0 = -x29*x5 - x30*x34*x9 + x30*x37;
-T n_1 = -x29*x30 - x36*x5*x9 + x37*x5;
+T x6 = u_1*u_1;
+T x7 = x5 + x6;
+T x8 = pow(x7, T{-0.500000000000000});
+T x11 = -u_0 + u_1;
+T x13 = T{6.00000000000000}*u_1;
+T x14 = x12 + x13;
+T x17 = T{2.00000000000000} - T{3.00000000000000}*x6;
+T x18 = x16 + x17;
+T x19 = rho*(x14 + x18 + T{9.00000000000000}*(x11*x11));
+T x20 = u_0 - u_1;
+T x21 = x12 - x13;
+T x22 = x15 + x17;
+T x23 = rho*(x21 + x22 + T{9.00000000000000}*(x20*x20));
+T x24 = u_0 + u_1;
+T x25 = T{9.00000000000000}*(x24*x24);
+T x26 = rho*(x14 + x22 + x25) + rho*(x18 + x21 + x25) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[8];
+T x27 = x10 - x19 - x23 + x26 + x9;
+T x28 = x27*x8;
+T x30 = x17 + T{6.00000000000000}*x5;
+T x31 = -x10 + x19 + x23 + x26 - x9;
+T x32 = x29*(x15 + x30) + x29*(x16 + x30) + x31 - T{72.0000000000000}*f_curr[1] - T{72.0000000000000}*f_curr[7];
+T x33 = T{6.00000000000000}*x6 + T{2.00000000000000};
+T x34 = x29*(x14 + x33) + x29*(x21 + x33) + x31 - T{72.0000000000000}*f_curr[3] - T{72.0000000000000}*f_curr[5];
+T x35 = ((x27*u_0 + x34*u_1)*u_1 + (x27*u_1 + x32*u_0)*u_0)/x7;
+T n_0 = -T{0.0277777777777778}*x28*u_1 - T{0.0277777777777778}*x32*x8*u_0 + T{0.0277777777777778}*x35*u_0;
+T n_1 = -T{0.0277777777777778}*x28*u_0 - T{0.0277777777777778}*x34*x8*u_1 + T{0.0277777777777778}*x35*u_1;
#+end_example
*** Determine shear layer visibility
@@ -3877,11 +3876,14 @@ T x17 = u_0 + u_1;
T x18 = T{9.00000000000000}*(x17*x17);
T x19 = x11 + x9 + T{-2.00000000000000};
T x20 = rho*(x14 + x18 + x2) - rho*(-x18 + x19 + x2 + x7) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[8];
-T x21 = T{4.00000000000000}*rho;
-T x22 = T{6.00000000000000}*x10 + x8;
-T x23 = -x0 - x1 + x15 + x16 + x20;
-T x24 = T{6.00000000000000}*x6;
-T strain = T{0.0277777777777778}*sqrt((x0 + x1 - x15 - x16 + x20)*(x0 + x1 - x15 - x16 + x20) + T{0.500000000000000}*((-x21*(x19 - x24) + x21*(x13 + x24 + 2) + x23 - 72*f_curr[3] - 72*f_curr[5])*(-x21*(x19 - x24) + x21*(x13 + x24 + 2) + x23 - 72*f_curr[3] - 72*f_curr[5])) + T{0.500000000000000}*((x21*(x2 + x22) + x21*(x22 + x3) + x23 - 72*f_curr[1] - 72*f_curr[7])*(x21*(x2 + x22) + x21*(x22 + x3) + x23 - 72*f_curr[1] - 72*f_curr[7])));
+T x21 = x0 + x1 - x15 - x16 + x20;
+T x22 = T{4.00000000000000}*rho;
+T x23 = T{6.00000000000000}*x10 + x8;
+T x24 = -x0 - x1 + x15 + x16 + x20;
+T x25 = x22*(x2 + x23) + x22*(x23 + x3) + x24 - T{72.0000000000000}*f_curr[1] - T{72.0000000000000}*f_curr[7];
+T x26 = T{6.00000000000000}*x6;
+T x27 = -x22*(x19 - x26) + x22*(x13 + x26 + T{2.00000000000000}) + x24 - T{72.0000000000000}*f_curr[3] - T{72.0000000000000}*f_curr[5];
+T strain = T{0.0277777777777778}*sqrt(x21*x21 + T{0.500000000000000}*(x25*x25) + T{0.500000000000000}*(x27*x27));
#+end_example
#+BEGIN_SRC cpp :tangle tangle/LLBM/kernel/collect_q_criterion.h
@@ -5788,21 +5790,29 @@ nvidia-smi --query-gpu=name --format=csv,noheader
#+END_SRC
#+RESULTS:
-: GeForce RTX 2070
+: GeForce RTX 3070
#+NAME: benchmark-ldc
-#+BEGIN_SRC bash :dir build :eval query :var min=64 :var max=128 :var step=16 :var nSteps=1000 :async t
+#+BEGIN_SRC bash :dir build :eval query :var min=64 :var max=256 :var step=16 :var nSteps=1000 :async t
for n in $(seq $min $step $max); do
./benchmark-ldc $n $nSteps
done
#+END_SRC
#+RESULTS: benchmark-ldc
-| 4 | 64 | 1000 | 2299.85 |
-| 4 | 80 | 1000 | 2321.52 |
-| 4 | 96 | 1000 | 2456.87 |
-| 4 | 112 | 1000 | 2456.11 |
-| 4 | 128 | 1000 | 2462.46 |
+| 4 | 64 | 1000 | 2416.56 |
+| 4 | 80 | 1000 | 2471.92 |
+| 4 | 96 | 1000 | 2534.33 |
+| 4 | 112 | 1000 | 2512.18 |
+| 4 | 128 | 1000 | 2569.58 |
+| 4 | 144 | 1000 | 2541.29 |
+| 4 | 160 | 1000 | 2599.92 |
+| 4 | 176 | 1000 | 2499.82 |
+| 4 | 192 | 1000 | 2513.63 |
+| 4 | 208 | 1000 | 2492.54 |
+| 4 | 224 | 1000 | 2533.04 |
+| 4 | 240 | 1000 | 2561.12 |
+| 4 | 256 | 1000 | 2511.97 |
* Open tasks
:properties:
diff --git a/tangle/LLBM/kernel/collect_q_criterion.h b/tangle/LLBM/kernel/collect_q_criterion.h
index 19b7f68..fa19dc7 100644
--- a/tangle/LLBM/kernel/collect_q_criterion.h
+++ b/tangle/LLBM/kernel/collect_q_criterion.h
@@ -56,39 +56,45 @@ __device__ static void apply(
T x28 = -x13;
T x29 = x2 + x28;
T x30 = -rho*(x14 + x21 - x24) + rho*(x14 + x24 + x27 + x29) - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7];
- T x31 = T{72.0000000000000}*f_curr[1];
- T x32 = T{72.0000000000000}*f_curr[17];
- T x33 = x4 + u_2;
- T x34 = T{6.00000000000000}*u_2;
- T x35 = x11 - x34;
- T x36 = rho*(x15 + x35 - T{9.00000000000000}*x33*x33);
- T x37 = -u_2;
- T x38 = x37 + u_0;
- T x39 = x11 + x34;
- T x40 = x13 + x39;
- T x41 = rho*(x17 + x40 - T{9.00000000000000}*x38*x38);
- T x42 = u_0 + u_2;
- T x43 = T{9.00000000000000}*(x42*x42);
- T x44 = x27 + x34;
- T x45 = x14 + x28;
- T x46 = -rho*(x15 + x39 - x43) + rho*(x43 + x44 + x45) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3];
- T x47 = T{72.0000000000000}*f_curr[4];
- T x48 = T{72.0000000000000}*f_curr[14];
- T x49 = x18 + u_2;
- T x50 = rho*(x20 + x35 - T{9.00000000000000}*x49*x49);
- T x51 = x37 + u_1;
- T x52 = rho*(x3 + x40 - T{9.00000000000000}*x51*x51);
- T x53 = u_1 + u_2;
- T x54 = T{9.00000000000000}*(x53*x53);
- T x55 = -rho*(x20 + x39 - x54) + rho*(x29 + x44 + x54) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18];
- T x56 = T{2.00000000000000}*rho;
- T x57 = T{6.00000000000000}*x8;
- T x58 = -x31 - x32 - x36 - x41 + x46;
- T x59 = -x0 - x1 - x16 - x22 + x30;
- T x60 = T{6.00000000000000}*x6;
- T x61 = -x47 - x48 - x50 - x52 + x55;
- T x62 = T{6.00000000000000}*x12;
- T strain = T{0.0277777777777778}*sqrt((x0 + x1 + x16 + x22 + x30)*(x0 + x1 + x16 + x22 + x30) + (x31 + x32 + x36 + x41 + x46)*(x31 + x32 + x36 + x41 + x46) + (x47 + x48 + x50 + x52 + x55)*(x47 + x48 + x50 + x52 + x55) + T{0.500000000000000}*((-x56*(x39 - x62) + x56*(x44 + x62) + x58 + x61 - 72*f_curr[16] - 72*f_curr[2])*(-x56*(x39 - x62) + x56*(x44 + x62) + x58 + x61 - 72*f_curr[16] - 72*f_curr[2])) + T{0.500000000000000}*((-x56*(x10 + x20 - x60) + x56*(x26 + x29 + x60) + x59 + x61 - 72*f_curr[12] - 72*f_curr[6])*(-x56*(x10 + x20 - x60) + x56*(x26 + x29 + x60) + x59 + x61 - 72*f_curr[12] - 72*f_curr[6])) + T{0.500000000000000}*((-x56*(x15 - x57 + x7 - 2) + x56*(x25 + x45 + x57 + 2) + x58 + x59 - 72*f_curr[10] - 72*f_curr[8])*(-x56*(x15 - x57 + x7 - 2) + x56*(x25 + x45 + x57 + 2) + x58 + x59 - 72*f_curr[10] - 72*f_curr[8])));
+ T x31 = x0 + x1 + x16 + x22 + x30;
+ T x32 = T{72.0000000000000}*f_curr[1];
+ T x33 = T{72.0000000000000}*f_curr[17];
+ T x34 = x4 + u_2;
+ T x35 = T{6.00000000000000}*u_2;
+ T x36 = x11 - x35;
+ T x37 = rho*(x15 + x36 - T{9.00000000000000}*x34*x34);
+ T x38 = -u_2;
+ T x39 = x38 + u_0;
+ T x40 = x11 + x35;
+ T x41 = x13 + x40;
+ T x42 = rho*(x17 + x41 - T{9.00000000000000}*x39*x39);
+ T x43 = u_0 + u_2;
+ T x44 = T{9.00000000000000}*(x43*x43);
+ T x45 = x27 + x35;
+ T x46 = x14 + x28;
+ T x47 = -rho*(x15 + x40 - x44) + rho*(x44 + x45 + x46) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3];
+ T x48 = x32 + x33 + x37 + x42 + x47;
+ T x49 = T{72.0000000000000}*f_curr[4];
+ T x50 = T{72.0000000000000}*f_curr[14];
+ T x51 = x18 + u_2;
+ T x52 = rho*(x20 + x36 - T{9.00000000000000}*x51*x51);
+ T x53 = x38 + u_1;
+ T x54 = rho*(x3 + x41 - T{9.00000000000000}*x53*x53);
+ T x55 = u_1 + u_2;
+ T x56 = T{9.00000000000000}*(x55*x55);
+ T x57 = -rho*(x20 + x40 - x56) + rho*(x29 + x45 + x56) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18];
+ T x58 = x49 + x50 + x52 + x54 + x57;
+ T x59 = T{2.00000000000000}*rho;
+ T x60 = T{6.00000000000000}*x8;
+ T x61 = -x32 - x33 - x37 - x42 + x47;
+ T x62 = -x0 - x1 - x16 - x22 + x30;
+ T x63 = -x59*(x15 - x60 + x7 + T{-2.00000000000000}) + x59*(x25 + x46 + x60 + T{2.00000000000000}) + x61 + x62 - T{72.0000000000000}*f_curr[10] - T{72.0000000000000}*f_curr[8];
+ T x64 = T{6.00000000000000}*x6;
+ T x65 = -x49 - x50 - x52 - x54 + x57;
+ T x66 = -x59*(x10 + x20 - x64) + x59*(x26 + x29 + x64) + x62 + x65 - T{72.0000000000000}*f_curr[12] - T{72.0000000000000}*f_curr[6];
+ T x67 = T{6.00000000000000}*x12;
+ T x68 = -x59*(x40 - x67) + x59*(x45 + x67) + x61 + x65 - T{72.0000000000000}*f_curr[16] - T{72.0000000000000}*f_curr[2];
+ T strain = T{0.0277777777777778}*sqrt(x31*x31 + x48*x48 + x58*x58 + T{0.500000000000000}*(x63*x63) + T{0.500000000000000}*(x66*x66) + T{0.500000000000000}*(x68*x68));
float vorticity = cell_curl_norm[gid];
float q = vorticity*vorticity - strain*strain;
diff --git a/tangle/LLBM/kernel/collect_shear_layer_normal.h b/tangle/LLBM/kernel/collect_shear_layer_normal.h
index 7bf6eff..13e839a 100644
--- a/tangle/LLBM/kernel/collect_shear_layer_normal.h
+++ b/tangle/LLBM/kernel/collect_shear_layer_normal.h
@@ -22,85 +22,82 @@ __device__ static void apply(
T x5 = -f_curr[15] + f_curr[3];
T x6 = T{1} / (x3);
T x7 = f_curr[0] - f_curr[18];
- T x14 = T{72.0000000000000}*f_curr[5];
- T x15 = T{72.0000000000000}*f_curr[13];
- T x39 = T{72.0000000000000}*f_curr[1];
- T x40 = T{72.0000000000000}*f_curr[17];
- T x61 = T{72.0000000000000}*f_curr[4];
- T x62 = T{72.0000000000000}*f_curr[14];
+ T x13 = T{72.0000000000000}*f_curr[5];
+ T x14 = T{72.0000000000000}*f_curr[13];
+ T x37 = T{72.0000000000000}*f_curr[1];
+ T x38 = T{72.0000000000000}*f_curr[17];
+ T x58 = T{72.0000000000000}*f_curr[4];
+ T x59 = T{72.0000000000000}*f_curr[14];
T rho = x3;
- T x56 = T{2.00000000000000}*rho;
+ T x53 = T{2.00000000000000}*rho;
T u_0 = x6*(x0 + x4 + x5 - f_curr[1] - f_curr[5] - f_curr[8]);
- T x9 = u_0*u_0;
- T x16 = -u_0;
- T x18 = -T{3.00000000000000}*x9;
- T x21 = T{6.00000000000000}*u_0;
- T x22 = -x21;
- T x55 = T{0.0277777777777778}*u_0;
+ T x8 = u_0*u_0;
+ T x15 = -u_0;
+ T x17 = -T{3.00000000000000}*x8;
+ T x20 = T{6.00000000000000}*u_0;
+ T x21 = -x20;
T u_1 = x6*(x1 + x4 + x7 - f_curr[12] - f_curr[13] - f_curr[4]);
- T x8 = T{0.0277777777777778}*u_1;
- T x10 = u_1*u_1;
- T x17 = x16 + u_1;
- T x19 = T{6.00000000000000}*u_1;
- T x20 = x18 + x19;
- T x23 = -T{3.00000000000000}*x10;
- T x28 = -u_1;
- T x29 = x28 + u_0;
- T x30 = x18 - x19;
- T x33 = u_0 + u_1;
- T x34 = T{9.00000000000000}*(x33*x33);
+ T x9 = u_1*u_1;
+ T x16 = x15 + u_1;
+ T x18 = T{6.00000000000000}*u_1;
+ T x19 = x17 + x18;
+ T x22 = -T{3.00000000000000}*x9;
+ T x27 = -u_1;
+ T x28 = x27 + u_0;
+ T x29 = x17 - x18;
+ T x32 = u_0 + u_1;
+ T x33 = T{9.00000000000000}*(x32*x32);
T u_2 = x6*(x2 + x5 + x7 - f_curr[14] - f_curr[16] - f_curr[17]);
- T x11 = u_2*u_2;
- T x12 = x10 + x11 + x9;
- T x13 = pow(x12, T{-0.500000000000000});
- T x24 = T{2.00000000000000} - T{3.00000000000000}*x11;
- T x25 = x23 + x24;
- T x26 = x22 + x25;
- T x27 = rho*(x20 + x26 + T{9.00000000000000}*(x17*x17));
- T x31 = x21 + x25;
- T x32 = rho*(x30 + x31 + T{9.00000000000000}*(x29*x29));
- T x35 = rho*(x20 + x31 + x34) + rho*(x26 + x30 + x34) - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7];
- T x36 = x14 + x15 - x27 - x32 + x35;
- T x37 = x13*x36;
- T x38 = T{0.0277777777777778}*u_2;
- T x41 = x16 + u_2;
- T x42 = T{6.00000000000000}*u_2;
- T x43 = x18 + x42;
- T x44 = rho*(x26 + x43 + T{9.00000000000000}*(x41*x41));
- T x45 = -u_2;
- T x46 = x45 + u_0;
- T x47 = -x42;
- T x48 = x18 + x47;
- T x49 = rho*(x31 + x48 + T{9.00000000000000}*(x46*x46));
- T x50 = u_0 + u_2;
- T x51 = T{9.00000000000000}*(x50*x50);
- T x52 = rho*(x26 + x48 + x51) + rho*(x31 + x43 + x51) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3];
- T x53 = x39 + x40 - x44 - x49 + x52;
- T x54 = x13*x53;
- T x57 = x25 + T{6.00000000000000}*x9;
- T x58 = -x14 - x15 + x27 + x32 + x35;
- T x59 = -x39 - x40 + x44 + x49 + x52;
- T x60 = x56*(x21 + x57) + x56*(x22 + x57) + x58 + x59 - T{72.0000000000000}*f_curr[10] - T{72.0000000000000}*f_curr[8];
- T x63 = x28 + u_2;
- T x64 = x25 + x30;
- T x65 = rho*(x42 + x64 + T{9.00000000000000}*(x63*x63));
- T x66 = x45 + u_1;
- T x67 = x20 + x25;
- T x68 = rho*(x47 + x67 + T{9.00000000000000}*(x66*x66));
- T x69 = u_1 + u_2;
- T x70 = T{9.00000000000000}*(x69*x69);
- T x71 = rho*(x42 + x67 + x70) + rho*(x47 + x64 + x70) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18];
- T x72 = x61 + x62 - x65 - x68 + x71;
- T x73 = T{6.00000000000000}*x10 + x24;
- T x74 = -x61 - x62 + x65 + x68 + x71;
- T x75 = x56*(x20 + x73) + x56*(x30 + x73) + x58 + x74 - T{72.0000000000000}*f_curr[12] - T{72.0000000000000}*f_curr[6];
- T x76 = T{6.00000000000000}*x11 + x23 + T{2.00000000000000};
- T x77 = x56*(x43 + x76) + x56*(x48 + x76) + x59 + x74 - T{72.0000000000000}*f_curr[16] - T{72.0000000000000}*f_curr[2];
- T x78 = ((x36*u_0 + x72*u_2 + x75*u_1)*u_1 + (x36*u_1 + x53*u_2 + x60*u_0)*u_0 + (x53*u_0 + x72*u_1 + x77*u_2)*u_2)/x12;
- T x79 = x13*x72;
- T n_0 = -x13*x55*x60 - x37*x8 - x38*x54 + x55*x78;
- T n_1 = -x13*x75*x8 - x37*x55 - x38*x79 + x78*x8;
- T n_2 = -x13*x38*x77 + x38*x78 - x54*x55 - x79*x8;
+ T x10 = u_2*u_2;
+ T x11 = x10 + x8 + x9;
+ T x12 = pow(x11, T{-0.500000000000000});
+ T x23 = T{2.00000000000000} - T{3.00000000000000}*x10;
+ T x24 = x22 + x23;
+ T x25 = x21 + x24;
+ T x26 = rho*(x19 + x25 + T{9.00000000000000}*(x16*x16));
+ T x30 = x20 + x24;
+ T x31 = rho*(x29 + x30 + T{9.00000000000000}*(x28*x28));
+ T x34 = rho*(x19 + x30 + x33) + rho*(x25 + x29 + x33) - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7];
+ T x35 = x13 + x14 - x26 - x31 + x34;
+ T x36 = x12*x35;
+ T x39 = x15 + u_2;
+ T x40 = T{6.00000000000000}*u_2;
+ T x41 = x17 + x40;
+ T x42 = rho*(x25 + x41 + T{9.00000000000000}*(x39*x39));
+ T x43 = -u_2;
+ T x44 = x43 + u_0;
+ T x45 = -x40;
+ T x46 = x17 + x45;
+ T x47 = rho*(x30 + x46 + T{9.00000000000000}*(x44*x44));
+ T x48 = u_0 + u_2;
+ T x49 = T{9.00000000000000}*(x48*x48);
+ T x50 = rho*(x25 + x46 + x49) + rho*(x30 + x41 + x49) - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3];
+ T x51 = x37 + x38 - x42 - x47 + x50;
+ T x52 = x12*x51;
+ T x54 = x24 + T{6.00000000000000}*x8;
+ T x55 = -x13 - x14 + x26 + x31 + x34;
+ T x56 = -x37 - x38 + x42 + x47 + x50;
+ T x57 = x53*(x20 + x54) + x53*(x21 + x54) + x55 + x56 - T{72.0000000000000}*f_curr[10] - T{72.0000000000000}*f_curr[8];
+ T x60 = x27 + u_2;
+ T x61 = x24 + x29;
+ T x62 = rho*(x40 + x61 + T{9.00000000000000}*(x60*x60));
+ T x63 = x43 + u_1;
+ T x64 = x19 + x24;
+ T x65 = rho*(x45 + x64 + T{9.00000000000000}*(x63*x63));
+ T x66 = u_1 + u_2;
+ T x67 = T{9.00000000000000}*(x66*x66);
+ T x68 = rho*(x40 + x64 + x67) + rho*(x45 + x61 + x67) - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18];
+ T x69 = x58 + x59 - x62 - x65 + x68;
+ T x70 = x23 + T{6.00000000000000}*x9;
+ T x71 = -x58 - x59 + x62 + x65 + x68;
+ T x72 = x53*(x19 + x70) + x53*(x29 + x70) + x55 + x71 - T{72.0000000000000}*f_curr[12] - T{72.0000000000000}*f_curr[6];
+ T x73 = T{6.00000000000000}*x10 + x22 + T{2.00000000000000};
+ T x74 = x53*(x41 + x73) + x53*(x46 + x73) + x56 + x71 - T{72.0000000000000}*f_curr[16] - T{72.0000000000000}*f_curr[2];
+ T x75 = ((x35*u_0 + x69*u_2 + x72*u_1)*u_1 + (x35*u_1 + x51*u_2 + x57*u_0)*u_0 + (x51*u_0 + x69*u_1 + x74*u_2)*u_2)/x11;
+ T x76 = x12*x69;
+ T n_0 = -T{0.0277777777777778}*x12*x57*u_0 - T{0.0277777777777778}*x36*u_1 - T{0.0277777777777778}*x52*u_2 + T{0.0277777777777778}*x75*u_0;
+ T n_1 = -T{0.0277777777777778}*x12*x72*u_1 - T{0.0277777777777778}*x36*u_0 + T{0.0277777777777778}*x75*u_1 - T{0.0277777777777778}*x76*u_2;
+ T n_2 = -T{0.0277777777777778}*x12*x74*u_2 - T{0.0277777777777778}*x52*u_0 + T{0.0277777777777778}*x75*u_2 - T{0.0277777777777778}*x76*u_1;
cell_rho[gid] = rho;
diff --git a/tangle/LLBM/kernel/smagorinsky_collide.h b/tangle/LLBM/kernel/smagorinsky_collide.h
index 3489479..a7355c5 100644
--- a/tangle/LLBM/kernel/smagorinsky_collide.h
+++ b/tangle/LLBM/kernel/smagorinsky_collide.h
@@ -75,13 +75,13 @@ __device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std:
T x7 = f_curr[0] - f_curr[18];
T x8 = T{72.0000000000000}*f_curr[5];
T x9 = T{72.0000000000000}*f_curr[13];
- T x42 = T{72.0000000000000}*f_curr[1];
- T x43 = T{72.0000000000000}*f_curr[17];
- T x61 = T{72.0000000000000}*f_curr[4];
- T x62 = T{72.0000000000000}*f_curr[14];
+ T x43 = T{72.0000000000000}*f_curr[1];
+ T x44 = T{72.0000000000000}*f_curr[17];
+ T x63 = T{72.0000000000000}*f_curr[4];
+ T x64 = T{72.0000000000000}*f_curr[14];
T rho = x3;
- T x74 = T{2.00000000000000}*rho;
- T x89 = T{2.00000000000000}*rho;
+ T x77 = T{2.00000000000000}*rho;
+ T x95 = T{2.00000000000000}*rho;
T u_0 = x6*(x0 + x4 + x5 - f_curr[1] - f_curr[5] - f_curr[8]);
T x12 = -u_0;
T x14 = T{6.00000000000000}*u_0;
@@ -89,7 +89,7 @@ __device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std:
T x18 = T{3.00000000000000}*x17;
T x26 = -x14;
T x35 = T{2.00000000000000} - x18;
- T x75 = T{6.00000000000000}*x17;
+ T x78 = T{6.00000000000000}*x17;
T u_1 = x6*(x1 + x4 + x7 - f_curr[12] - f_curr[13] - f_curr[4]);
T x10 = T{6.00000000000000}*u_1;
T x11 = -x10;
@@ -103,7 +103,7 @@ __device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std:
T x33 = T{9.00000000000000}*(x32*x32);
T x34 = -x20;
T x36 = x34 + x35;
- T x81 = T{6.00000000000000}*x19;
+ T x85 = T{6.00000000000000}*x19;
T u_2 = x6*(x2 + x5 + x7 - f_curr[14] - f_curr[16] - f_curr[17]);
T x15 = u_2*u_2;
T x16 = T{3.00000000000000}*x15;
@@ -119,65 +119,71 @@ __device__ static void apply(descriptor::D3Q19, S f_curr[19], S f_next[19], std:
T x39 = x14 + x33 + x36 + x38;
T x40 = x14 + x29 - x33;
T x41 = rho*x39 - rho*x40 - T{72.0000000000000}*f_curr[11] - T{72.0000000000000}*f_curr[7];
- T x44 = T{6.00000000000000}*u_2;
- T x45 = -x44;
- T x46 = x12 + u_2;
- T x47 = x23 + x45 - T{9.00000000000000}*x46*x46;
- T x48 = rho*x47;
- T x49 = -u_2;
- T x50 = x49 + u_0;
- T x51 = x22 + x44;
- T x52 = x26 + x51 - T{9.00000000000000}*x50*x50;
- T x53 = rho*x52;
- T x54 = u_0 + u_2;
- T x55 = T{9.00000000000000}*(x54*x54);
- T x56 = x36 + x44;
- T x57 = x14 + x37;
- T x58 = x55 + x56 + x57;
- T x59 = x14 + x51 - x55;
- T x60 = rho*x58 - rho*x59 - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3];
- T x63 = x27 + u_2;
- T x64 = x29 + x45 - T{9.00000000000000}*x63*x63;
- T x65 = rho*x64;
- T x66 = x49 + u_1;
- T x67 = x11 + x51 - T{9.00000000000000}*x66*x66;
- T x68 = rho*x67;
- T x69 = u_1 + u_2;
- T x70 = T{9.00000000000000}*(x69*x69);
- T x71 = x38 + x56 + x70;
- T x72 = x29 + x44 - x70;
- T x73 = rho*x71 - rho*x72 - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18];
- T x76 = x16 + T{-2.00000000000000};
- T x77 = x14 + x20 - x75 + x76;
- T x78 = x34 + x57 + x75 + T{2.00000000000000};
- T x79 = -x42 - x43 - x48 - x53 + x60;
- T x80 = -x25 - x31 + x41 - x8 - x9;
- T x82 = x10 + x18 + x76 - x81;
- T x83 = x35 + x38 + x81;
- T x84 = -x61 - x62 - x65 - x68 + x73;
- T x85 = T{6.00000000000000}*x15;
- T x86 = x21 + x44 - x85;
- T x87 = x56 + x85;
- T x88 = T{1} / (tau + sqrt(T{0.707106781186548}*(smagorinsky*smagorinsky)*sqrt((x25 + x31 + x41 + x8 + x9)*(x25 + x31 + x41 + x8 + x9) + (x42 + x43 + x48 + x53 + x60)*(x42 + x43 + x48 + x53 + x60) + (x61 + x62 + x65 + x68 + x73)*(x61 + x62 + x65 + x68 + x73) + T{0.500000000000000}*((-x74*x77 + x74*x78 + x79 + x80 - 72*f_curr[10] - 72*f_curr[8])*(-x74*x77 + x74*x78 + x79 + x80 - 72*f_curr[10] - 72*f_curr[8])) + T{0.500000000000000}*((-x74*x82 + x74*x83 + x80 + x84 - 72*f_curr[12] - 72*f_curr[6])*(-x74*x82 + x74*x83 + x80 + x84 - 72*f_curr[12] - 72*f_curr[6])) + T{0.500000000000000}*((-x74*x86 + x74*x87 + x79 + x84 - 72*f_curr[16] - 72*f_curr[2])*(-x74*x86 + x74*x87 + x79 + x84 - 72*f_curr[16] - 72*f_curr[2]))) + tau*tau));
- f_next[0] = T{0.0138888888888889}*x88*(x71*x89 - T{144.000000000000}*f_curr[0]) + f_curr[0];
- f_next[1] = -T{0.0138888888888889}*x88*(x47*x89 + T{144.000000000000}*f_curr[1]) + f_curr[1];
- f_next[2] = T{0.0277777777777778}*x88*(x87*x89 - T{72.0000000000000}*f_curr[2]) + f_curr[2];
- f_next[3] = T{0.0138888888888889}*x88*(x58*x89 - T{144.000000000000}*f_curr[3]) + f_curr[3];
- f_next[4] = -T{0.0138888888888889}*x88*(x64*x89 + T{144.000000000000}*f_curr[4]) + f_curr[4];
- f_next[5] = -T{0.0138888888888889}*x88*(x24*x89 + T{144.000000000000}*f_curr[5]) + f_curr[5];
- f_next[6] = T{0.0277777777777778}*x88*(x83*x89 - T{72.0000000000000}*f_curr[6]) + f_curr[6];
- f_next[7] = T{0.0138888888888889}*x88*(x39*x89 - T{144.000000000000}*f_curr[7]) + f_curr[7];
- f_next[8] = -T{0.0277777777777778}*x88*(x77*x89 + T{72.0000000000000}*f_curr[8]) + f_curr[8];
- f_next[9] = -T{0.166666666666667}*x88*(x22*x89 + T{12.0000000000000}*f_curr[9]) + f_curr[9];
- f_next[10] = T{0.0277777777777778}*x88*(x78*x89 - T{72.0000000000000}*f_curr[10]) + f_curr[10];
- f_next[11] = -T{0.0138888888888889}*x88*(x40*x89 + T{144.000000000000}*f_curr[11]) + f_curr[11];
- f_next[12] = -T{0.0277777777777778}*x88*(x82*x89 + T{72.0000000000000}*f_curr[12]) + f_curr[12];
- f_next[13] = -T{0.0138888888888889}*x88*(x30*x89 + T{144.000000000000}*f_curr[13]) + f_curr[13];
- f_next[14] = -T{0.0138888888888889}*x88*(x67*x89 + T{144.000000000000}*f_curr[14]) + f_curr[14];
- f_next[15] = -T{0.0138888888888889}*x88*(x59*x89 + T{144.000000000000}*f_curr[15]) + f_curr[15];
- f_next[16] = -T{0.0277777777777778}*x88*(x86*x89 + T{72.0000000000000}*f_curr[16]) + f_curr[16];
- f_next[17] = -T{0.0138888888888889}*x88*(x52*x89 + T{144.000000000000}*f_curr[17]) + f_curr[17];
- f_next[18] = -T{0.0138888888888889}*x88*(x72*x89 + T{144.000000000000}*f_curr[18]) + f_curr[18];
+ T x42 = x25 + x31 + x41 + x8 + x9;
+ T x45 = T{6.00000000000000}*u_2;
+ T x46 = -x45;
+ T x47 = x12 + u_2;
+ T x48 = x23 + x46 - T{9.00000000000000}*x47*x47;
+ T x49 = rho*x48;
+ T x50 = -u_2;
+ T x51 = x50 + u_0;
+ T x52 = x22 + x45;
+ T x53 = x26 + x52 - T{9.00000000000000}*x51*x51;
+ T x54 = rho*x53;
+ T x55 = u_0 + u_2;
+ T x56 = T{9.00000000000000}*(x55*x55);
+ T x57 = x36 + x45;
+ T x58 = x14 + x37;
+ T x59 = x56 + x57 + x58;
+ T x60 = x14 + x52 - x56;
+ T x61 = rho*x59 - rho*x60 - T{72.0000000000000}*f_curr[15] - T{72.0000000000000}*f_curr[3];
+ T x62 = x43 + x44 + x49 + x54 + x61;
+ T x65 = x27 + u_2;
+ T x66 = x29 + x46 - T{9.00000000000000}*x65*x65;
+ T x67 = rho*x66;
+ T x68 = x50 + u_1;
+ T x69 = x11 + x52 - T{9.00000000000000}*x68*x68;
+ T x70 = rho*x69;
+ T x71 = u_1 + u_2;
+ T x72 = T{9.00000000000000}*(x71*x71);
+ T x73 = x38 + x57 + x72;
+ T x74 = x29 + x45 - x72;
+ T x75 = rho*x73 - rho*x74 - T{72.0000000000000}*f_curr[0] - T{72.0000000000000}*f_curr[18];
+ T x76 = x63 + x64 + x67 + x70 + x75;
+ T x79 = x16 + T{-2.00000000000000};
+ T x80 = x14 + x20 - x78 + x79;
+ T x81 = x34 + x58 + x78 + T{2.00000000000000};
+ T x82 = -x43 - x44 - x49 - x54 + x61;
+ T x83 = -x25 - x31 + x41 - x8 - x9;
+ T x84 = -x77*x80 + x77*x81 + x82 + x83 - T{72.0000000000000}*f_curr[10] - T{72.0000000000000}*f_curr[8];
+ T x86 = x10 + x18 + x79 - x85;
+ T x87 = x35 + x38 + x85;
+ T x88 = -x63 - x64 - x67 - x70 + x75;
+ T x89 = -x77*x86 + x77*x87 + x83 + x88 - T{72.0000000000000}*f_curr[12] - T{72.0000000000000}*f_curr[6];
+ T x90 = T{6.00000000000000}*x15;
+ T x91 = x21 + x45 - x90;
+ T x92 = x57 + x90;
+ T x93 = -x77*x91 + x77*x92 + x82 + x88 - T{72.0000000000000}*f_curr[16] - T{72.0000000000000}*f_curr[2];
+ T x94 = T{1} / (tau + sqrt(T{0.707106781186548}*(smagorinsky*smagorinsky)*sqrt(x42*x42 + x62*x62 + x76*x76 + T{0.500000000000000}*(x84*x84) + T{0.500000000000000}*(x89*x89) + T{0.500000000000000}*(x93*x93)) + tau*tau));
+ f_next[0] = T{0.0138888888888889}*x94*(x73*x95 - T{144.000000000000}*f_curr[0]) + f_curr[0];
+ f_next[1] = -T{0.0138888888888889}*x94*(x48*x95 + T{144.000000000000}*f_curr[1]) + f_curr[1];
+ f_next[2] = T{0.0277777777777778}*x94*(x92*x95 - T{72.0000000000000}*f_curr[2]) + f_curr[2];
+ f_next[3] = T{0.0138888888888889}*x94*(x59*x95 - T{144.000000000000}*f_curr[3]) + f_curr[3];
+ f_next[4] = -T{0.0138888888888889}*x94*(x66*x95 + T{144.000000000000}*f_curr[4]) + f_curr[4];
+ f_next[5] = -T{0.0138888888888889}*x94*(x24*x95 + T{144.000000000000}*f_curr[5]) + f_curr[5];
+ f_next[6] = T{0.0277777777777778}*x94*(x87*x95 - T{72.0000000000000}*f_curr[6]) + f_curr[6];
+ f_next[7] = T{0.0138888888888889}*x94*(x39*x95 - T{144.000000000000}*f_curr[7]) + f_curr[7];
+ f_next[8] = -T{0.0277777777777778}*x94*(x80*x95 + T{72.0000000000000}*f_curr[8]) + f_curr[8];
+ f_next[9] = -T{0.166666666666667}*x94*(x22*x95 + T{12.0000000000000}*f_curr[9]) + f_curr[9];
+ f_next[10] = T{0.0277777777777778}*x94*(x81*x95 - T{72.0000000000000}*f_curr[10]) + f_curr[10];
+ f_next[11] = -T{0.0138888888888889}*x94*(x40*x95 + T{144.000000000000}*f_curr[11]) + f_curr[11];
+ f_next[12] = -T{0.0277777777777778}*x94*(x86*x95 + T{72.0000000000000}*f_curr[12]) + f_curr[12];
+ f_next[13] = -T{0.0138888888888889}*x94*(x30*x95 + T{144.000000000000}*f_curr[13]) + f_curr[13];
+ f_next[14] = -T{0.0138888888888889}*x94*(x69*x95 + T{144.000000000000}*f_curr[14]) + f_curr[14];
+ f_next[15] = -T{0.0138888888888889}*x94*(x60*x95 + T{144.000000000000}*f_curr[15]) + f_curr[15];
+ f_next[16] = -T{0.0277777777777778}*x94*(x91*x95 + T{72.0000000000000}*f_curr[16]) + f_curr[16];
+ f_next[17] = -T{0.0138888888888889}*x94*(x53*x95 + T{144.000000000000}*f_curr[17]) + f_curr[17];
+ f_next[18] = -T{0.0138888888888889}*x94*(x74*x95 + T{144.000000000000}*f_curr[18]) + f_curr[18];
}
};