From 5ac924371a7e53641a2f726a9f431ab8cb99f9fb Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Sat, 8 Jun 2019 23:08:28 +0200
Subject: Performance optimizations

Starting point: ~200 MLUPS on a NVidia K2200

Changes that did not noticeably impact performance:

* Memory layout AOS vs. SOA (weird, probably highly platform dependent)
* Propagate on read
* Tagging pointers as read / write only
* Manual code inlining

Changes that made things worse:

* Bad thread block sizes

The actual issue:

* Hidden double precision computations

=> Code now yields ~600 MLUPS
---
 implosion.py | 107 ++++++++++++++++++++++++++++-------------------------------
 shell.nix    |   1 +
 2 files changed, 52 insertions(+), 56 deletions(-)

diff --git a/implosion.py b/implosion.py
index 1fe0537..491e2ff 100644
--- a/implosion.py
+++ b/implosion.py
@@ -6,7 +6,7 @@ from string import Template
 import numpy
 import matplotlib.pyplot as plt
 
-from timeit import default_timer as timer
+import time
 
 kernel = """
 float constant w[9] = {
@@ -15,6 +15,8 @@ float constant w[9] = {
     1./36 , 1./9., 1./36.
 };
 
+#define N_CELLS $nX*$nY
+
 unsigned int indexOfDirection(int i, int j) {
     return (i+1) + 3*(1-j);
 }
@@ -25,10 +27,10 @@ unsigned int indexOfCell(int x, int y)
 }
 
 unsigned int idx(int x, int y, int i, int j) {
-    return indexOfDirection(i,j)*$nX*$nY + indexOfCell(x,y);
+    return indexOfDirection(i,j)*N_CELLS + indexOfCell(x,y);
 }
 
-__global float f_i(__global const float* f, int x, int y, int i, int j) {
+__global float f_i(__global __read_only float* f, int x, int y, int i, int j) {
     return f[idx(x,y,i,j)];
 }
 
@@ -41,13 +43,13 @@ float sq(float x) {
 }
 
 float f_eq(float w, float d, float2 v, int i, int j, float dotv) {
-    return w * d * (1 + 3*comp(i,j,v) + 4.5*sq(comp(i,j,v)) - 1.5*dotv);
+    return w * d * (1.f + 3.f*comp(i,j,v) + 4.5f*sq(comp(i,j,v)) - 1.5f*dotv);
 }
 
-__kernel void collide_and_stream(__global       float* f_a,
-                                 __global const float* f_b,
-                                 __global       float* moments,
-                                 __global const int* material)
+__kernel void collide_and_stream(__global __write_only float* f_a,
+                                 __global __read_only  float* f_b,
+                                 __global __write_only float* moments,
+                                 __global __read_only  int* material)
 {
     const unsigned int gid = indexOfCell(get_global_id(0), get_global_id(1));
 
@@ -59,24 +61,17 @@ __kernel void collide_and_stream(__global       float* f_a,
         return;
     }
 
-    float f0 = f_b[0*$nX*$nY + gid];
-    float d = f0;
-    float f1 = f_b[1*$nX*$nY + gid];
-    d += f1;
-    float f2 = f_b[2*$nX*$nY + gid];
-    d += f2;
-    float f3 = f_b[3*$nX*$nY + gid];
-    d += f3;
-    float f4 = f_b[4*$nX*$nY + gid];
-    d += f4;
-    float f5 = f_b[5*$nX*$nY + gid];
-    d += f5;
-    float f6 = f_b[6*$nX*$nY + gid];
-    d += f6;
-    float f7 = f_b[7*$nX*$nY + gid];
-    d += f7;
-    float f8 = f_b[8*$nX*$nY + gid];
-    d += f8;
+    float f0 = f_i(f_b, cell.x+1, cell.y-1, -1, 1);
+    float f1 = f_i(f_b, cell.x  , cell.y-1,  0, 1);
+    float f2 = f_i(f_b, cell.x-1, cell.y-1,  1, 1);
+    float f3 = f_i(f_b, cell.x+1, cell.y  , -1, 0);
+    float f4 = f_i(f_b, cell.x  , cell.y  ,  0, 0);
+    float f5 = f_i(f_b, cell.x-1, cell.y  ,  1, 0);
+    float f6 = f_i(f_b, cell.x+1, cell.y+1, -1,-1);
+    float f7 = f_i(f_b, cell.x  , cell.y+1,  0,-1);
+    float f8 = f_i(f_b, cell.x-1, cell.y+1,  1,-1);
+
+    const float d = f0 + f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8;
 
     const float2 v = (float2)(
         (f5 - f3 + f2 - f6 + f8 - f0) / d,
@@ -84,25 +79,25 @@ __kernel void collide_and_stream(__global       float* f_a,
     );
     const float dotv = dot(v,v);
 
-    f0 = (1.0 - $tau) * (f_i(f_b, cell.x+1, cell.y-1, -1, 1)) + $tau * f_eq(w[0], d,v,-1, 1, dotv);
-    f1 = (1.0 - $tau) * (f_i(f_b, cell.x  , cell.y-1,  0, 1)) + $tau * f_eq(w[1], d,v, 0, 1, dotv);
-    f2 = (1.0 - $tau) * (f_i(f_b, cell.x-1, cell.y-1,  1, 1)) + $tau * f_eq(w[2], d,v, 1, 1, dotv);
-    f3 = (1.0 - $tau) * (f_i(f_b, cell.x+1, cell.y  , -1, 0)) + $tau * f_eq(w[3], d,v,-1, 0, dotv);
-    f4 = (1.0 - $tau) * (f_i(f_b, cell.x  , cell.y  ,  0, 0)) + $tau * f_eq(w[4], d,v, 0, 0, dotv);
-    f5 = (1.0 - $tau) * (f_i(f_b, cell.x-1, cell.y  ,  1, 0)) + $tau * f_eq(w[5], d,v, 1, 0, dotv);
-    f6 = (1.0 - $tau) * (f_i(f_b, cell.x+1, cell.y+1, -1,-1)) + $tau * f_eq(w[6], d,v,-1,-1, dotv);
-    f7 = (1.0 - $tau) * (f_i(f_b, cell.x  , cell.y+1,  0,-1)) + $tau * f_eq(w[7], d,v, 0,-1, dotv);
-    f8 = (1.0 - $tau) * (f_i(f_b, cell.x-1, cell.y+1,  1,-1)) + $tau * f_eq(w[8], d,v, 1,-1, dotv);
-
-    f_a[0*$nX*$nY + gid] = f0;
-    f_a[1*$nX*$nY + gid] = f1;
-    f_a[2*$nX*$nY + gid] = f2;
-    f_a[3*$nX*$nY + gid] = f3;
-    f_a[4*$nX*$nY + gid] = f4;
-    f_a[5*$nX*$nY + gid] = f5;
-    f_a[6*$nX*$nY + gid] = f6;
-    f_a[7*$nX*$nY + gid] = f7;
-    f_a[8*$nX*$nY + gid] = f8;
+    f0 = (1.0f - $tau) * f0 + $tau * f_eq(w[0], d,v,-1, 1, dotv);
+    f1 = (1.0f - $tau) * f1 + $tau * f_eq(w[1], d,v, 0, 1, dotv);
+    f2 = (1.0f - $tau) * f2 + $tau * f_eq(w[2], d,v, 1, 1, dotv);
+    f3 = (1.0f - $tau) * f3 + $tau * f_eq(w[3], d,v,-1, 0, dotv);
+    f4 = (1.0f - $tau) * f4 + $tau * f_eq(w[4], d,v, 0, 0, dotv);
+    f5 = (1.0f - $tau) * f5 + $tau * f_eq(w[5], d,v, 1, 0, dotv);
+    f6 = (1.0f - $tau) * f6 + $tau * f_eq(w[6], d,v,-1,-1, dotv);
+    f7 = (1.0f - $tau) * f7 + $tau * f_eq(w[7], d,v, 0,-1, dotv);
+    f8 = (1.0f - $tau) * f8 + $tau * f_eq(w[8], d,v, 1,-1, dotv);
+
+    f_a[0*N_CELLS + gid] = f0;
+    f_a[1*N_CELLS + gid] = f1;
+    f_a[2*N_CELLS + gid] = f2;
+    f_a[3*N_CELLS + gid] = f3;
+    f_a[4*N_CELLS + gid] = f4;
+    f_a[5*N_CELLS + gid] = f5;
+    f_a[6*N_CELLS + gid] = f6;
+    f_a[7*N_CELLS + gid] = f7;
+    f_a[8*N_CELLS + gid] = f8;
 
     moments[1*gid] = d;
     moments[2*gid] = v.x;
@@ -177,18 +172,16 @@ class D2Q9_BGK_Lattice:
         self.program = cl.Program(self.context, Template(kernel).substitute({
             'nX' : self.nX,
             'nY' : self.nY,
-            'tau': 0.56
-        })).build()
+            'tau': "0.56f"
+        })).build() #'-cl-single-precision-constant -cl-fast-relaxed-math')
 
     def evolve(self):
         if self.tick:
             self.tick = False
-            self.program.collide_and_stream(self.queue, (self.nX,self.nY), (256,1), self.cl_pop_a, self.cl_pop_b, self.cl_moments, self.cl_material)
-            self.queue.finish()
+            self.program.collide_and_stream(self.queue, (self.nX,self.nY), (64,1), self.cl_pop_a, self.cl_pop_b, self.cl_moments, self.cl_material)
         else:
             self.tick = True
-            self.program.collide_and_stream(self.queue, (self.nX,self.nY), (256,1), self.cl_pop_b, self.cl_pop_a, self.cl_moments, self.cl_material)
-            self.queue.finish()
+            self.program.collide_and_stream(self.queue, (self.nX,self.nY), (64,1), self.cl_pop_b, self.cl_pop_a, self.cl_moments, self.cl_material)
 
     def show(self, i):
         cl.enqueue_copy(LBM.queue, LBM.np_moments, LBM.cl_moments).wait();
@@ -203,18 +196,22 @@ class D2Q9_BGK_Lattice:
 
 
 def MLUPS(cells, steps, time):
-    return ((cells*steps) / time) / 1000000
+    return cells * steps / time * 1e-6
 
 LBM = D2Q9_BGK_Lattice(1024, 1024)
 
 nUpdates = 1000
 
-start = timer()
+start = time.time()
 
 for i in range(0,nUpdates):
     LBM.evolve()
 
-end = timer()
+LBM.queue.finish()
+
+end = time.time()
+
+LBM.show(nUpdates)
 
 runtime = end - start
 
@@ -222,5 +219,3 @@ print("Cells:   " + str(LBM.nCells))
 print("Updates: " + str(nUpdates))
 print("Time:    " + str(runtime))
 print("MLUPS:   " + str(MLUPS(LBM.nCells, nUpdates, end - start)))
-
-LBM.show(nUpdates)
diff --git a/shell.nix b/shell.nix
index c431426..0022724 100644
--- a/shell.nix
+++ b/shell.nix
@@ -23,6 +23,7 @@ pkgs.stdenvNoCC.mkDerivation rec {
 
     local-python = custom-python.withPackages (python-packages: with python-packages; [
       numpy
+      sympy
       pyopencl
       pyopengl
       pygobject3
-- 
cgit v1.2.3