aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdrian Kummerlaender2019-06-08 23:08:28 +0200
committerAdrian Kummerlaender2019-06-08 23:08:28 +0200
commit5ac924371a7e53641a2f726a9f431ab8cb99f9fb (patch)
tree7e62a64ca253c8c18a85c2131b141125eb115b33
parent4a6b0bb928db91d57eaa09b656d296e79eafe7ed (diff)
downloadsymlbm_playground-5ac924371a7e53641a2f726a9f431ab8cb99f9fb.tar
symlbm_playground-5ac924371a7e53641a2f726a9f431ab8cb99f9fb.tar.gz
symlbm_playground-5ac924371a7e53641a2f726a9f431ab8cb99f9fb.tar.bz2
symlbm_playground-5ac924371a7e53641a2f726a9f431ab8cb99f9fb.tar.lz
symlbm_playground-5ac924371a7e53641a2f726a9f431ab8cb99f9fb.tar.xz
symlbm_playground-5ac924371a7e53641a2f726a9f431ab8cb99f9fb.tar.zst
symlbm_playground-5ac924371a7e53641a2f726a9f431ab8cb99f9fb.zip
Performance optimizations
Starting point: ~200 MLUPS on a NVidia K2200 Changes that did not noticeably impact performance: * Memory layout AOS vs. SOA (weird, probably highly platform dependent) * Propagate on read * Tagging pointers as read / write only * Manual code inlining Changes that made things worse: * Bad thread block sizes The actual issue: * Hidden double precision computations => Code now yields ~600 MLUPS
-rw-r--r--implosion.py107
-rw-r--r--shell.nix1
2 files changed, 52 insertions, 56 deletions
diff --git a/implosion.py b/implosion.py
index 1fe0537..491e2ff 100644
--- a/implosion.py
+++ b/implosion.py
@@ -6,7 +6,7 @@ from string import Template
import numpy
import matplotlib.pyplot as plt
-from timeit import default_timer as timer
+import time
kernel = """
float constant w[9] = {
@@ -15,6 +15,8 @@ float constant w[9] = {
1./36 , 1./9., 1./36.
};
+#define N_CELLS $nX*$nY
+
unsigned int indexOfDirection(int i, int j) {
return (i+1) + 3*(1-j);
}
@@ -25,10 +27,10 @@ unsigned int indexOfCell(int x, int y)
}
unsigned int idx(int x, int y, int i, int j) {
- return indexOfDirection(i,j)*$nX*$nY + indexOfCell(x,y);
+ return indexOfDirection(i,j)*N_CELLS + indexOfCell(x,y);
}
-__global float f_i(__global const float* f, int x, int y, int i, int j) {
+__global float f_i(__global __read_only float* f, int x, int y, int i, int j) {
return f[idx(x,y,i,j)];
}
@@ -41,13 +43,13 @@ float sq(float x) {
}
float f_eq(float w, float d, float2 v, int i, int j, float dotv) {
- return w * d * (1 + 3*comp(i,j,v) + 4.5*sq(comp(i,j,v)) - 1.5*dotv);
+ return w * d * (1.f + 3.f*comp(i,j,v) + 4.5f*sq(comp(i,j,v)) - 1.5f*dotv);
}
-__kernel void collide_and_stream(__global float* f_a,
- __global const float* f_b,
- __global float* moments,
- __global const int* material)
+__kernel void collide_and_stream(__global __write_only float* f_a,
+ __global __read_only float* f_b,
+ __global __write_only float* moments,
+ __global __read_only int* material)
{
const unsigned int gid = indexOfCell(get_global_id(0), get_global_id(1));
@@ -59,24 +61,17 @@ __kernel void collide_and_stream(__global float* f_a,
return;
}
- float f0 = f_b[0*$nX*$nY + gid];
- float d = f0;
- float f1 = f_b[1*$nX*$nY + gid];
- d += f1;
- float f2 = f_b[2*$nX*$nY + gid];
- d += f2;
- float f3 = f_b[3*$nX*$nY + gid];
- d += f3;
- float f4 = f_b[4*$nX*$nY + gid];
- d += f4;
- float f5 = f_b[5*$nX*$nY + gid];
- d += f5;
- float f6 = f_b[6*$nX*$nY + gid];
- d += f6;
- float f7 = f_b[7*$nX*$nY + gid];
- d += f7;
- float f8 = f_b[8*$nX*$nY + gid];
- d += f8;
+ float f0 = f_i(f_b, cell.x+1, cell.y-1, -1, 1);
+ float f1 = f_i(f_b, cell.x , cell.y-1, 0, 1);
+ float f2 = f_i(f_b, cell.x-1, cell.y-1, 1, 1);
+ float f3 = f_i(f_b, cell.x+1, cell.y , -1, 0);
+ float f4 = f_i(f_b, cell.x , cell.y , 0, 0);
+ float f5 = f_i(f_b, cell.x-1, cell.y , 1, 0);
+ float f6 = f_i(f_b, cell.x+1, cell.y+1, -1,-1);
+ float f7 = f_i(f_b, cell.x , cell.y+1, 0,-1);
+ float f8 = f_i(f_b, cell.x-1, cell.y+1, 1,-1);
+
+ const float d = f0 + f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8;
const float2 v = (float2)(
(f5 - f3 + f2 - f6 + f8 - f0) / d,
@@ -84,25 +79,25 @@ __kernel void collide_and_stream(__global float* f_a,
);
const float dotv = dot(v,v);
- f0 = (1.0 - $tau) * (f_i(f_b, cell.x+1, cell.y-1, -1, 1)) + $tau * f_eq(w[0], d,v,-1, 1, dotv);
- f1 = (1.0 - $tau) * (f_i(f_b, cell.x , cell.y-1, 0, 1)) + $tau * f_eq(w[1], d,v, 0, 1, dotv);
- f2 = (1.0 - $tau) * (f_i(f_b, cell.x-1, cell.y-1, 1, 1)) + $tau * f_eq(w[2], d,v, 1, 1, dotv);
- f3 = (1.0 - $tau) * (f_i(f_b, cell.x+1, cell.y , -1, 0)) + $tau * f_eq(w[3], d,v,-1, 0, dotv);
- f4 = (1.0 - $tau) * (f_i(f_b, cell.x , cell.y , 0, 0)) + $tau * f_eq(w[4], d,v, 0, 0, dotv);
- f5 = (1.0 - $tau) * (f_i(f_b, cell.x-1, cell.y , 1, 0)) + $tau * f_eq(w[5], d,v, 1, 0, dotv);
- f6 = (1.0 - $tau) * (f_i(f_b, cell.x+1, cell.y+1, -1,-1)) + $tau * f_eq(w[6], d,v,-1,-1, dotv);
- f7 = (1.0 - $tau) * (f_i(f_b, cell.x , cell.y+1, 0,-1)) + $tau * f_eq(w[7], d,v, 0,-1, dotv);
- f8 = (1.0 - $tau) * (f_i(f_b, cell.x-1, cell.y+1, 1,-1)) + $tau * f_eq(w[8], d,v, 1,-1, dotv);
-
- f_a[0*$nX*$nY + gid] = f0;
- f_a[1*$nX*$nY + gid] = f1;
- f_a[2*$nX*$nY + gid] = f2;
- f_a[3*$nX*$nY + gid] = f3;
- f_a[4*$nX*$nY + gid] = f4;
- f_a[5*$nX*$nY + gid] = f5;
- f_a[6*$nX*$nY + gid] = f6;
- f_a[7*$nX*$nY + gid] = f7;
- f_a[8*$nX*$nY + gid] = f8;
+ f0 = (1.0f - $tau) * f0 + $tau * f_eq(w[0], d,v,-1, 1, dotv);
+ f1 = (1.0f - $tau) * f1 + $tau * f_eq(w[1], d,v, 0, 1, dotv);
+ f2 = (1.0f - $tau) * f2 + $tau * f_eq(w[2], d,v, 1, 1, dotv);
+ f3 = (1.0f - $tau) * f3 + $tau * f_eq(w[3], d,v,-1, 0, dotv);
+ f4 = (1.0f - $tau) * f4 + $tau * f_eq(w[4], d,v, 0, 0, dotv);
+ f5 = (1.0f - $tau) * f5 + $tau * f_eq(w[5], d,v, 1, 0, dotv);
+ f6 = (1.0f - $tau) * f6 + $tau * f_eq(w[6], d,v,-1,-1, dotv);
+ f7 = (1.0f - $tau) * f7 + $tau * f_eq(w[7], d,v, 0,-1, dotv);
+ f8 = (1.0f - $tau) * f8 + $tau * f_eq(w[8], d,v, 1,-1, dotv);
+
+ f_a[0*N_CELLS + gid] = f0;
+ f_a[1*N_CELLS + gid] = f1;
+ f_a[2*N_CELLS + gid] = f2;
+ f_a[3*N_CELLS + gid] = f3;
+ f_a[4*N_CELLS + gid] = f4;
+ f_a[5*N_CELLS + gid] = f5;
+ f_a[6*N_CELLS + gid] = f6;
+ f_a[7*N_CELLS + gid] = f7;
+ f_a[8*N_CELLS + gid] = f8;
moments[1*gid] = d;
moments[2*gid] = v.x;
@@ -177,18 +172,16 @@ class D2Q9_BGK_Lattice:
self.program = cl.Program(self.context, Template(kernel).substitute({
'nX' : self.nX,
'nY' : self.nY,
- 'tau': 0.56
- })).build()
+ 'tau': "0.56f"
+ })).build() #'-cl-single-precision-constant -cl-fast-relaxed-math')
def evolve(self):
if self.tick:
self.tick = False
- self.program.collide_and_stream(self.queue, (self.nX,self.nY), (256,1), self.cl_pop_a, self.cl_pop_b, self.cl_moments, self.cl_material)
- self.queue.finish()
+ self.program.collide_and_stream(self.queue, (self.nX,self.nY), (64,1), self.cl_pop_a, self.cl_pop_b, self.cl_moments, self.cl_material)
else:
self.tick = True
- self.program.collide_and_stream(self.queue, (self.nX,self.nY), (256,1), self.cl_pop_b, self.cl_pop_a, self.cl_moments, self.cl_material)
- self.queue.finish()
+ self.program.collide_and_stream(self.queue, (self.nX,self.nY), (64,1), self.cl_pop_b, self.cl_pop_a, self.cl_moments, self.cl_material)
def show(self, i):
cl.enqueue_copy(LBM.queue, LBM.np_moments, LBM.cl_moments).wait();
@@ -203,18 +196,22 @@ class D2Q9_BGK_Lattice:
def MLUPS(cells, steps, time):
- return ((cells*steps) / time) / 1000000
+ return cells * steps / time * 1e-6
LBM = D2Q9_BGK_Lattice(1024, 1024)
nUpdates = 1000
-start = timer()
+start = time.time()
for i in range(0,nUpdates):
LBM.evolve()
-end = timer()
+LBM.queue.finish()
+
+end = time.time()
+
+LBM.show(nUpdates)
runtime = end - start
@@ -222,5 +219,3 @@ print("Cells: " + str(LBM.nCells))
print("Updates: " + str(nUpdates))
print("Time: " + str(runtime))
print("MLUPS: " + str(MLUPS(LBM.nCells, nUpdates, end - start)))
-
-LBM.show(nUpdates)
diff --git a/shell.nix b/shell.nix
index c431426..0022724 100644
--- a/shell.nix
+++ b/shell.nix
@@ -23,6 +23,7 @@ pkgs.stdenvNoCC.mkDerivation rec {
local-python = custom-python.withPackages (python-packages: with python-packages; [
numpy
+ sympy
pyopencl
pyopengl
pygobject3