From f0e2a04dbe47f40df537456acb1ead4bd5dd72df Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Thu, 30 May 2019 15:14:22 +0200 Subject: Collapse SOA into single array Weirdly the expected performance gains due to better coalescence of memory access is not achieved. --- implosion.py | 275 +++++++++++++++++------------------------------------------ 1 file changed, 78 insertions(+), 197 deletions(-) (limited to 'implosion.py') diff --git a/implosion.py b/implosion.py index d39a2ff..75b2fee 100644 --- a/implosion.py +++ b/implosion.py @@ -16,19 +16,28 @@ float constant w[9] = { 1./36 , 1./9., 1./36. }; -uint2 cellAtGid(unsigned int gid) -{ - const int y = gid / $nX; - return (uint2)(gid - $nX*y, y); +unsigned int indexOfDirection(int i, int j) { + return (i+1) + 3*(1-j); } -unsigned int gidOfCell(int x, int y) +unsigned int indexOfCell(int x, int y) { return y * $nX + x; } -unsigned int indexOfDirection(int i, int j) { - return 3*(i+1) + (j+1); +unsigned int idx(int x, int y, int i, int j) { + return indexOfDirection(i,j)*$nX*$nY + indexOfCell(x,y); +} + +uint2 cellAtIndex(unsigned int gid) +{ + const int y = gid / $nX; + return (uint2)(gid - $nX*y, y); +} + + +__global float* f_i(__global float* f, int x, int y, int i, int j) { + return f + idx(x,y,i,j); } float comp(int i, int j, float2 v) { @@ -39,37 +48,37 @@ float sq(float x) { return x*x; } -float equilibrium(float d, float2 v, int i, int j) { - return w[indexOfDirection(i,j)] * d * (1 + 3*comp(i,j,v) + 4.5*sq(comp(i,j,v)) - 1.5*dot(v,v)); +float density(__global const float* f, unsigned int gid) { + return f[0*$nX*$nY + gid] + + f[1*$nX*$nY + gid] + + f[2*$nX*$nY + gid] + + f[3*$nX*$nY + gid] + + f[4*$nX*$nY + gid] + + f[5*$nX*$nY + gid] + + f[6*$nX*$nY + gid] + + f[7*$nX*$nY + gid] + + f[8*$nX*$nY + gid]; +} + +float2 velocity(__global const float* f, float d, unsigned int gid) +{ + return (float2)( + (f[5*$nX*$nY+gid] - f[3*$nX*$nY+gid] + f[2*$nX*$nY+gid] - f[6*$nX*$nY+gid] + f[8*$nX*$nY+gid] - f[0*$nX*$nY+gid]) / d, + (f[1*$nX*$nY+gid] - f[7*$nX*$nY+gid] + f[2*$nX*$nY+gid] - f[6*$nX*$nY+gid] - f[8*$nX*$nY+gid] + f[0*$nX*$nY+gid]) / d + ); } -float bgk(__global const float* pop, uint ngid, int i, int j, float d, float2 v) { - return pop[ngid] + $tau * (equilibrium(d,v,i,j) - pop[ngid]); +float f_eq(float d, float2 v, int i, int j) { + return w[indexOfDirection(i,j)] * d * (1 + 3*comp(i,j,v) + 4.5*sq(comp(i,j,v)) - 1.5*dot(v,v)); } -__kernel void collide_and_stream(__global float* pop_a_0, - __global float* pop_a_1, - __global float* pop_a_2, - __global float* pop_a_3, - __global float* pop_a_4, - __global float* pop_a_5, - __global float* pop_a_6, - __global float* pop_a_7, - __global float* pop_a_8, - __global const float* pop_b_0, - __global const float* pop_b_1, - __global const float* pop_b_2, - __global const float* pop_b_3, - __global const float* pop_b_4, - __global const float* pop_b_5, - __global const float* pop_b_6, - __global const float* pop_b_7, - __global const float* pop_b_8, - __global float* moments, +__kernel void collide_and_stream(__global float* f_a, + __global const float* f_b, + __global float* moments, __global const int* material) { const unsigned int gid = get_global_id(0); - const uint2 cell = cellAtGid(gid); + const uint2 cell = cellAtIndex(gid); const int m = material[gid]; @@ -77,42 +86,19 @@ __kernel void collide_and_stream(__global float* pop_a_0, return; } - const float d = pop_b_0[gid] + pop_b_1[gid] + pop_b_2[gid] + pop_b_3[gid] + pop_b_4[gid] + pop_b_5[gid] + pop_b_6[gid] + pop_b_7[gid] + pop_b_8[gid]; - - const float2 v = (float2)( - (pop_b_5[gid] - pop_b_3[gid] + pop_b_2[gid] - pop_b_6[gid] + pop_b_8[gid] - pop_b_0[gid]) / d, - (pop_b_1[gid] - pop_b_7[gid] + pop_b_2[gid] - pop_b_6[gid] - pop_b_8[gid] + pop_b_0[gid]) / d - ); + const float d = density(f_b, gid); + const float2 v = velocity(f_b, d, gid); - if ( m == 1 ) { - pop_a_0[gid] = bgk(pop_b_0, gidOfCell(cell.x+1, cell.y-1), -1, 1, d, v); - pop_a_1[gid] = bgk(pop_b_1, gidOfCell(cell.x , cell.y-1), 0, 1, d, v); - pop_a_2[gid] = bgk(pop_b_2, gidOfCell(cell.x-1, cell.y-1), 1, 1, d, v); - - pop_a_3[gid] = bgk(pop_b_3, gidOfCell(cell.x+1, cell.y ), -1, 0, d, v); - pop_a_4[gid] = bgk(pop_b_4, gidOfCell(cell.x , cell.y ), 0, 0, d, v); - pop_a_5[gid] = bgk(pop_b_5, gidOfCell(cell.x-1, cell.y ), 1, 0, d, v); - - pop_a_6[gid] = bgk(pop_b_6, gidOfCell(cell.x+1, cell.y+1), -1,-1, d, v); - pop_a_7[gid] = bgk(pop_b_7, gidOfCell(cell.x , cell.y+1), 0,-1, d, v); - pop_a_8[gid] = bgk(pop_b_8, gidOfCell(cell.x-1, cell.y+1), 1,-1, d, v); - } else { - pop_a_8[gid] = bgk(pop_b_0, gidOfCell(cell.x+1, cell.y-1), -1, 1, d, v); - pop_a_7[gid] = bgk(pop_b_1, gidOfCell(cell.x , cell.y-1), 0, 1, d, v); - pop_a_6[gid] = bgk(pop_b_2, gidOfCell(cell.x-1, cell.y-1), 1, 1, d, v); - - pop_a_5[gid] = bgk(pop_b_3, gidOfCell(cell.x+1, cell.y ), -1, 0, d, v); - pop_a_4[gid] = bgk(pop_b_4, gidOfCell(cell.x , cell.y ), 0, 0, d, v); - pop_a_3[gid] = bgk(pop_b_5, gidOfCell(cell.x-1, cell.y ), 1, 0, d, v); - - pop_a_2[gid] = bgk(pop_b_6, gidOfCell(cell.x+1, cell.y+1), -1,-1, d, v); - pop_a_1[gid] = bgk(pop_b_7, gidOfCell(cell.x , cell.y+1), 0,-1, d, v); - pop_a_0[gid] = bgk(pop_b_8, gidOfCell(cell.x-1, cell.y+1), 1,-1, d, v); + for ( int i = -1; i <= 1; ++i ) { + for ( int j = 1; j >= -1; --j ) { + *f_i(f_a, cell.x, cell.y, m*i, m*j) = *f_i(f_b, cell.x-i, cell.y-j, i, j) + + $tau * (f_eq(d,v,i,j) - *f_i(f_b, cell.x-i, cell.y-j, i, j)); + } } - moments[gid*3+0] = d; - moments[gid*3+1] = v.x; - moments[gid*3+2] = v.y; + moments[gid] = d; + moments[2*gid] = v.x; + moments[3*gid] = v.y; }""" class D2Q9_BGK_Lattice: @@ -129,27 +115,10 @@ class D2Q9_BGK_Lattice: self.context = cl.Context(properties=[(cl.context_properties.PLATFORM, self.platform)]) self.queue = cl.CommandQueue(self.context) - self.np_pop_a_0 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_a_1 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_a_2 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_a_3 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_a_4 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_a_5 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_a_6 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_a_7 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_a_8 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - - self.np_pop_b_0 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_b_1 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_b_2 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_b_3 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_b_4 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_b_5 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_b_6 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_b_7 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - self.np_pop_b_8 = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.float32) - - self.np_moments = numpy.ndarray(shape=(self.nCells, 3), dtype=numpy.float32) + self.np_pop_a = numpy.ndarray(shape=(9, self.nCells), dtype=numpy.float32) + self.np_pop_b = numpy.ndarray(shape=(9, self.nCells), dtype=numpy.float32) + + self.np_moments = numpy.ndarray(shape=(3, self.nCells), dtype=numpy.float32) self.np_material = numpy.ndarray(shape=(self.nCells, 1), dtype=numpy.int32) self.setup_geometry() @@ -157,28 +126,11 @@ class D2Q9_BGK_Lattice: self.equilibrilize() self.setup_anomaly() - self.cl_pop_a_0 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a_0) - self.cl_pop_a_1 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a_1) - self.cl_pop_a_2 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a_2) - self.cl_pop_a_3 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a_3) - self.cl_pop_a_4 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a_4) - self.cl_pop_a_5 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a_5) - self.cl_pop_a_6 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a_6) - self.cl_pop_a_7 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a_7) - self.cl_pop_a_8 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a_8) - - self.cl_pop_b_0 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b_0) - self.cl_pop_b_1 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b_1) - self.cl_pop_b_2 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b_2) - self.cl_pop_b_3 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b_3) - self.cl_pop_b_4 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b_4) - self.cl_pop_b_5 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b_5) - self.cl_pop_b_6 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b_6) - self.cl_pop_b_7 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b_7) - self.cl_pop_b_8 = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b_8) - - self.cl_moments = cl.Buffer(self.context, mf.WRITE_ONLY | mf.USE_HOST_PTR, hostbuf=self.np_moments) - self.cl_material = cl.Buffer(self.context, mf.READ_ONLY | mf.USE_HOST_PTR, hostbuf=self.np_material) + self.cl_pop_a = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_a) + self.cl_pop_b = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_pop_b) + + self.cl_material = cl.Buffer(self.context, mf.READ_ONLY | mf.USE_HOST_PTR, hostbuf=self.np_material) + self.cl_moments = cl.Buffer(self.context, mf.READ_WRITE | mf.USE_HOST_PTR, hostbuf=self.np_moments) self.build_kernel() @@ -192,26 +144,13 @@ class D2Q9_BGK_Lattice: self.np_material[self.idx(x,y)] = 1 def equilibrilize(self): - self.np_pop_a_0[:] = 1./36. - self.np_pop_a_1[:] = 1./9. - self.np_pop_a_2[:] = 1./36. - self.np_pop_a_3[:] = 1./9. - self.np_pop_a_4[:] = 4./9. - self.np_pop_a_5[:] = 1./9. - self.np_pop_a_6[:] = 1./36 - self.np_pop_a_7[:] = 1./9. - self.np_pop_a_8[:] = 1./36. - - self.np_pop_b_0[:] = 1./36. - self.np_pop_b_1[:] = 1./9. - self.np_pop_b_2[:] = 1./36. - self.np_pop_b_3[:] = 1./9. - self.np_pop_b_4[:] = 4./9. - self.np_pop_b_5[:] = 1./9. - self.np_pop_b_6[:] = 1./36 - self.np_pop_b_7[:] = 1./9. - self.np_pop_b_8[:] = 1./36. + self.np_pop_a[(0,2,6,8),:] = 1./36. + self.np_pop_a[(1,3,5,7),:] = 1./9. + self.np_pop_a[4,:] = 4./9. + self.np_pop_b[(0,2,6,8),:] = 1./36. + self.np_pop_b[(1,3,5,7),:] = 1./9. + self.np_pop_b[4,:] = 4./9. def setup_anomaly(self): bubbles = [ [ self.nX//4, self.nY//4], @@ -223,25 +162,8 @@ class D2Q9_BGK_Lattice: for y in range(0,self.nY-1): for [a,b] in bubbles: if numpy.sqrt((x-a)*(x-a)+(y-b)*(y-b)) < self.nX//10: - self.np_pop_a_0[self.idx(x,y)] = 1./24. - self.np_pop_a_1[self.idx(x,y)] = 1./24. - self.np_pop_a_2[self.idx(x,y)] = 1./24. - self.np_pop_a_3[self.idx(x,y)] = 1./24. - self.np_pop_a_4[self.idx(x,y)] = 1./24. - self.np_pop_a_5[self.idx(x,y)] = 1./24. - self.np_pop_a_6[self.idx(x,y)] = 1./24. - self.np_pop_a_7[self.idx(x,y)] = 1./24. - self.np_pop_a_8[self.idx(x,y)] = 1./24. - - self.np_pop_b_0[self.idx(x,y)] = 1./24. - self.np_pop_b_1[self.idx(x,y)] = 1./24. - self.np_pop_b_2[self.idx(x,y)] = 1./24. - self.np_pop_b_3[self.idx(x,y)] = 1./24. - self.np_pop_b_4[self.idx(x,y)] = 1./24. - self.np_pop_b_5[self.idx(x,y)] = 1./24. - self.np_pop_b_6[self.idx(x,y)] = 1./24. - self.np_pop_b_7[self.idx(x,y)] = 1./24. - self.np_pop_b_8[self.idx(x,y)] = 1./24. + self.np_pop_a[:,self.idx(x,y)] = 1./24. + self.np_pop_b[:,self.idx(x,y)] = 1./24. def build_kernel(self): self.program = cl.Program(self.context, Template(kernel).substitute({ @@ -253,76 +175,35 @@ class D2Q9_BGK_Lattice: def evolve(self): if self.tick: self.tick = False - self.program.collide_and_stream(self.queue, (self.nCells,), None, - self.cl_pop_a_0, - self.cl_pop_a_1, - self.cl_pop_a_2, - self.cl_pop_a_3, - self.cl_pop_a_4, - self.cl_pop_a_5, - self.cl_pop_a_6, - self.cl_pop_a_7, - self.cl_pop_a_8, - self.cl_pop_b_0, - self.cl_pop_b_1, - self.cl_pop_b_2, - self.cl_pop_b_3, - self.cl_pop_b_4, - self.cl_pop_b_5, - self.cl_pop_b_6, - self.cl_pop_b_7, - self.cl_pop_b_8, - self.cl_moments, - self.cl_material) + self.program.collide_and_stream(self.queue, (self.nCells,), None, self.cl_pop_a, self.cl_pop_b, self.cl_moments, self.cl_material) self.queue.finish() else: self.tick = True - self.program.collide_and_stream(self.queue, (self.nCells,), None, - self.cl_pop_b_0, - self.cl_pop_b_1, - self.cl_pop_b_2, - self.cl_pop_b_3, - self.cl_pop_b_4, - self.cl_pop_b_5, - self.cl_pop_b_6, - self.cl_pop_b_7, - self.cl_pop_b_8, - self.cl_pop_a_0, - self.cl_pop_a_1, - self.cl_pop_a_2, - self.cl_pop_a_3, - self.cl_pop_a_4, - self.cl_pop_a_5, - self.cl_pop_a_6, - self.cl_pop_a_7, - self.cl_pop_a_8, - self.cl_moments, - self.cl_material) + self.program.collide_and_stream(self.queue, (self.nCells,), None, self.cl_pop_b, self.cl_pop_a, self.cl_moments, self.cl_material) self.queue.finish() def show(self, i): - cl.enqueue_copy(self.queue, self.np_moments, self.cl_moments).wait(); - - density = numpy.ndarray(shape=(self.nX, self.nY)) + cl.enqueue_copy(LBM.queue, LBM.np_moments, LBM.cl_moments).wait(); - for y in range(0,self.nY-1): - for x in range(0,self.nX-1): - density[x,y] = self.np_moments[self.idx(x,y),0] + density = numpy.ndarray(shape=(self.nX-2, self.nY-2)) + for y in range(1,self.nY-1): + for x in range(1,self.nX-1): + density[x-1,y-1] = self.np_moments[0,self.idx(x,y)] - plt.imshow(density, vmin=0.2, vmax=2, cmap=plt.get_cmap("seismic")) + plt.imshow(density, vmin=0.2, vmax=2.0, cmap=plt.get_cmap("seismic")) plt.savefig("result/density_" + str(i) + ".png") def MLUPS(cells, steps, time): return ((cells*steps) / time) / 1000000 -LBM = D2Q9_BGK_Lattice(1000, 1000) +LBM = D2Q9_BGK_Lattice(2000, 2000) -nUpdates = 10000 +nUpdates = 100 start = timer() -for i in range(1,nUpdates): +for i in range(0,nUpdates): LBM.evolve() end = timer() -- cgit v1.2.3