From b3acd514c5d629781e816b847aff9891015fa7bd Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Sat, 29 Jun 2019 23:45:47 +0200 Subject: Implement layout and memory padding There are at least two distinct areas where padding can be beneficial on a GPU: 1. Padding the global thread sizes to support specific thread layouts e.g. (32,1) layouts require the global lattice width to be a multiple of 32 2. Padding the memory layout at the lowest level to align memory accesses i.e. some GPUs read memory in 128 Byte chunks and as such it is beneficial if the operations are aligned accordingly For lattice and thread layout sizes that are powers of two these two padding areas are equivalent. However, when one operates on e.g. a (300,300) lattice using a (30,1) layout, padding to 128 bytes yields a performance improvement of about 10 MLUPS on a K2200. Note that I am getting quite dissatisfied with how the Lattice class and its surroundings continue to accumulate parameters. The naming distinction between Geometry, Grid, Memory and Lattice is also not very intuitive. 
--- template/kernel.mako | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'template') diff --git a/template/kernel.mako b/template/kernel.mako index 41edcbf..ceb7a7a 100644 --- a/template/kernel.mako +++ b/template/kernel.mako @@ -1,12 +1,12 @@ <% def gid(): return { - 2: 'get_global_id(1)*%d + get_global_id(0)' % geometry.size_x, - 3: 'get_global_id(2)*%d + get_global_id(1)*%d + get_global_id(0)' % (geometry.size_x*geometry.size_y, geometry.size_x) + 2: 'get_global_id(1)*%d + get_global_id(0)' % memory.size_x, + 3: 'get_global_id(2)*%d + get_global_id(1)*%d + get_global_id(0)' % (memory.size_x*memory.size_y, memory.size_x) }.get(descriptor.d) def pop_offset(i): - return i * geometry.volume + return i * memory.volume %> __kernel void equilibrilize(__global __write_only ${float_type}* f_next, @@ -30,8 +30,8 @@ __kernel void equilibrilize(__global __write_only ${float_type}* f_next, <% def neighbor_offset(c_i): return { - 2: lambda: c_i[1]*geometry.size_x + c_i[0], - 3: lambda: c_i[2]*geometry.size_x*geometry.size_y + c_i[1]*geometry.size_x + c_i[0] + 2: lambda: c_i[1]*memory.size_x + c_i[0], + 3: lambda: c_i[2]*memory.size_x*memory.size_y + c_i[1]*memory.size_x + c_i[0] }.get(descriptor.d)() %> -- cgit v1.2.3