diff options
author | Adrian Kummerlaender | 2019-11-10 21:14:07 +0100 |
---|---|---|
committer | Adrian Kummerlaender | 2019-11-10 21:18:57 +0100 |
commit | 4a2885ad3ae0396486d288df94339d0c45e6db8b (patch) | |
tree | 1a0b5aa000bbcde65fa020381a02b19bb452e284 | |
parent | d136bb30bc8a9393372ec905aea500a0b61000e3 (diff) | |
download | boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar.gz boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar.bz2 boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar.lz boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar.xz boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar.zst boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.zip |
Implement basic CUDA target
Currently only for the SSS streaming pattern.
CudaCodePrinter in `utility/printer.py` is required to add a 'f' suffix
to all single precision floating point literals. If this is not done
(when targeting single precision) most calculations happen in double
precision which destroys performance. (In OpenCL this is not necessary
as we can simply set the `-cl-single-precision-constant` flag. Sadly
such a flag doesn't seem to exist for nvcc.)
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | boltzgen/kernel/memory/precision.py | 3 | ||||
-rw-r--r-- | boltzgen/kernel/template/bounce_back_boundary.cuda.mako | 16 | ||||
-rw-r--r-- | boltzgen/kernel/template/collect_moments.cuda.mako | 20 | ||||
-rw-r--r-- | boltzgen/kernel/template/collide_and_stream.cuda.mako | 17 | ||||
-rw-r--r-- | boltzgen/kernel/template/equilibrilize.cuda.mako | 11 | ||||
-rw-r--r-- | boltzgen/kernel/template/momenta_boundary.cuda.mako | 39 | ||||
-rw-r--r-- | boltzgen/kernel/template/pattern/SSS.cuda.mako | 81 | ||||
-rw-r--r-- | boltzgen/kernel/template/update_sss_control_structure.cpp.mako | 5 | ||||
-rw-r--r-- | boltzgen/kernel/template/update_sss_control_structure.cuda.mako | 21 | ||||
-rw-r--r-- | boltzgen/utility/__init__.py | 1 | ||||
-rw-r--r-- | boltzgen/utility/printer.py | 13 |
12 files changed, 224 insertions, 5 deletions
@@ -15,7 +15,7 @@ At the moment this is a more structured and cleaned up version of the OpenCL ker * configurable cell indexing sequence * static resolution of memory offsets * AB, AA and SSS streaming patterns -* C++ and OpenCL targets +* C++, OpenCL and CUDA targets * simple CLI frontend ## Usage diff --git a/boltzgen/kernel/memory/precision.py b/boltzgen/kernel/memory/precision.py index d22e922..0c52fc0 100644 --- a/boltzgen/kernel/memory/precision.py +++ b/boltzgen/kernel/memory/precision.py @@ -14,3 +14,6 @@ class cpp(common): class cl(common): pass + +class cuda(common): + pass diff --git a/boltzgen/kernel/template/bounce_back_boundary.cuda.mako b/boltzgen/kernel/template/bounce_back_boundary.cuda.mako new file mode 100644 index 0000000..09499e2 --- /dev/null +++ b/boltzgen/kernel/template/bounce_back_boundary.cuda.mako @@ -0,0 +1,16 @@ +<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/> +<% +from boltzgen.utility.printer import CudaCodePrinter +ccode = CudaCodePrinter(float_type).doprint +subexpr, assignment = model.collision(f_eq = model.equilibrium(resolve_moments = True)) +%> + +<%call expr="pattern.operator('bounce_back_boundary')"> +% for i, expr in enumerate(subexpr): + const ${float_type} ${expr[0]} = ${ccode(expr[1])}; +% endfor + +% for i, c_i in enumerate(descriptor.c): + const ${float_type} ${assignment[i].lhs} = ${ccode(assignment[descriptor.c.index(-c_i)].rhs)}; +% endfor +</%call> diff --git a/boltzgen/kernel/template/collect_moments.cuda.mako b/boltzgen/kernel/template/collect_moments.cuda.mako new file mode 100644 index 0000000..36f03b0 --- /dev/null +++ b/boltzgen/kernel/template/collect_moments.cuda.mako @@ -0,0 +1,20 @@ +<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/> +<% +from boltzgen.utility.printer import CudaCodePrinter +ccode = CudaCodePrinter(float_type).doprint +moments_subexpr, moments_assignment = model.moments() +%> + +<%call expr="pattern.functor('collect_moments', [('%s*' % float_type, 'rho'), ('%s*' % float_type, 'u')])"> +% for i, expr in enumerate(moments_subexpr): + const ${float_type} ${expr[0]} = ${ccode(expr[1])}; +% endfor + +% for i, expr in enumerate(moments_assignment): +% if i == 0: + rho[gid] = ${ccode(expr.rhs)}; +% else: + u[gid*${descriptor.d} + ${i-1}] = ${ccode(expr.rhs)}; +% endif +% endfor +</%call> diff --git a/boltzgen/kernel/template/collide_and_stream.cuda.mako b/boltzgen/kernel/template/collide_and_stream.cuda.mako new file mode 100644 index 0000000..b46cf0f --- /dev/null +++ b/boltzgen/kernel/template/collide_and_stream.cuda.mako @@ -0,0 +1,17 @@ +<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/> +<% +from boltzgen.utility.printer import CudaCodePrinter +ccode = CudaCodePrinter(float_type).doprint +subexpr, assignment = model.collision(f_eq = model.equilibrium(resolve_moments = True)) + +%> + +<%call expr="pattern.operator('collide_and_stream')"> +% for i, expr in enumerate(subexpr): + const ${float_type} ${expr[0]} = ${ccode(expr[1])}; +% endfor + +% for i, expr in enumerate(assignment): + const ${float_type} ${ccode(expr)} +% endfor +</%call> diff --git a/boltzgen/kernel/template/equilibrilize.cuda.mako b/boltzgen/kernel/template/equilibrilize.cuda.mako new file mode 100644 index 0000000..2f4cdee --- /dev/null +++ b/boltzgen/kernel/template/equilibrilize.cuda.mako @@ -0,0 +1,11 @@ +<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/> +<% +from boltzgen.utility.printer import CudaCodePrinter +ccode = CudaCodePrinter(float_type).doprint +%> + +<%call expr="pattern.operator('equilibrilize')"> +% for i, w_i in enumerate(descriptor.w): + const ${float_type} f_next_${i} = ${ccode(w_i.evalf())}; +% endfor +</%call> diff --git a/boltzgen/kernel/template/momenta_boundary.cuda.mako b/boltzgen/kernel/template/momenta_boundary.cuda.mako new file mode 100644 index 0000000..408a12d --- /dev/null +++ b/boltzgen/kernel/template/momenta_boundary.cuda.mako @@ -0,0 +1,39 @@ +<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/> +<% +from boltzgen.utility.printer import CudaCodePrinter +ccode = CudaCodePrinter(float_type).doprint +moments_subexpr, moments_assignment = model.moments() +collision_subexpr, collision_assignment = model.collision(f_eq = model.equilibrium(resolve_moments = False)) +%> + +<%def name="momenta_boundary(name, params)"> +<%call expr="pattern.operator('%s_momenta_boundary' % name, params)"> +% for i, expr in enumerate(moments_subexpr): + const ${float_type} ${expr[0]} = ${ccode(expr[1])}; +% endfor + + ${caller.body()} + +% for i, expr in enumerate(collision_subexpr): + const ${float_type} ${expr[0]} = ${ccode(expr[1])}; +% endfor + +% for i, expr in enumerate(collision_assignment): + const ${float_type} ${ccode(expr)} +% endfor +</%call> +</%def> + +<%call expr="momenta_boundary('velocity', list(map(lambda i: (float_type, 'velocity_%d' % i), range(descriptor.d))))"> + const ${float_type} ${ccode(moments_assignment[0])} +% for i, expr in enumerate(moments_assignment[1:]): + const ${float_type} ${expr.lhs} = velocity_${i}; +% endfor +</%call> + +<%call expr="momenta_boundary('density', [(float_type, 'density')])"> + const ${float_type} ${moments_assignment[0].lhs} = density; +% for i, expr in enumerate(moments_assignment[1:]): + const ${float_type} ${ccode(expr)} +% endfor +</%call> diff --git a/boltzgen/kernel/template/pattern/SSS.cuda.mako b/boltzgen/kernel/template/pattern/SSS.cuda.mako new file mode 100644 index 0000000..8f571c7 --- /dev/null +++ b/boltzgen/kernel/template/pattern/SSS.cuda.mako @@ -0,0 +1,81 @@ +<%def name="operator(name, params = None)"> +<% +if layout.__class__.__name__ != 'SOA': + raise Exception('SSS pattern only works for the AOS memory layout') +%> +__global__ void ${name}( + ${float_type}** f +% if 'cell_list_dispatch' in extras: + , std::size_t* cells + , std::size_t cell_count +% else: + , std::size_t gid +% endif +% if params is not None: +% for param_type, param_name in params: + , ${param_type} ${param_name} +% endfor +% endif +) { +% if 'cell_list_dispatch' in extras: + const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (!(index < cell_count)) { + return; + } + const std::size_t gid = cells[index]; +% endif + +% for i, c_i in enumerate(descriptor.c): + ${float_type}* preshifted_f_${i} = f[${i}] + ${layout.cell_preshift('gid')}; +% endfor + +% for i, c_i in enumerate(descriptor.c): + const ${float_type} f_curr_${i} = *preshifted_f_${i}; +% endfor + + ${caller.body()} + +% for i, c_i in enumerate(descriptor.c): + *preshifted_f_${i} = f_next_${descriptor.c.index(-c_i)}; +% endfor +} +</%def> + +<%def name="functor(name, params = None)"> +<% +if layout.__class__.__name__ != 'SOA': + raise Exception('SSS pattern only works for the AOS memory layout') +%> +__global__ void ${name}( + ${float_type}** f +% if 'cell_list_dispatch' in extras: + , std::size_t* cells + , std::size_t cell_count +% else: + , std::size_t gid +% endif +% if params is not None: +% for param_type, param_name in params: + , ${param_type} ${param_name} +% endfor +% endif +) { +% if 'cell_list_dispatch' in extras: + const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (!(index < cell_count)) { + return; + } + const std::size_t gid = cells[index]; +% endif + +% for i, c_i in enumerate(descriptor.c): + const ${float_type}* preshifted_f_${i} = f[${i}] + ${layout.cell_preshift('gid')}; +% endfor + +% for i, c_i in enumerate(descriptor.c): + const ${float_type} f_curr_${i} = *preshifted_f_${descriptor.c.index(-c_i)}; +% endfor + + ${caller.body()} +} +</%def> diff --git a/boltzgen/kernel/template/update_sss_control_structure.cpp.mako b/boltzgen/kernel/template/update_sss_control_structure.cpp.mako index 9197022..b7661a3 100644 --- a/boltzgen/kernel/template/update_sss_control_structure.cpp.mako +++ b/boltzgen/kernel/template/update_sss_control_structure.cpp.mako @@ -7,9 +7,6 @@ void update_sss_control_structure(${float_type}** f) { ${float_type}* f_old_${i} = f[${i}]; % endfor % for i, c_i in enumerate(descriptor.c): - f[${i}] = f_old_${descriptor.c.index(-c_i)}; -% endfor -% for i, c_i in enumerate(descriptor.c): - f[${i}] += ${layout.neighbor_offset(-c_i)}; + f[${i}] = f_old_${descriptor.c.index(-c_i)} + ${layout.neighbor_offset(-c_i)}; % endfor } diff --git a/boltzgen/kernel/template/update_sss_control_structure.cuda.mako b/boltzgen/kernel/template/update_sss_control_structure.cuda.mako new file mode 100644 index 0000000..2a98b94 --- /dev/null +++ b/boltzgen/kernel/template/update_sss_control_structure.cuda.mako @@ -0,0 +1,21 @@ +<% +if streaming != 'SSS': + raise Exception('"update_sss_control_structure" function only makes sense for the SSS pattern') + +padding = (max(geometry.size_x,geometry.size_y,geometry.size_z)+1)**(descriptor.d-1) +%> + +__global__ void init_sss_control_structure(${float_type}* f, ${float_type}** control) { +% for i, c_i in enumerate(descriptor.c): + control[${i}] = f + ${padding + layout.pop_offset(i, 2*padding)}; +% endfor +} + +__global__ void update_sss_control_structure(${float_type}** f) { +% for i, c_i in enumerate(descriptor.c): + ${float_type}* f_old_${i} = f[${i}]; +% endfor +% for i, c_i in enumerate(descriptor.c): + f[${i}] = f_old_${descriptor.c.index(-c_i)} + ${layout.neighbor_offset(-c_i)}; +% endfor +} diff --git a/boltzgen/utility/__init__.py b/boltzgen/utility/__init__.py index fa9c760..5905c36 100644 --- a/boltzgen/utility/__init__.py +++ b/boltzgen/utility/__init__.py @@ -1,5 +1,6 @@ from . import optimizations from . import ndindex +from . import printer from sympy.codegen.ast import Assignment diff --git a/boltzgen/utility/printer.py b/boltzgen/utility/printer.py new file mode 100644 index 0000000..b52a5e0 --- /dev/null +++ b/boltzgen/utility/printer.py @@ -0,0 +1,13 @@ +from sympy.printing.ccode import C99CodePrinter +from sympy.codegen.ast import float32, float64 + +class CudaCodePrinter(C99CodePrinter): + pass + + def __init__(self, float_type, **args): + super(CudaCodePrinter, self).__init__(**args) + if float_type == 'float': + self.type_func_suffixes[float32] = 'f' + self.type_func_suffixes[float64] = 'f' + self.type_literal_suffixes[float32] = 'f' + self.type_literal_suffixes[float64] = 'f' |