aboutsummaryrefslogtreecommitdiff
path: root/boltzgen/kernel/template
diff options
context:
space:
mode:
authorAdrian Kummerlaender2019-11-10 21:14:07 +0100
committerAdrian Kummerlaender2019-11-10 21:18:57 +0100
commit4a2885ad3ae0396486d288df94339d0c45e6db8b (patch)
tree1a0b5aa000bbcde65fa020381a02b19bb452e284 /boltzgen/kernel/template
parentd136bb30bc8a9393372ec905aea500a0b61000e3 (diff)
downloadboltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar
boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar.gz
boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar.bz2
boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.tar.xz
boltzgen-4a2885ad3ae0396486d288df94339d0c45e6db8b.zip
Implement basic CUDA target
Currently only for the SSS streaming pattern. CudaCodePrinter in `utility/printer.py` is required to add a 'f' suffix to all single precision floating point literals. If this is not done (when targeting single precision) most calculations happen in double precision which destroys performance. (In OpenCL this is not necessary as we can simply set the `-cl-single-precision-constant` flag. Sadly such a flag doesn't seem to exist for nvcc.)
Diffstat (limited to 'boltzgen/kernel/template')
-rw-r--r--boltzgen/kernel/template/bounce_back_boundary.cuda.mako16
-rw-r--r--boltzgen/kernel/template/collect_moments.cuda.mako20
-rw-r--r--boltzgen/kernel/template/collide_and_stream.cuda.mako17
-rw-r--r--boltzgen/kernel/template/equilibrilize.cuda.mako11
-rw-r--r--boltzgen/kernel/template/momenta_boundary.cuda.mako39
-rw-r--r--boltzgen/kernel/template/pattern/SSS.cuda.mako81
-rw-r--r--boltzgen/kernel/template/update_sss_control_structure.cpp.mako5
-rw-r--r--boltzgen/kernel/template/update_sss_control_structure.cuda.mako21
8 files changed, 206 insertions, 4 deletions
diff --git a/boltzgen/kernel/template/bounce_back_boundary.cuda.mako b/boltzgen/kernel/template/bounce_back_boundary.cuda.mako
new file mode 100644
index 0000000..09499e2
--- /dev/null
+++ b/boltzgen/kernel/template/bounce_back_boundary.cuda.mako
@@ -0,0 +1,16 @@
+<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/>
+<%
+from boltzgen.utility.printer import CudaCodePrinter
+ccode = CudaCodePrinter(float_type).doprint
+subexpr, assignment = model.collision(f_eq = model.equilibrium(resolve_moments = True))
+%>
+
+<%call expr="pattern.operator('bounce_back_boundary')">
+% for i, expr in enumerate(subexpr):
+ const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, c_i in enumerate(descriptor.c):
+ const ${float_type} ${assignment[i].lhs} = ${ccode(assignment[descriptor.c.index(-c_i)].rhs)};
+% endfor
+</%call>
diff --git a/boltzgen/kernel/template/collect_moments.cuda.mako b/boltzgen/kernel/template/collect_moments.cuda.mako
new file mode 100644
index 0000000..36f03b0
--- /dev/null
+++ b/boltzgen/kernel/template/collect_moments.cuda.mako
@@ -0,0 +1,20 @@
+<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/>
+<%
+from boltzgen.utility.printer import CudaCodePrinter
+ccode = CudaCodePrinter(float_type).doprint
+moments_subexpr, moments_assignment = model.moments()
+%>
+
+<%call expr="pattern.functor('collect_moments', [('%s*' % float_type, 'rho'), ('%s*' % float_type, 'u')])">
+% for i, expr in enumerate(moments_subexpr):
+ const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(moments_assignment):
+% if i == 0:
+ rho[gid] = ${ccode(expr.rhs)};
+% else:
+ u[gid*${descriptor.d} + ${i-1}] = ${ccode(expr.rhs)};
+% endif
+% endfor
+</%call>
diff --git a/boltzgen/kernel/template/collide_and_stream.cuda.mako b/boltzgen/kernel/template/collide_and_stream.cuda.mako
new file mode 100644
index 0000000..b46cf0f
--- /dev/null
+++ b/boltzgen/kernel/template/collide_and_stream.cuda.mako
@@ -0,0 +1,17 @@
+<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/>
+<%
+from boltzgen.utility.printer import CudaCodePrinter
+ccode = CudaCodePrinter(float_type).doprint
+subexpr, assignment = model.collision(f_eq = model.equilibrium(resolve_moments = True))
+
+%>
+
+<%call expr="pattern.operator('collide_and_stream')">
+% for i, expr in enumerate(subexpr):
+ const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(assignment):
+ const ${float_type} ${ccode(expr)}
+% endfor
+</%call>
diff --git a/boltzgen/kernel/template/equilibrilize.cuda.mako b/boltzgen/kernel/template/equilibrilize.cuda.mako
new file mode 100644
index 0000000..2f4cdee
--- /dev/null
+++ b/boltzgen/kernel/template/equilibrilize.cuda.mako
@@ -0,0 +1,11 @@
+<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/>
+<%
+from boltzgen.utility.printer import CudaCodePrinter
+ccode = CudaCodePrinter(float_type).doprint
+%>
+
+<%call expr="pattern.operator('equilibrilize')">
+% for i, w_i in enumerate(descriptor.w):
+ const ${float_type} f_next_${i} = ${ccode(w_i.evalf())};
+% endfor
+</%call>
diff --git a/boltzgen/kernel/template/momenta_boundary.cuda.mako b/boltzgen/kernel/template/momenta_boundary.cuda.mako
new file mode 100644
index 0000000..408a12d
--- /dev/null
+++ b/boltzgen/kernel/template/momenta_boundary.cuda.mako
@@ -0,0 +1,39 @@
+<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/>
+<%
+from boltzgen.utility.printer import CudaCodePrinter
+ccode = CudaCodePrinter(float_type).doprint
+moments_subexpr, moments_assignment = model.moments()
+collision_subexpr, collision_assignment = model.collision(f_eq = model.equilibrium(resolve_moments = False))
+%>
+
+<%def name="momenta_boundary(name, params)">
+<%call expr="pattern.operator('%s_momenta_boundary' % name, params)">
+% for i, expr in enumerate(moments_subexpr):
+ const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+ ${caller.body()}
+
+% for i, expr in enumerate(collision_subexpr):
+ const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(collision_assignment):
+ const ${float_type} ${ccode(expr)}
+% endfor
+</%call>
+</%def>
+
+<%call expr="momenta_boundary('velocity', list(map(lambda i: (float_type, 'velocity_%d' % i), range(descriptor.d))))">
+ const ${float_type} ${ccode(moments_assignment[0])}
+% for i, expr in enumerate(moments_assignment[1:]):
+ const ${float_type} ${expr.lhs} = velocity_${i};
+% endfor
+</%call>
+
+<%call expr="momenta_boundary('density', [(float_type, 'density')])">
+ const ${float_type} ${moments_assignment[0].lhs} = density;
+% for i, expr in enumerate(moments_assignment[1:]):
+ const ${float_type} ${ccode(expr)}
+% endfor
+</%call>
diff --git a/boltzgen/kernel/template/pattern/SSS.cuda.mako b/boltzgen/kernel/template/pattern/SSS.cuda.mako
new file mode 100644
index 0000000..8f571c7
--- /dev/null
+++ b/boltzgen/kernel/template/pattern/SSS.cuda.mako
@@ -0,0 +1,81 @@
+<%def name="operator(name, params = None)">
+<%
+if layout.__class__.__name__ != 'SOA':
+ raise Exception('SSS pattern only works for the AOS memory layout')
+%>
+__global__ void ${name}(
+ ${float_type}** f
+% if 'cell_list_dispatch' in extras:
+ , std::size_t* cells
+ , std::size_t cell_count
+% else:
+ , std::size_t gid
+% endif
+% if params is not None:
+% for param_type, param_name in params:
+ , ${param_type} ${param_name}
+% endfor
+% endif
+) {
+% if 'cell_list_dispatch' in extras:
+ const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (!(index < cell_count)) {
+ return;
+ }
+ const std::size_t gid = cells[index];
+% endif
+
+% for i, c_i in enumerate(descriptor.c):
+ ${float_type}* preshifted_f_${i} = f[${i}] + ${layout.cell_preshift('gid')};
+% endfor
+
+% for i, c_i in enumerate(descriptor.c):
+ const ${float_type} f_curr_${i} = *preshifted_f_${i};
+% endfor
+
+ ${caller.body()}
+
+% for i, c_i in enumerate(descriptor.c):
+ *preshifted_f_${i} = f_next_${descriptor.c.index(-c_i)};
+% endfor
+}
+</%def>
+
+<%def name="functor(name, params = None)">
+<%
+if layout.__class__.__name__ != 'SOA':
+ raise Exception('SSS pattern only works for the AOS memory layout')
+%>
+__global__ void ${name}(
+ ${float_type}** f
+% if 'cell_list_dispatch' in extras:
+ , std::size_t* cells
+ , std::size_t cell_count
+% else:
+ , std::size_t gid
+% endif
+% if params is not None:
+% for param_type, param_name in params:
+ , ${param_type} ${param_name}
+% endfor
+% endif
+) {
+% if 'cell_list_dispatch' in extras:
+ const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (!(index < cell_count)) {
+ return;
+ }
+ const std::size_t gid = cells[index];
+% endif
+
+% for i, c_i in enumerate(descriptor.c):
+ const ${float_type}* preshifted_f_${i} = f[${i}] + ${layout.cell_preshift('gid')};
+% endfor
+
+% for i, c_i in enumerate(descriptor.c):
+ const ${float_type} f_curr_${i} = *preshifted_f_${descriptor.c.index(-c_i)};
+% endfor
+
+ ${caller.body()}
+}
+</%def>
diff --git a/boltzgen/kernel/template/update_sss_control_structure.cpp.mako b/boltzgen/kernel/template/update_sss_control_structure.cpp.mako
index 9197022..b7661a3 100644
--- a/boltzgen/kernel/template/update_sss_control_structure.cpp.mako
+++ b/boltzgen/kernel/template/update_sss_control_structure.cpp.mako
@@ -7,9 +7,6 @@ void update_sss_control_structure(${float_type}** f) {
${float_type}* f_old_${i} = f[${i}];
% endfor
% for i, c_i in enumerate(descriptor.c):
- f[${i}] = f_old_${descriptor.c.index(-c_i)};
-% endfor
-% for i, c_i in enumerate(descriptor.c):
- f[${i}] += ${layout.neighbor_offset(-c_i)};
+ f[${i}] = f_old_${descriptor.c.index(-c_i)} + ${layout.neighbor_offset(-c_i)};
% endfor
}
diff --git a/boltzgen/kernel/template/update_sss_control_structure.cuda.mako b/boltzgen/kernel/template/update_sss_control_structure.cuda.mako
new file mode 100644
index 0000000..2a98b94
--- /dev/null
+++ b/boltzgen/kernel/template/update_sss_control_structure.cuda.mako
@@ -0,0 +1,21 @@
+<%
+if streaming != 'SSS':
+ raise Exception('"update_sss_control_structure" function only makes sense for the SSS pattern')
+
+padding = (max(geometry.size_x,geometry.size_y,geometry.size_z)+1)**(descriptor.d-1)
+%>
+
+__global__ void init_sss_control_structure(${float_type}* f, ${float_type}** control) {
+% for i, c_i in enumerate(descriptor.c):
+ control[${i}] = f + ${padding + layout.pop_offset(i, 2*padding)};
+% endfor
+}
+
+__global__ void update_sss_control_structure(${float_type}** f) {
+% for i, c_i in enumerate(descriptor.c):
+ ${float_type}* f_old_${i} = f[${i}];
+% endfor
+% for i, c_i in enumerate(descriptor.c):
+ f[${i}] = f_old_${descriptor.c.index(-c_i)} + ${layout.neighbor_offset(-c_i)};
+% endfor
+}