From 4a2885ad3ae0396486d288df94339d0c45e6db8b Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Sun, 10 Nov 2019 21:14:07 +0100
Subject: Implement basic CUDA target

Currently only for the SSS streaming pattern.

CudaCodePrinter in `utility/printer.py` is required to add an 'f' suffix
to all single precision floating point literals. If this is not done
(when targeting single precision), most calculations happen in double
precision, which destroys performance. (In OpenCL this is not necessary
as we can simply set the `-cl-single-precision-constant` flag. Sadly,
such a flag doesn't seem to exist for nvcc.)
---
 boltzgen/kernel/template/collect_moments.cuda.mako | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 boltzgen/kernel/template/collect_moments.cuda.mako

(limited to 'boltzgen/kernel/template/collect_moments.cuda.mako')

diff --git a/boltzgen/kernel/template/collect_moments.cuda.mako b/boltzgen/kernel/template/collect_moments.cuda.mako
new file mode 100644
index 0000000..36f03b0
--- /dev/null
+++ b/boltzgen/kernel/template/collect_moments.cuda.mako
@@ -0,0 +1,20 @@
+<%namespace name="pattern" file="${'/pattern/%s.cuda.mako' % context['streaming']}"/>
+<%
+from boltzgen.utility.printer import CudaCodePrinter
+to_cuda = CudaCodePrinter(float_type).doprint  # renders one expression as CUDA source text
+subexprs, assignments = model.moments()  # const helper terms, then the moment assignments
+%>
+
+<%call expr="pattern.functor('collect_moments', [('%s*' % float_type, 'rho'), ('%s*' % float_type, 'u')])">
+% for sub in subexprs:
+    const ${float_type} ${sub[0]} = ${to_cuda(sub[1])};
+% endfor
+
+% for idx, eq in enumerate(assignments):
+%   if idx > 0:
+    u[gid*${descriptor.d} + ${idx-1}] = ${to_cuda(eq.rhs)};
+%   else:
+    rho[gid] = ${to_cuda(eq.rhs)};
+%   endif
+% endfor
+</%call>
-- 
cgit v1.2.3