From 18c54d79699db7554faa851c87d7113db67a8a08 Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Sun, 27 Oct 2019 14:05:21 +0100
Subject: Separate functions into separate template files

Selection of the desired templates is possible via a new `functions` parameter.
---
 boltzgen.py                                        |  15 +-
 boltzgen/kernel/generator.py                       |  46 ++--
 boltzgen/kernel/template/basic.cl.mako             |  84 -------
 boltzgen/kernel/template/basic.cpp.mako            | 280 ---------------------
 boltzgen/kernel/template/collect_moments.cl.mako   |  19 ++
 boltzgen/kernel/template/collect_moments.cpp.mako  |  24 ++
 .../kernel/template/collide_and_stream.cl.mako     |  31 +++
 .../kernel/template/collide_and_stream.cpp.mako    |  32 +++
 boltzgen/kernel/template/equilibrilize.cl.mako     |  13 +
 boltzgen/kernel/template/equilibrilize.cpp.mako    |  13 +
 boltzgen/kernel/template/example.cpp.mako          | 172 +++++++++++++
 boltzgen/kernel/template/momenta_boundary.cpp.mako |  35 +++
 boltzgen/kernel/template/preamble.cl.mako          |   7 +
 13 files changed, 381 insertions(+), 390 deletions(-)
 delete mode 100644 boltzgen/kernel/template/basic.cl.mako
 delete mode 100644 boltzgen/kernel/template/basic.cpp.mako
 create mode 100644 boltzgen/kernel/template/collect_moments.cl.mako
 create mode 100644 boltzgen/kernel/template/collect_moments.cpp.mako
 create mode 100644 boltzgen/kernel/template/collide_and_stream.cl.mako
 create mode 100644 boltzgen/kernel/template/collide_and_stream.cpp.mako
 create mode 100644 boltzgen/kernel/template/equilibrilize.cl.mako
 create mode 100644 boltzgen/kernel/template/equilibrilize.cpp.mako
 create mode 100644 boltzgen/kernel/template/example.cpp.mako
 create mode 100644 boltzgen/kernel/template/momenta_boundary.cpp.mako
 create mode 100644 boltzgen/kernel/template/preamble.cl.mako

diff --git a/boltzgen.py b/boltzgen.py
index 2b32661..5224942 100755
--- a/boltzgen.py
+++ b/boltzgen.py
@@ -15,7 +15,8 @@ argparser.add_argument('--geometry',  required = True, help = 'Size of the block
 argparser.add_argument('--tau',       required = True, help = 'BGK relaxation time')
 
 argparser.add_argument('--disable-cse', action = 'store_const', const = True, help = 'Disable common subexpression elimination')
-argparser.add_argument('--extra', action = 'append', nargs = '+', default = [], help = 'Additional generator parameters')
+argparser.add_argument('--functions',   action = 'append', nargs = '+', default = [], help = 'Function templates to be generated')
+argparser.add_argument('--extras',      action = 'append', nargs = '+', default = [], help = 'Additional generator parameters')
 
 args = argparser.parse_args()
 
@@ -29,5 +30,15 @@ generator = Generator(
 
 geometry = Geometry.parse(args.geometry)
 
-src = generator.kernel(args.language, args.precision, args.layout, geometry, sum(args.extra, []))
+functions = sum(args.functions, [])
+if len(functions) == 0:
+    functions += ['default']
+if 'default' in functions:
+    for f in ['collide_and_stream', 'equilibrilize', 'collect_moments', 'momenta_boundary']:
+        functions.insert(functions.index('default'), f)
+    functions.remove('default')
+
+extras = sum(args.extras, [])
+
+src = generator.kernel(args.language, args.precision, args.layout, geometry, functions, extras)
 print(src)
diff --git a/boltzgen/kernel/generator.py b/boltzgen/kernel/generator.py
index 8d0de37..04b0a64 100644
--- a/boltzgen/kernel/generator.py
+++ b/boltzgen/kernel/generator.py
@@ -7,25 +7,15 @@ import kernel.target.cl
 import kernel.target.cpp
 
 class Generator:
-    def __init__(self, descriptor, moments, collision, boundary = ''):
+    def __init__(self, descriptor, moments, collision):
         self.descriptor = descriptor
         self.moments    = moments
         self.collision  = collision
-        self.boundary   = boundary
 
-    def kernel(self, target, precision, layout, geometry, extras = []):
-        template_path = Path(__file__).parent/('template/basic.' + target + '.mako')
+    def instantiate(self, target, template, float_type, layout_impl, geometry, extras = []):
+        template_path = Path(__file__).parent/("template/%s.%s.mako" % (template, target))
         if not template_path.exists():
-            raise Exception("Target '%s' not supported" % target)
-
-        layout_impl = eval("kernel.target.%s.%s" % (target, layout))
-        if layout_impl is None:
-            raise Exception("Target '%s' doesn't support layout '%s'" % (target, layout))
-        else:
-            layout_impl = layout_impl(self.descriptor, geometry)
-
-        if geometry.dimension() != self.descriptor.d:
-            raise Exception('Geometry dimension must match descriptor dimension')
+            raise Exception("Target '%s' doesn't provide '%s'" % (target, template))
 
         return Template(filename = str(template_path)).render(
             descriptor = self.descriptor,
@@ -38,16 +28,24 @@ class Generator:
             collision_assignment = self.collision[1],
             ccode = sympy.ccode,
 
-            float_type = {
-                'single': 'float',
-                'double': 'double'
-            }.get(precision),
-
-            boundary_src = Template(self.boundary).render(
-                descriptor = self.descriptor,
-                geometry   = geometry,
-                float_type = precision
-            ),
+            float_type = float_type,
 
             extras = extras
         )
+
+    def kernel(self, target, precision, layout, geometry, functions = ['collide_and_stream'], extras = []):
+        layout_impl = eval("kernel.target.%s.%s" % (target, layout))
+        if layout_impl is None:
+            raise Exception("Target '%s' doesn't support layout '%s'" % (target, layout))
+        else:
+            layout_impl = layout_impl(self.descriptor, geometry)
+
+        if geometry.dimension() != self.descriptor.d:
+            raise Exception('Geometry dimension must match descriptor dimension')
+
+        float_type = {
+            'single': 'float',
+            'double': 'double'
+        }.get(precision)
+
+        return "\n".join(map(lambda f: self.instantiate(target, f, float_type, layout_impl, geometry, extras), functions))
diff --git a/boltzgen/kernel/template/basic.cl.mako b/boltzgen/kernel/template/basic.cl.mako
deleted file mode 100644
index b64a480..0000000
--- a/boltzgen/kernel/template/basic.cl.mako
+++ /dev/null
@@ -1,84 +0,0 @@
-% if float_type == 'double':
-#if defined(cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#elif defined(cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64 : enable
-#endif
-% endif
-
-__kernel void equilibrilize(__global ${float_type}* f_next,
-                            __global ${float_type}* f_prev)
-{
-    const unsigned int gid = ${layout.gid()};
-
-    __global ${float_type}* preshifted_f_next = f_next + gid;
-    __global ${float_type}* preshifted_f_prev = f_prev + gid;
-
-% for i, w_i in enumerate(descriptor.w):
-    preshifted_f_next[${layout.pop_offset(i)}] = ${w_i}.f;
-    preshifted_f_prev[${layout.pop_offset(i)}] = ${w_i}.f;
-% endfor
-}
-
-__kernel void collide_and_stream(__global ${float_type}* f_next,
-                                 __global ${float_type}* f_prev,
-                                 __global int* material,
-                                 unsigned int time)
-{
-    const unsigned int gid = ${layout.gid()};
-
-    const int m = material[gid];
-
-    if ( m == 0 ) {
-        return;
-    }
-
-    __global ${float_type}* preshifted_f_next = f_next + gid;
-    __global ${float_type}* preshifted_f_prev = f_prev + gid;
-
-% for i, c_i in enumerate(descriptor.c):
-    const ${float_type} f_curr_${i} = preshifted_f_prev[${layout.pop_offset(i) + layout.neighbor_offset(-c_i)}];
-% endfor
-
-% for i, expr in enumerate(moments_subexpr):
-    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
-% endfor
-
-% for i, expr in enumerate(moments_assignment):
-    ${float_type} ${ccode(expr)}
-% endfor
-
-  ${boundary_src}
-
-% for i, expr in enumerate(collision_subexpr):
-    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
-% endfor
-
-% for i, expr in enumerate(collision_assignment):
-    const ${float_type} ${ccode(expr)}
-% endfor
-
-% for i in range(0,descriptor.q):
-    preshifted_f_next[${layout.pop_offset(i)}] = f_next_${i};
-% endfor
-}
-
-__kernel void collect_moments(__global ${float_type}* f,
-                              __global ${float_type}* moments)
-{
-    const unsigned int gid = ${layout.gid()};
-
-    __global ${float_type}* preshifted_f = f + gid;
-
-% for i in range(0,descriptor.q):
-    const ${float_type} f_curr_${i} = preshifted_f[${layout.pop_offset(i)}];
-% endfor
-
-% for i, expr in enumerate(moments_subexpr):
-    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
-% endfor
-
-% for i, expr in enumerate(moments_assignment):
-    moments[${layout.pop_offset(i)} + gid] = ${ccode(expr.rhs)};
-% endfor
-}
diff --git a/boltzgen/kernel/template/basic.cpp.mako b/boltzgen/kernel/template/basic.cpp.mako
deleted file mode 100644
index 118ef8c..0000000
--- a/boltzgen/kernel/template/basic.cpp.mako
+++ /dev/null
@@ -1,280 +0,0 @@
-void equilibrilize(${float_type}* f_next,
-                   ${float_type}* f_prev,
-                   std::size_t gid)
-{
-    ${float_type}* preshifted_f_next = f_next + gid*${layout.gid_offset()};
-    ${float_type}* preshifted_f_prev = f_prev + gid*${layout.gid_offset()};
-
-% for i, w_i in enumerate(descriptor.w):
-    preshifted_f_next[${layout.pop_offset(i)}] = ${w_i.evalf()};
-    preshifted_f_prev[${layout.pop_offset(i)}] = ${w_i.evalf()};
-% endfor
-}
-
-void collide_and_stream(      ${float_type}* f_next,
-                        const ${float_type}* f_prev,
-                        std::size_t gid)
-{
-          ${float_type}* preshifted_f_next = f_next + gid*${layout.gid_offset()};
-    const ${float_type}* preshifted_f_prev = f_prev + gid*${layout.gid_offset()};
-
-% for i, c_i in enumerate(descriptor.c):
-    const ${float_type} f_curr_${i} = preshifted_f_prev[${layout.pop_offset(i) + layout.neighbor_offset(-c_i)}];
-% endfor
-
-% for i, expr in enumerate(moments_subexpr):
-    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
-% endfor
-
-% for i, expr in enumerate(moments_assignment):
-    ${float_type} ${ccode(expr)}
-% endfor
-
-% for i, expr in enumerate(collision_subexpr):
-    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
-% endfor
-
-% for i, expr in enumerate(collision_assignment):
-    const ${float_type} ${ccode(expr)}
-% endfor
-
-% for i, expr in enumerate(collision_assignment):
-    preshifted_f_next[${layout.pop_offset(i)}] = f_next_${i};
-% endfor
-}
-
-void velocity_momenta_boundary(      ${float_type}* f_next,
-                               const ${float_type}* f_prev,
-                               std::size_t gid,
-                               ${float_type} velocity[${descriptor.d}])
-{
-          ${float_type}* preshifted_f_next = f_next + gid*${layout.gid_offset()};
-    const ${float_type}* preshifted_f_prev = f_prev + gid*${layout.gid_offset()};
-
-% for i, c_i in enumerate(descriptor.c):
-    const ${float_type} f_curr_${i} = preshifted_f_prev[${layout.pop_offset(i) + layout.neighbor_offset(-c_i)}];
-% endfor
-
-% for i, expr in enumerate(moments_subexpr):
-    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
-% endfor
-
-% for i, expr in enumerate(moments_assignment):
-    ${float_type} ${ccode(expr)}
-% endfor
-% for i, expr in enumerate(moments_assignment[1:]):
-    ${expr.lhs} = velocity[${i}];
-% endfor
-
-% for i, expr in enumerate(collision_subexpr):
-    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
-% endfor
-
-% for i, expr in enumerate(collision_assignment):
-    const ${float_type} ${ccode(expr)}
-% endfor
-
-% for i, expr in enumerate(collision_assignment):
-    preshifted_f_next[${layout.pop_offset(i)}] = f_next_${i};
-% endfor
-}
-
-void collect_moments(const ${float_type}* f,
-                     std::size_t gid,
-                     ${float_type}& rho,
-                     ${float_type} u[${descriptor.d}])
-{
-    const ${float_type}* preshifted_f = f + gid*${layout.gid_offset()};
-
-% for i in range(0,descriptor.q):
-    const ${float_type} f_curr_${i} = preshifted_f[${layout.pop_offset(i)}];
-% endfor
-
-% for i, expr in enumerate(moments_subexpr):
-    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
-% endfor
-
-% for i, expr in enumerate(moments_assignment):
-%   if i == 0:
-    rho = ${ccode(expr.rhs)};
-%   else:
-    u[${i-1}] = ${ccode(expr.rhs)};
-%   endif
-% endfor
-}
-
-% if 'moments_vtk' in extras:
-void collect_moments_to_vtk(const std::string& path, ${float_type}* f) {
-    std::ofstream fout;
-    fout.open(path.c_str());
-
-    fout << "# vtk DataFile Version 3.0\n";
-    fout << "lbm_output\n";
-    fout << "ASCII\n";
-    fout << "DATASET RECTILINEAR_GRID\n";
-% if descriptor.d == 2:
-    fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " 1" << "\n";
-% else:
-    fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " " << ${geometry.size_z-2} << "\n";
-% endif
-
-    fout << "X_COORDINATES " << ${geometry.size_x-2} << " float\n";
-    for( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
-        fout << x << " ";
-    }
-
-    fout << "\nY_COORDINATES " << ${geometry.size_y-2} << " float\n";
-    for( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
-        fout << y << " ";
-    }
-
-% if descriptor.d == 2:
-    fout << "\nZ_COORDINATES " << 1 << " float\n";
-    fout << 0 << "\n";
-    fout << "POINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2)} << "\n";
-% else:
-    fout << "\nZ_COORDINATES " << ${geometry.size_z-2} << " float\n";
-    for( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) {
-        fout << z << " ";
-    }
-    fout << "\nPOINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2) * (geometry.size_z-2)} << "\n";
-% endif
-
-    ${float_type} rho;
-    ${float_type} u[${descriptor.d}];
-
-    fout << "VECTORS velocity float\n";
-% if descriptor.d == 2:
-    for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
-        for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
-            collect_moments(f, x*${geometry.size_y}+y, rho, u);
-            fout << u[0] << " " << u[1] << " 0\n";
-        }
-    }
-% else:
-    for ( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) {
-        for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
-            for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
-                collect_moments(f, x*${geometry.size_y*geometry.size_z}+y*${geometry.size_z}+z, rho, u);
-                fout << u[0] << " " << u[1] << " " << u[2] << "\n";
-            }
-        }
-    }
-% endif
-
-    fout << "SCALARS density float 1\n";
-    fout << "LOOKUP_TABLE default\n";
-% if descriptor.d == 2:
-    for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
-        for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
-            collect_moments(f, x*${geometry.size_y}+y, rho, u);
-            fout << rho << "\n";
-        }
-    }
-% else:
-    for ( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) {
-        for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
-            for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
-                collect_moments(f, x*${geometry.size_y*geometry.size_z}+y*${geometry.size_z}+z, rho, u);
-                fout << rho << "\n";
-            }
-        }
-    }
-% endif
-
-    fout.close();
-}
-% endif
-
-% if 'test_ldc' in extras:
-void test_ldc(std::size_t nStep)
-{
-    auto f_a = std::make_unique<${float_type}[]>(${geometry.volume*descriptor.q});
-    auto f_b = std::make_unique<${float_type}[]>(${geometry.volume*descriptor.q});
-
-    ${float_type}* f_prev = f_a.get();
-    ${float_type}* f_next = f_b.get();
-
-    std::vector<std::size_t> bulk;
-    std::vector<std::size_t> lid_bc;
-    std::vector<std::size_t> box_bc;
-
-    for (int iX = 1; iX < ${geometry.size_x-1}; ++iX) {
-        for (int iY = 1; iY < ${geometry.size_y-1}; ++iY) {
-% if descriptor.d == 2:
-            const std::size_t iCell = iX*${geometry.size_y} + iY;
-            if (iY == ${geometry.size_y-2}) {
-                lid_bc.emplace_back(iCell);
-            } else if (iX == 1 || iX == ${geometry.size_x-2} || iY == 1) {
-                box_bc.emplace_back(iCell);
-            } else {
-                bulk.emplace_back(iCell);
-            }
-% elif descriptor.d == 3:
-            for (int iZ = 0; iZ < ${geometry.size_z}; ++iZ) {
-                const std::size_t iCell = iX*${geometry.size_y*geometry.size_z} + iY*${geometry.size_z} + iZ;
-                if (iZ == ${geometry.size_z-2}) {
-                    lid_bc.emplace_back(iCell);
-                } else if (iX == 1 || iX == ${geometry.size_x-2} || iY == 1 || iY == ${geometry.size_y-2} || iZ == 1) {
-                    box_bc.emplace_back(iCell);
-                } else {
-                    bulk.emplace_back(iCell);
-                }
-            }
-% endif
-        }
-    }
-
-    for (std::size_t iCell = 0; iCell < ${geometry.volume}; ++iCell) {
-        equilibrilize(f_prev, f_next, iCell);
-    }
-
-    const auto start = std::chrono::high_resolution_clock::now();
-
-    for (std::size_t iStep = 0; iStep < nStep; ++iStep) {
-        if (iStep % 2 == 0) {
-            f_next = f_a.get();
-            f_prev = f_b.get();
-        } else {
-            f_next = f_b.get();
-            f_prev = f_a.get();
-        }
-
-% if 'omp_parallel_for' in extras:
-#pragma omp parallel for
-% endif
-        for (std::size_t i = 0; i < bulk.size(); ++i) {
-            collide_and_stream(f_next, f_prev, bulk[i]);
-        }
-        ${float_type} u[${descriptor.d}] { 0. };
-% if 'omp_parallel_for' in extras:
-#pragma omp parallel for
-% endif
-        for (std::size_t i = 0; i < box_bc.size(); ++i) {
-            velocity_momenta_boundary(f_next, f_prev, box_bc[i], u);
-        }
-        u[0] = 0.05;
-% if 'omp_parallel_for' in extras:
-#pragma omp parallel for
-% endif
-        for (std::size_t i = 0; i < lid_bc.size(); ++i) {
-            velocity_momenta_boundary(f_next, f_prev, lid_bc[i], u);
-        }
-    }
-
-    auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(
-        std::chrono::high_resolution_clock::now() - start);
-
-    std::cout << "#bulk   : " << bulk.size()   << std::endl;
-    std::cout << "#lid    : " << lid_bc.size() << std::endl;
-    std::cout << "#wall   : " << box_bc.size() << std::endl;
-    std::cout << "#steps  : " << nStep         << std::endl;
-    std::cout << std::endl;
-    std::cout << "MLUPS   : " << nStep*${geometry.volume}/(1e6*duration.count()) << std::endl;
-
-% if 'moments_vtk' in extras:
-    collect_moments_to_vtk("test.vtk", f_next);
-% endif
-}
-% endif
-
diff --git a/boltzgen/kernel/template/collect_moments.cl.mako b/boltzgen/kernel/template/collect_moments.cl.mako
new file mode 100644
index 0000000..b07b759
--- /dev/null
+++ b/boltzgen/kernel/template/collect_moments.cl.mako
@@ -0,0 +1,19 @@
+__kernel void collect_moments(__global ${float_type}* f,
+                              __global ${float_type}* moments)
+{
+    const unsigned int gid = ${layout.gid()};
+
+    __global ${float_type}* preshifted_f = f + gid;
+
+% for i in range(0,descriptor.q):
+    const ${float_type} f_curr_${i} = preshifted_f[${layout.pop_offset(i)}];
+% endfor
+
+% for i, expr in enumerate(moments_subexpr):
+    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(moments_assignment):
+    moments[${layout.pop_offset(i)} + gid] = ${ccode(expr.rhs)};
+% endfor
+}
diff --git a/boltzgen/kernel/template/collect_moments.cpp.mako b/boltzgen/kernel/template/collect_moments.cpp.mako
new file mode 100644
index 0000000..8c37db2
--- /dev/null
+++ b/boltzgen/kernel/template/collect_moments.cpp.mako
@@ -0,0 +1,24 @@
+void collect_moments(const ${float_type}* f,
+                     std::size_t gid,
+                     ${float_type}& rho,
+                     ${float_type} u[${descriptor.d}])
+{
+    const ${float_type}* preshifted_f = f + gid*${layout.gid_offset()};
+
+% for i in range(0,descriptor.q):
+    const ${float_type} f_curr_${i} = preshifted_f[${layout.pop_offset(i)}];
+% endfor
+
+% for i, expr in enumerate(moments_subexpr):
+    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(moments_assignment):
+%   if i == 0:
+    rho = ${ccode(expr.rhs)};
+%   else:
+    u[${i-1}] = ${ccode(expr.rhs)};
+%   endif
+% endfor
+}
+
diff --git a/boltzgen/kernel/template/collide_and_stream.cl.mako b/boltzgen/kernel/template/collide_and_stream.cl.mako
new file mode 100644
index 0000000..28cfa57
--- /dev/null
+++ b/boltzgen/kernel/template/collide_and_stream.cl.mako
@@ -0,0 +1,31 @@
+__kernel void collide_and_stream(__global ${float_type}* f_next,
+                                 __global ${float_type}* f_prev,
+                                 unsigned int gid)
+{
+    __global ${float_type}* preshifted_f_next = f_next + gid;
+    __global ${float_type}* preshifted_f_prev = f_prev + gid;
+
+% for i, c_i in enumerate(descriptor.c):
+    const ${float_type} f_curr_${i} = preshifted_f_prev[${layout.pop_offset(i) + layout.neighbor_offset(-c_i)}];
+% endfor
+
+% for i, expr in enumerate(moments_subexpr):
+    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(moments_assignment):
+    ${float_type} ${ccode(expr)}
+% endfor
+
+% for i, expr in enumerate(collision_subexpr):
+    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(collision_assignment):
+    const ${float_type} ${ccode(expr)}
+% endfor
+
+% for i in range(0,descriptor.q):
+    preshifted_f_next[${layout.pop_offset(i)}] = f_next_${i};
+% endfor
+}
diff --git a/boltzgen/kernel/template/collide_and_stream.cpp.mako b/boltzgen/kernel/template/collide_and_stream.cpp.mako
new file mode 100644
index 0000000..71c62b7
--- /dev/null
+++ b/boltzgen/kernel/template/collide_and_stream.cpp.mako
@@ -0,0 +1,32 @@
+void collide_and_stream(      ${float_type}* f_next,
+                        const ${float_type}* f_prev,
+                        std::size_t gid)
+{
+          ${float_type}* preshifted_f_next = f_next + gid*${layout.gid_offset()};
+    const ${float_type}* preshifted_f_prev = f_prev + gid*${layout.gid_offset()};
+
+% for i, c_i in enumerate(descriptor.c):
+    const ${float_type} f_curr_${i} = preshifted_f_prev[${layout.pop_offset(i) + layout.neighbor_offset(-c_i)}];
+% endfor
+
+% for i, expr in enumerate(moments_subexpr):
+    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(moments_assignment):
+    ${float_type} ${ccode(expr)}
+% endfor
+
+% for i, expr in enumerate(collision_subexpr):
+    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(collision_assignment):
+    const ${float_type} ${ccode(expr)}
+% endfor
+
+% for i, expr in enumerate(collision_assignment):
+    preshifted_f_next[${layout.pop_offset(i)}] = f_next_${i};
+% endfor
+}
+
diff --git a/boltzgen/kernel/template/equilibrilize.cl.mako b/boltzgen/kernel/template/equilibrilize.cl.mako
new file mode 100644
index 0000000..aa2246c
--- /dev/null
+++ b/boltzgen/kernel/template/equilibrilize.cl.mako
@@ -0,0 +1,13 @@
+__kernel void equilibrilize(__global ${float_type}* f_next,
+                            __global ${float_type}* f_prev)
+{
+    const unsigned int gid = ${layout.gid()};
+
+    __global ${float_type}* preshifted_f_next = f_next + gid;
+    __global ${float_type}* preshifted_f_prev = f_prev + gid;
+
+% for i, w_i in enumerate(descriptor.w):
+    preshifted_f_next[${layout.pop_offset(i)}] = ${w_i}.f;
+    preshifted_f_prev[${layout.pop_offset(i)}] = ${w_i}.f;
+% endfor
+}
diff --git a/boltzgen/kernel/template/equilibrilize.cpp.mako b/boltzgen/kernel/template/equilibrilize.cpp.mako
new file mode 100644
index 0000000..9f082a1
--- /dev/null
+++ b/boltzgen/kernel/template/equilibrilize.cpp.mako
@@ -0,0 +1,13 @@
+void equilibrilize(${float_type}* f_next,
+                   ${float_type}* f_prev,
+                   std::size_t gid)
+{
+    ${float_type}* preshifted_f_next = f_next + gid*${layout.gid_offset()};
+    ${float_type}* preshifted_f_prev = f_prev + gid*${layout.gid_offset()};
+
+% for i, w_i in enumerate(descriptor.w):
+    preshifted_f_next[${layout.pop_offset(i)}] = ${w_i.evalf()};
+    preshifted_f_prev[${layout.pop_offset(i)}] = ${w_i.evalf()};
+% endfor
+}
+
diff --git a/boltzgen/kernel/template/example.cpp.mako b/boltzgen/kernel/template/example.cpp.mako
new file mode 100644
index 0000000..d979bfd
--- /dev/null
+++ b/boltzgen/kernel/template/example.cpp.mako
@@ -0,0 +1,172 @@
+% if 'moments_vtk' in extras:
+void collect_moments_to_vtk(const std::string& path, ${float_type}* f) {
+    std::ofstream fout;
+    fout.open(path.c_str());
+
+    fout << "# vtk DataFile Version 3.0\n";
+    fout << "lbm_output\n";
+    fout << "ASCII\n";
+    fout << "DATASET RECTILINEAR_GRID\n";
+% if descriptor.d == 2:
+    fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " 1" << "\n";
+% else:
+    fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " " << ${geometry.size_z-2} << "\n";
+% endif
+
+    fout << "X_COORDINATES " << ${geometry.size_x-2} << " float\n";
+    for( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
+        fout << x << " ";
+    }
+
+    fout << "\nY_COORDINATES " << ${geometry.size_y-2} << " float\n";
+    for( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
+        fout << y << " ";
+    }
+
+% if descriptor.d == 2:
+    fout << "\nZ_COORDINATES " << 1 << " float\n";
+    fout << 0 << "\n";
+    fout << "POINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2)} << "\n";
+% else:
+    fout << "\nZ_COORDINATES " << ${geometry.size_z-2} << " float\n";
+    for( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) {
+        fout << z << " ";
+    }
+    fout << "\nPOINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2) * (geometry.size_z-2)} << "\n";
+% endif
+
+    ${float_type} rho;
+    ${float_type} u[${descriptor.d}];
+
+    fout << "VECTORS velocity float\n";
+% if descriptor.d == 2:
+    for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
+        for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
+            collect_moments(f, x*${geometry.size_y}+y, rho, u);
+            fout << u[0] << " " << u[1] << " 0\n";
+        }
+    }
+% else:
+    for ( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) {
+        for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
+            for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
+                collect_moments(f, x*${geometry.size_y*geometry.size_z}+y*${geometry.size_z}+z, rho, u);
+                fout << u[0] << " " << u[1] << " " << u[2] << "\n";
+            }
+        }
+    }
+% endif
+
+    fout << "SCALARS density float 1\n";
+    fout << "LOOKUP_TABLE default\n";
+% if descriptor.d == 2:
+    for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
+        for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
+            collect_moments(f, x*${geometry.size_y}+y, rho, u);
+            fout << rho << "\n";
+        }
+    }
+% else:
+    for ( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) {
+        for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
+            for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
+                collect_moments(f, x*${geometry.size_y*geometry.size_z}+y*${geometry.size_z}+z, rho, u);
+                fout << rho << "\n";
+            }
+        }
+    }
+% endif
+
+    fout.close();
+}
+% endif
+
+void test_ldc(std::size_t nStep)
+{
+    auto f_a = std::make_unique<${float_type}[]>(${geometry.volume*descriptor.q});
+    auto f_b = std::make_unique<${float_type}[]>(${geometry.volume*descriptor.q});
+
+    ${float_type}* f_prev = f_a.get();
+    ${float_type}* f_next = f_b.get();
+
+    std::vector<std::size_t> bulk;
+    std::vector<std::size_t> lid_bc;
+    std::vector<std::size_t> box_bc;
+
+    for (int iX = 1; iX < ${geometry.size_x-1}; ++iX) {
+        for (int iY = 1; iY < ${geometry.size_y-1}; ++iY) {
+% if descriptor.d == 2:
+            const std::size_t iCell = iX*${geometry.size_y} + iY;
+            if (iY == ${geometry.size_y-2}) {
+                lid_bc.emplace_back(iCell);
+            } else if (iX == 1 || iX == ${geometry.size_x-2} || iY == 1) {
+                box_bc.emplace_back(iCell);
+            } else {
+                bulk.emplace_back(iCell);
+            }
+% elif descriptor.d == 3:
+            for (int iZ = 0; iZ < ${geometry.size_z}; ++iZ) {
+                const std::size_t iCell = iX*${geometry.size_y*geometry.size_z} + iY*${geometry.size_z} + iZ;
+                if (iZ == ${geometry.size_z-2}) {
+                    lid_bc.emplace_back(iCell);
+                } else if (iX == 1 || iX == ${geometry.size_x-2} || iY == 1 || iY == ${geometry.size_y-2} || iZ == 1) {
+                    box_bc.emplace_back(iCell);
+                } else {
+                    bulk.emplace_back(iCell);
+                }
+            }
+% endif
+        }
+    }
+
+    for (std::size_t iCell = 0; iCell < ${geometry.volume}; ++iCell) {
+        equilibrilize(f_prev, f_next, iCell);
+    }
+
+    const auto start = std::chrono::high_resolution_clock::now();
+
+    for (std::size_t iStep = 0; iStep < nStep; ++iStep) {
+        if (iStep % 2 == 0) {
+            f_next = f_a.get();
+            f_prev = f_b.get();
+        } else {
+            f_next = f_b.get();
+            f_prev = f_a.get();
+        }
+
+% if 'omp_parallel_for' in extras:
+#pragma omp parallel for
+% endif
+        for (std::size_t i = 0; i < bulk.size(); ++i) {
+            collide_and_stream(f_next, f_prev, bulk[i]);
+        }
+        ${float_type} u[${descriptor.d}] { 0. };
+% if 'omp_parallel_for' in extras:
+#pragma omp parallel for
+% endif
+        for (std::size_t i = 0; i < box_bc.size(); ++i) {
+            velocity_momenta_boundary(f_next, f_prev, box_bc[i], u);
+        }
+        u[0] = 0.05;
+% if 'omp_parallel_for' in extras:
+#pragma omp parallel for
+% endif
+        for (std::size_t i = 0; i < lid_bc.size(); ++i) {
+            velocity_momenta_boundary(f_next, f_prev, lid_bc[i], u);
+        }
+    }
+
+    auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(
+        std::chrono::high_resolution_clock::now() - start);
+
+    std::cout << "#bulk   : " << bulk.size()   << std::endl;
+    std::cout << "#lid    : " << lid_bc.size() << std::endl;
+    std::cout << "#wall   : " << box_bc.size() << std::endl;
+    std::cout << "#steps  : " << nStep         << std::endl;
+    std::cout << std::endl;
+    std::cout << "MLUPS   : " << nStep*${geometry.volume}/(1e6*duration.count()) << std::endl;
+
+% if 'moments_vtk' in extras:
+    collect_moments_to_vtk("test.vtk", f_next);
+% endif
+}
diff --git a/boltzgen/kernel/template/momenta_boundary.cpp.mako b/boltzgen/kernel/template/momenta_boundary.cpp.mako
new file mode 100644
index 0000000..d7e0dd2
--- /dev/null
+++ b/boltzgen/kernel/template/momenta_boundary.cpp.mako
@@ -0,0 +1,35 @@
+void velocity_momenta_boundary(      ${float_type}* f_next,
+                               const ${float_type}* f_prev,
+                               std::size_t gid,
+                               ${float_type} velocity[${descriptor.d}])
+{
+          ${float_type}* preshifted_f_next = f_next + gid*${layout.gid_offset()};
+    const ${float_type}* preshifted_f_prev = f_prev + gid*${layout.gid_offset()};
+
+% for i, c_i in enumerate(descriptor.c):
+    const ${float_type} f_curr_${i} = preshifted_f_prev[${layout.pop_offset(i) + layout.neighbor_offset(-c_i)}];
+% endfor
+
+% for i, expr in enumerate(moments_subexpr):
+    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(moments_assignment):
+    ${float_type} ${ccode(expr)}
+% endfor
+% for i, expr in enumerate(moments_assignment[1:]):
+    ${expr.lhs} = velocity[${i}];
+% endfor
+
+% for i, expr in enumerate(collision_subexpr):
+    const ${float_type} ${expr[0]} = ${ccode(expr[1])};
+% endfor
+
+% for i, expr in enumerate(collision_assignment):
+    const ${float_type} ${ccode(expr)}
+% endfor
+
+% for i, expr in enumerate(collision_assignment):
+    preshifted_f_next[${layout.pop_offset(i)}] = f_next_${i};
+% endfor
+}
diff --git a/boltzgen/kernel/template/preamble.cl.mako b/boltzgen/kernel/template/preamble.cl.mako
new file mode 100644
index 0000000..c47254d
--- /dev/null
+++ b/boltzgen/kernel/template/preamble.cl.mako
@@ -0,0 +1,7 @@
+% if float_type == 'double':
+#if defined(cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#endif
+% endif
-- 
cgit v1.2.3