From 988818c224197eb506ac2f42edef13d7bd1cc11a Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Sun, 10 Nov 2019 21:25:07 +0100
Subject: Add LDC example for CUDA target

---
 lid_driven_cavity/cuda/CMakeLists.txt    |  30 +++++
 lid_driven_cavity/cuda/config.py         |  15 +++
 lid_driven_cavity/cuda/generate.py       |  43 ++++++
 lid_driven_cavity/cuda/ldc.cuda.SSS.mako | 221 +++++++++++++++++++++++++++++++
 shell.nix                                |   5 +-
 5 files changed, 313 insertions(+), 1 deletion(-)
 create mode 100644 lid_driven_cavity/cuda/CMakeLists.txt
 create mode 100644 lid_driven_cavity/cuda/config.py
 create mode 100755 lid_driven_cavity/cuda/generate.py
 create mode 100644 lid_driven_cavity/cuda/ldc.cuda.SSS.mako
diff --git a/lid_driven_cavity/cuda/CMakeLists.txt b/lid_driven_cavity/cuda/CMakeLists.txt
new file mode 100644
index 0000000..7c920f2
--- /dev/null
+++ b/lid_driven_cavity/cuda/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.10)
+project(ldc LANGUAGES CXX)
+find_package(CUDA QUIET REQUIRED)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+add_custom_command(
+	OUTPUT
+		kernel.h
+		ldc.cu
+	COMMAND
+		${CMAKE_CURRENT_SOURCE_DIR}/generate.py --output ${CMAKE_CURRENT_BINARY_DIR}
+	WORKING_DIRECTORY
+		${CMAKE_CURRENT_SOURCE_DIR}
+	DEPENDS
+		generate.py
+		config.py
+		ldc.cuda.SSS.mako
+)
+
+include_directories(
+	${CMAKE_BINARY_DIR}
+)
+
+cuda_add_executable(
+	ldc
+		${CMAKE_CURRENT_BINARY_DIR}/ldc.cu
+)
diff --git a/lid_driven_cavity/cuda/config.py b/lid_driven_cavity/cuda/config.py
new file mode 100644
index 0000000..27000e3
--- /dev/null
+++ b/lid_driven_cavity/cuda/config.py
@@ -0,0 +1,15 @@
+from boltzgen.lbm.lattice import *
+from boltzgen import Geometry
+
+descriptor = D2Q9
+geometry   = Geometry(512, 512)
+tau        = 0.52
+precision  = 'single'
+streaming  = 'SSS'
+
+## 3D LDC
+#descriptor = D3Q19
+#geometry   = Geometry(64, 64, 64)
+#tau        = 0.6
+#precision  = 'single'
+#streaming  = 'SSS'
diff --git a/lid_driven_cavity/cuda/generate.py b/lid_driven_cavity/cuda/generate.py
new file mode 100755
index 0000000..27ce511
--- /dev/null
+++ b/lid_driven_cavity/cuda/generate.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import argparse
+
+from boltzgen import Generator, Geometry
+from boltzgen.lbm.model import BGK
+
+import config
+
+argparser = argparse.ArgumentParser(
+    description = 'Generate a CUDA implementation of a lid driven cavity simulation using LBM')
+argparser.add_argument(
+    '--output', required = False, help = 'Target directory for the generated sources')
+
+args = argparser.parse_args()
+
+generator = Generator(
+    model     = BGK(config.descriptor, tau = config.tau),
+    target    = 'cuda',
+    precision = config.precision,
+    streaming = config.streaming,
+    index     = 'XYZ',
+    layout    = 'SOA')
+
+if args.output is None:
+    args.output = '.'
+
+functions = ['collide_and_stream', 'equilibrilize', 'collect_moments', 'momenta_boundary']
+
+if config.streaming == 'SSS':
+    functions += ['update_sss_control_structure']
+
+extras = ['cell_list_dispatch']
+
+with open('%s/kernel.h' % args.output, 'w') as kernel:
+    kernel.write(generator.kernel(config.geometry, functions, extras))
+
+ldc_src = ''
+with open('ldc.cuda.%s.mako' % config.streaming, 'r') as template:
+    ldc_src = template.read()
+
+with open('%s/ldc.cu' % args.output, 'w') as app:
+    app.write(generator.custom(config.geometry, ldc_src))
diff --git a/lid_driven_cavity/cuda/ldc.cuda.SSS.mako b/lid_driven_cavity/cuda/ldc.cuda.SSS.mako
new file mode 100644
index 0000000..2a4aed0
--- /dev/null
+++ b/lid_driven_cavity/cuda/ldc.cuda.SSS.mako
@@ -0,0 +1,221 @@
+#include <cstdint>
+#include <memory>
+#include <vector>
+#include <chrono>
+#include <iostream>
+#include <fstream>
+
+#include "kernel.h"
+
+void write_moments_to_vtk(const std::string& path, ${float_type}* u) {
+    std::ofstream fout;
+    fout.open(path.c_str());
+
+    fout << "# vtk DataFile Version 3.0\n";
+    fout << "lbm_output\n";
+    fout << "ASCII\n";
+    fout << "DATASET RECTILINEAR_GRID\n";
+% if descriptor.d == 2:
+    fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " 1" << "\n";
+% else:
+    fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " " << ${geometry.size_z-2} << "\n";
+% endif
+
+    fout << "X_COORDINATES " << ${geometry.size_x-2} << " float\n";
+    for( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
+        fout << x << " ";
+    }
+
+    fout << "\nY_COORDINATES " << ${geometry.size_y-2} << " float\n";
+    for( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
+        fout << y << " ";
+    }
+
+% if descriptor.d == 2:
+    fout << "\nZ_COORDINATES " << 1 << " float\n";
+    fout << 0 << "\n";
+    fout << "POINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2)} << "\n";
+% else:
+    fout << "\nZ_COORDINATES " << ${geometry.size_z-2} << " float\n";
+    for( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) {
+        fout << z << " ";
+    }
+    fout << "\nPOINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2) * (geometry.size_z-2)} << "\n";
+% endif
+
+    fout << "VECTORS velocity float\n";
+% if descriptor.d == 2:
+    for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
+        for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
+            const std::size_t gid = x*${geometry.size_y}+y;
+            fout << u[gid*${descriptor.d}+0] << " " << u[gid*${descriptor.d}+1] << " 0\n";
+        }
+    }
+% else:
+    for ( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) {
+        for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) {
+            for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) {
+                const std::size_t gid = x*${geometry.size_y*geometry.size_z}+y*${geometry.size_z}+z;
+                fout << u[gid*${descriptor.d}+0] << " " << u[gid*${descriptor.d}+1] << " " << u[gid*${descriptor.d}+2] << "\n";
+            }
+        }
+    }
+% endif
+
+    fout.close();
+}
+
+void simulate(std::size_t nStep)
+{
+<%
+    padding = (max(geometry.size_x,geometry.size_y,geometry.size_z)+1)**(descriptor.d-1)
+%>
+    ${float_type}* f_aa;
+    cudaMalloc(&f_aa, ${(geometry.volume+2*padding)*descriptor.q}*sizeof(${float_type}));
+
+    ${float_type}** f;
+    cudaMalloc(&f, ${descriptor.q}*sizeof(${float_type}*));
+
+    ${float_type}* device_moments_rho;
+    cudaMalloc(&device_moments_rho, ${geometry.volume} * sizeof(${float_type}));
+    ${float_type}* device_moments_u;
+    cudaMalloc(&device_moments_u, ${geometry.volume*descriptor.d} * sizeof(${float_type}));
+    std::vector<${float_type}> moments_u(${geometry.volume*descriptor.d});
+
+    init_sss_control_structure<<<1,1>>>(f_aa, f);
+    cudaDeviceSynchronize();
+
+    std::vector<std::size_t> ghost;
+    std::vector<std::size_t> bulk;
+    std::vector<std::size_t> lid_bc;
+    std::vector<std::size_t> box_bc;
+
+    for (int iX = 0; iX < ${geometry.size_x}; ++iX) {
+        for (int iY = 0; iY < ${geometry.size_y}; ++iY) {
+% if descriptor.d == 2:
+            const std::size_t iCell = iX*${geometry.size_y} + iY;
+            if (iX == 0 || iY == 0 || iX == ${geometry.size_x-1} || iY == ${geometry.size_y-1}) {
+                ghost.emplace_back(iCell);
+            } else if (iY == ${geometry.size_y-2}) {
+                lid_bc.emplace_back(iCell);
+            } else if (iX == 1 || iX == ${geometry.size_x-2} || iY == 1) {
+                box_bc.emplace_back(iCell);
+            } else {
+                bulk.emplace_back(iCell);
+            }
+% elif descriptor.d == 3:
+            for (int iZ = 0; iZ < ${geometry.size_z}; ++iZ) {
+                const std::size_t iCell = iX*${geometry.size_y*geometry.size_z} + iY*${geometry.size_z} + iZ;
+                if (   iX == 0 || iY == 0 || iZ == 0
+                    || iX == ${geometry.size_x-1}
+                    || iY == ${geometry.size_y-1}
+                    || iZ == ${geometry.size_z-1}) {
+                    ghost.emplace_back(iCell);
+                } else if (iZ == ${geometry.size_z-2}) {
+                    lid_bc.emplace_back(iCell);
+                } else if (   iX == 1 || iX == ${geometry.size_x-2}
+                           || iY == 1 || iY == ${geometry.size_y-2}
+                           || iZ == 1) {
+                    box_bc.emplace_back(iCell);
+                } else {
+                    bulk.emplace_back(iCell);
+                }
+            }
+% endif
+        }
+    }
+
+    std::cout << "#ghost  : " << ghost.size()  << std::endl;
+    std::cout << "#bulk   : " << bulk.size()   << std::endl;
+    std::cout << "#lid    : " << lid_bc.size() << std::endl;
+    std::cout << "#wall   : " << box_bc.size() << std::endl;
+    std::cout << std::endl;
+
+    std::size_t* device_ghost_cells;
+    std::size_t* device_bulk_cells;
+    std::size_t* device_lid_bc_cells;
+    std::size_t* device_box_bc_cells;
+
+    cudaMalloc(&device_ghost_cells,  ghost.size()  * sizeof(std::size_t));
+    cudaMalloc(&device_bulk_cells,   bulk.size()   * sizeof(std::size_t));
+    cudaMalloc(&device_lid_bc_cells, lid_bc.size() * sizeof(std::size_t));
+    cudaMalloc(&device_box_bc_cells, box_bc.size() * sizeof(std::size_t));
+
+    cudaMemcpy(device_ghost_cells,  ghost.data(),  ghost.size() * sizeof(std::size_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(device_bulk_cells,   bulk.data(),   bulk.size()  * sizeof(std::size_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(device_lid_bc_cells, lid_bc.data(), lid_bc.size()* sizeof(std::size_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(device_box_bc_cells, box_bc.data(), box_bc.size()* sizeof(std::size_t), cudaMemcpyHostToDevice);
+
+    cudaDeviceSynchronize();
+
+    const std::size_t block_size = 32;
+    std::size_t block_count = 0;
+
+    block_count = (ghost.size() + block_size - 1) / block_size;
+    equilibrilize<<<block_count,block_size>>>(f, device_ghost_cells, ghost.size());
+
+    block_count = (bulk.size() + block_size - 1) / block_size;
+    equilibrilize<<<block_count,block_size>>>(f, device_bulk_cells, bulk.size());
+
+    block_count = (box_bc.size() + block_size - 1) / block_size;
+    equilibrilize<<<block_count,block_size>>>(f, device_box_bc_cells, box_bc.size());
+
+    block_count = (lid_bc.size() + block_size - 1) / block_size;
+    equilibrilize<<<block_count,block_size>>>(f, device_lid_bc_cells, lid_bc.size());
+
+    cudaDeviceSynchronize();
+
+    auto start = std::chrono::high_resolution_clock::now();
+
+    for (std::size_t iStep = 1; iStep <= nStep; ++iStep) {
+        block_count = (ghost.size() + block_size - 1) / block_size;
+        equilibrilize<<<block_count,block_size>>>(f, device_ghost_cells, ghost.size());
+
+        block_count = (bulk.size() + block_size - 1) / block_size;
+        collide_and_stream<<<block_count,block_size>>>(f, device_bulk_cells, bulk.size());
+
+        block_count = (box_bc.size() + block_size - 1) / block_size;
+% if descriptor.d == 2:
+        velocity_momenta_boundary<<<block_count,block_size>>>(f, device_box_bc_cells, box_bc.size(), 0.0, 0.0);
+% else:
+        velocity_momenta_boundary<<<block_count,block_size>>>(f, device_box_bc_cells, box_bc.size(), 0.0, 0.0, 0.0);
+% endif
+
+        block_count = (lid_bc.size() + block_size - 1) / block_size;
+% if descriptor.d == 2:
+        velocity_momenta_boundary<<<block_count,block_size>>>(f, device_lid_bc_cells, lid_bc.size(), 0.05, 0.0);
+% else:
+        velocity_momenta_boundary<<<block_count,block_size>>>(f, device_lid_bc_cells, lid_bc.size(), 0.05, 0.0, 0.0);
+% endif
+
+        cudaDeviceSynchronize();
+        update_sss_control_structure<<<1,1>>>(f);
+        cudaDeviceSynchronize();
+
+        if (iStep % 1000 == 0) {
+            auto duration = std::chrono::duration_cast<std::chrono::duration<float>>(
+                std::chrono::high_resolution_clock::now() - start);
+                std::cout << "iStep = " << iStep << "; ~" << 1000*${geometry.volume}/(1e6*duration.count()) << " MLUPS" << std::endl;
+
+            block_count = (bulk.size() + block_size - 1) / block_size;
+            collect_moments<<<block_count,block_size>>>(f, device_bulk_cells, bulk.size(), device_moments_rho, device_moments_u);
+            cudaMemcpy(moments_u.data(), device_moments_u, ${geometry.volume*descriptor.d}*sizeof(${float_type}), cudaMemcpyDeviceToHost);
+            write_moments_to_vtk("result/ldc_" + std::to_string(iStep) + ".vtk", moments_u.data());
+
+            start = std::chrono::high_resolution_clock::now();
+        }
+    }
+
+    cudaFree(device_ghost_cells);
+    cudaFree(device_bulk_cells);
+    cudaFree(device_lid_bc_cells);
+    cudaFree(device_box_bc_cells);
+    cudaFree(device_moments_rho);
+    cudaFree(device_moments_u);
+    cudaFree(f);
+    cudaFree(f_aa);
+}
+
+int main() {
+    simulate(20000);
+}
diff --git a/shell.nix b/shell.nix
index cdca488..d6d26c6 100644
--- a/shell.nix
+++ b/shell.nix
@@ -27,7 +27,7 @@ pkgs.stdenvNoCC.mkDerivation rec {
 
       src = builtins.fetchGit {
         url = "https://code.kummerlaender.eu/boltzgen/";
-        rev = "d136bb30bc8a9393372ec905aea500a0b61000e3";
+        rev = "4a2885ad3ae0396486d288df94339d0c45e6db8b";
       };
 
       propagatedBuildInputs = with pkgs.python37Packages; [
@@ -52,11 +52,14 @@ pkgs.stdenvNoCC.mkDerivation rec {
     opencl-info
     gcc9
     cmake
+    cudatoolkit
+    linuxPackages.nvidia_x11 
   ];
 
   shellHook = ''
     export NIX_SHELL_NAME="${name}"
     export PYOPENCL_COMPILER_OUTPUT=1
+    export CUDA_PATH="${pkgs.cudatoolkit}"
     export PYTHONPATH="$PWD:$PYTHONPATH"
   '';
 }
-- 
cgit v1.2.3