diff options
71 files changed, 10453 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..abe1087 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 3.10) +project(literatelb LANGUAGES CXX CUDA) + +find_package(CUDAToolkit REQUIRED) +find_package(SFML 2.5 COMPONENTS graphics window system REQUIRED) +find_package(ImGui-SFML REQUIRED) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream per-thread --extended-lambda") +set(CMAKE_CUDA_ARCHITECTURES 50) + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/tangle + ${CMAKE_CURRENT_BINARY_DIR}) + +include(assets.cmake) +serialize_assets(${CMAKE_CURRENT_SOURCE_DIR}/tangle/asset + ${CMAKE_CURRENT_BINARY_DIR}/assets.cc + ${CMAKE_CURRENT_BINARY_DIR}/assets.h) +add_library(assets ${CMAKE_CURRENT_BINARY_DIR}/assets.cc) + +link_libraries( + sfml-graphics + sfml-window + sfml-system + ImGui-SFML::ImGui-SFML + CUDA::cuda_driver + assets) + +file(GLOB EXAMPLES ${CMAKE_CURRENT_SOURCE_DIR}/tangle/*.cu) +foreach(examplefile ${EXAMPLES}) + get_filename_component(examplename ${examplefile} NAME_WE) + add_executable(${examplename} ${examplefile}) +endforeach() diff --git a/assets.cmake b/assets.cmake new file mode 100644 index 0000000..11868f8 --- /dev/null +++ b/assets.cmake @@ -0,0 +1,42 @@ +function(serialize_folder name dir data header) + file(APPEND ${header} "namespace ${name} {\n") + file(APPEND ${data} "namespace ${name} {\n") + file(GLOB files ${dir}/*) + list(LENGTH files file_count) + foreach(file ${files}) + string(REGEX MATCH "([^/]+)$" filename ${file}) + string(REGEX REPLACE "\\.| |-" "_" filename ${filename}) + file(READ ${file} filedata HEX) + string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata}) + file(APPEND ${data} "const unsigned char file_${filename}[] = {${filedata}};\n") + file(APPEND ${data} "const unsigned file_${filename}_size = sizeof(file_${filename});\n") + file(APPEND ${header} "extern const unsigned char file_${filename}[];\n") + file(APPEND ${header} "extern const unsigned file_${filename}_size;\n") + endforeach() + file(APPEND ${data} "}\n") + file(APPEND ${header} "const File files[] {\n") + foreach(file ${files}) + string(REGEX MATCH "([^/]+)$" filename ${file}) + string(REGEX REPLACE "\\.| |-" "_" flat_filename ${filename}) + file(APPEND ${header} "File{\"${filename}\", file_${flat_filename}, file_${flat_filename}_size},\n") + endforeach() + file(APPEND ${header} "};\n") + file(APPEND ${header} "const unsigned file_count = ${file_count};\n") + file(APPEND ${header} "}\n") +endfunction() + +function(serialize_assets dir data header) + file(WRITE ${data} "") + file(APPEND ${data} "#include \"${header}\"\n") + file(APPEND ${data} "namespace assets {\n") + file(WRITE ${header} "") + file(APPEND ${header} "#pragma once\n") + file(APPEND ${header} "#include <string>\n") + file(APPEND ${header} "namespace assets {\n") + file(APPEND ${header} "struct File { const std::string name; const unsigned char* data; const unsigned size; };\n") + serialize_folder("palette" ${dir}/palette ${data} ${header}) + serialize_folder("noise" ${dir}/noise ${data} ${header}) + serialize_folder("shader" ${dir}/shader ${data} ${header}) + file(APPEND ${data} "}\n") + file(APPEND ${header} "}\n") +endfunction() diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..6d19d77 --- /dev/null +++ b/default.nix @@ -0,0 +1,152 @@ +{ pkgs ? import <nixpkgs> { + overlays = [ + (import (builtins.fetchTarball { + url = https://github.com/nix-community/emacs-overlay/archive/0bb3c36bb8cddd92b788d8ce474c39475148d5e2.tar.gz; + })) + ]; +}, ... }: + +let + cuda-samples-common-headers = pkgs.stdenv.mkDerivation rec { + name = "cuda-samples-common-headers"; + version = "11.1"; + + src = pkgs.fetchFromGitHub { + owner = "NVIDIA"; + repo = "cuda-samples"; + rev = "v${version}"; + sha256 = "1kjixk50i8y1bkiwbdn5lkv342crvkmbvy1xl5j3lsa1ica21kwh"; + }; + + phases = [ "installPhase" ]; + + installPhase = '' + mkdir -p $out/include/cuda-samples/Common + cp -r $src/Common/* $out/include/cuda-samples/Common + ''; + }; + + imgui-sfml = pkgs.stdenv.mkDerivation rec { + name = "imgui-sfml"; + version = "2.1"; + + src = pkgs.fetchFromGitHub { + owner = "eliasdaler"; + repo = "imgui-sfml"; + rev = "v${version}"; + sha256 = "1g8gqly156miv12ajapnhmxfcv9i3fqhdmdy45gmdw47kh8ly5zj"; + }; + + buildInputs = with pkgs; [ + cmake + ]; + + propagatedBuildInputs = with pkgs; [ + libGL + sfml + xorg.libX11 + ]; + + cmakeFlags = let + imgui_src = pkgs.fetchFromGitHub { + owner = "ocornut"; + repo = "imgui"; + rev = "v1.68"; + sha256 = "0a7b4fljybvpls84rqzsb2p4r89ic2g6w2m9h0209xlhm4k0x7qr"; + }; + in [ + "-DIMGUI_DIR=${imgui_src}" + "-DBUILD_SHARED_LIBS=ON" + "-DIMGUI_SFML_BUILD_EXAMPLES=ON" + ]; + }; + +in pkgs.stdenv.mkDerivation rec { + name = "LiterateLB"; + + src = ./.; + + nativeBuildInputs = with pkgs; [ + cmake + addOpenGLRunpath + ]; + + buildInputs = let + local-python = pkgs.python3.withPackages (python-packages: with python-packages; [ + sympy + numpy + Mako + matplotlib + scipy + ]); + + in with pkgs; [ + local-python + cudatoolkit_11 + cuda-samples-common-headers + linuxPackages.nvidia_x11 + libGL + sfml + imgui-sfml + ]; + + phases = [ "buildPhase" "installPhase" ]; + + buildPhase = let + tangle-el = pkgs.writeTextFile { + name = "tangle.el"; + text = '' + (toggle-debug-on-error) + (require 'org) + (org-babel-do-load-languages 'org-babel-load-languages '((python . t))) + (setq org-confirm-babel-evaluate nil) + (setq org-babel-confirm-evaluate-answer-no t) + (find-file "lbm.org") + (setq org-babel-default-header-args + (cons '(:result-params . ("none")) + (assq-delete-all :result-params org-babel-default-header-args))) + (org-babel-execute-buffer) + (setq org-babel-default-header-args + (cons '(:result-params . ("output")) + (assq-delete-all :result-params org-babel-default-header-args))) + (org-babel-tangle) + ''; + }; + + in '' + cp $src/lbm.org . + mkdir -p tangle/asset + cp -r $src/tangle/asset/noise tangle/asset + cp -r $src/tangle/asset/palette tangle/asset + ${pkgs.emacsUnstable-nox}/bin/emacs --no-init-file --script ${tangle-el} + cp $src/CMakeLists.txt . + cp $src/assets.cmake . + mkdir build + pushd build + cmake .. + make + popd + ''; + + installPhase = '' + mkdir -p $out/bin + for program in tangle/*.cu; do + cp build/$(basename $program .cu) $out/bin/$(basename $program .cu) + addOpenGLRunpath $out/bin/$(basename $program .cu) + done + + mkdir $out/include + cp -r tangle/LLBM $out/include/ + ''; + + env = pkgs.buildEnv { + name = name; + paths = buildInputs; + }; + + shellHook = '' + export NIX_SHELL_NAME="${name}" + export CUDA_PATH="${pkgs.cudatoolkit_11}" + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/run/opengl-driver/lib + ''; +} @@ -0,0 +1,5817 @@ +#+TITLE: A literate Lattice Boltzmann Code +#+SUBTITLE: [[https://kummerlaender.eu][Adrian Kummerländer]] +#+STARTUP: latexpreview +#+HTML_DOCTYPE: html5 +#+OPTIONS: toc:nil html-postamble:nil html5-fancy:t html-style:nil +#+PROPERTY: header-args :exports both :mkdirp yes :noweb no-export :eval no-export +#+PROPERTY: header-args:python+ :var lattice="D2Q9" +#+PROPERTY: header-args:cpp+ :main no :eval no +#+HTML_MATHJAX: path:"https://static.kummerlaender.eu/mathjax/MathJax.js?config=TeX-AMS_HTML" + +#+HTML_HEAD: <link rel="stylesheet" type="text/css" href="org.css"/> + +#+BEGIN_ABSTRACT +This file describes a full Lattice Boltzmann code featuring both 2D and 3D lattices, a workable selection of boundary conditions, Smagorinsky +turbulence modelling, expression-level code optimization and even a full ray marcher for just-in-time volumetric visualization in addition +to a set of interesting examples. All of this runs on GPUs using CUDA near the maximum possible performance on that platform. + +*This document is a [[*Open tasks][work in progress]].* +#+BEGIN_EXPORT html +<video style="width:100%" src="https://literatelb.org/talk/fire.webm" onloadeddata="this.play();" playsinline loop muted/> +#+END_EXPORT +#+END_ABSTRACT +#+TOC: headlines 2 + +* Preamble :noexport: +#+BEGIN_SRC python :session :results none +from sympy import * +from sympy.codegen.ast import CodeBlock, Assignment +from mako.template import Template +import itertools + +def printlatexpr(*exprs): + print("$$\\begin{align*}") + for expr in exprs: + print(f"{latex(expr.lhs)} &:= {latex(expr.rhs)} \\\\") + + print("\\end{align*}$$") + +def multirange(*ranges): + return itertools.product(*[ range(r) for r in ranges ]) + +def multisum(f, *ranges): + return sum([ f(*idx) for idx in multirange(*ranges) ]) + +descriptor = { } +#+END_SRC + +#+NAME: eval-using-descriptor +#+BEGIN_SRC python :session :results output :var src="" +if lattice in descriptor: + D = descriptor[lattice] + print(Template(src).render(**locals())) +#+END_SRC + +#+RESULTS: eval-using-descriptor + +* Introduction +Approaches to modelling fluid dynamics can be coarsely grouped into three categories: Microscopic, Mesoscopic and Macroscopic models. +Microscopic models describe the behavior of the individual /particles/ of whose the macroscopic fluid movement is an emergent property. +On the other side of the spectrum, macroscopic models consider only the large scale properties of a fluid such as its velocity and density +fields. One could say that microscopic particle models are closest to what actually happens in nature and macroscopic models in the form +of the /Navier Stokes equations (NSE)/ represent an ideal vision of a fluid that only holds for /large enough/ space scales. +As is the case for most things, what we call a /fluid/ is by itself already an abstraction. + +Mesoscopic models sit between these two extremes and consider neither individual particles nor purely macroscopic properties but rather +the probability of some amount of particles moving with some velocity into a certain direction at various points in time and space. The +mathematical field of /kinetic theory/ provides a foundation for linking all of these three models. + +** Kinetic theory +One can use kinetic theory to get from a microscropic or /molecular/ system governed by e.g. Newton's laws of motion to macroscopic +descriptions of certain properties of such a system. For example one can reach the diffusion equation for /billiard systems/ of particles +colliding with solid obstacles or to the Navier Stokes equations for systems of pairwise colliding particles. + +In very rough terms one starts out by considering the microscopic system and taking its limit for infinitely many particles of +vanishingly small size. This yields a /kinetic equation/ that describes the behaviour of the particle distribution function +under some collision operator for any point in space and time. When we consider pairwise colliding particles the result +turns out to be the well known /Boltzmann transport equation/: + +$$(\partial_t + v \cdot \nabla_x) f(t,x,v) = \Omega (f).$$ + +This PDE describes the behavior of the probability $f(t,x,v)$ that particles move into direction $v$ at location $x$ and time $t$ for +some collision operator $\Omega$. Discretizing this equation on a lattice for a finite set of discrete velocities is called the /Lattice Boltzmann Method (LBM)/. + +The practical usefulness of LBM hinges on the ability of reaching the desired /target equations/ +such as NSE or the heat equation as a limit of the kinetic equation. + +** Lattice Boltzmann Method +We discretize the Boltzmann equation in spatial, velocity and temporal space on a cartesian lattice using discrete velocities +$\xi_i$ that point to the neighbors of each cell. + +$$(\partial_t + v \cdot \nabla_x)f = \Omega(f) \approx f_i(x + \xi_i, t + 1) - f_i(x,t) = \Omega_i(x,t)$$ + +A common collision operator $\Omega$ for Navier-Stokes approximation is given by Bhatnagar, Gross and Kroog (BGK). + +$$\Omega(f) := -\frac{f - f^\text{eq}}{\tau}$$ + +This BGK operator /relaxes/ the local population $f$ towards some equilibrium distribution $f^\text{eq}$ with rate $\tau$. Inserting this operator +into the discrete Boltzmann equation yields the discrete LBM BGK equation. + +$$f_i(x + \xi_i, t + 1) = f_i(x,t) - \frac{1}{\tau} (f_i(x,t) - f_i^\text{eq}(x,t))$$ + +This explicit form exposes the separation of the LBM algorithm into a local collision step followed by a streaming step. +The collision step relaxes the population of each cell towards its local equilibrium and the streaming step propagates the resulting +updated population values to the respective neighboring cells. + +Macroscopic moments such as density and velocity can be calculated directly from the distribution functions. + +$$\rho := \sum_{i=0}^{q-1} f_i(x,t) \text{ and } u := \frac{1}{\rho} \sum_{i=0}^{q-1} \xi_i f_i(x,t)$$ + +The equilibrium distribution $f_i^\text{eq}$ for these macroscopic properties is given by + +$$f_i^\text{eq}(x,t) := \omega_i \rho \left( 1 + \frac{u \cdot \xi_i}{c_s^2} + \frac{(u \cdot \xi_i)^2}{2c_s^4} - \frac{u \cdot u}{2c_s^2} \right)$$ + +using lattice-dependent weights $\omega_i$ and speed of sound $c_s$. Note that this is just one possible discretization of the Boltzmann +equation -- nothing stops us from using different sets of velocities, relaxation times, collision operators and so on. +Changing these things is how different physics are modeled in LBM. e.g. what we will do in the section on +[[*Smagorinsky BGK collision][Smagorinsky turbulence modelling]] is to locally change the relaxation time. + +To summarize what we are going to do for the simplest bulk fluid case: First calculate the current density and velocity +moments of a cell, then compute the matching equilibrium distributions and finally perform the local BGK collision to +update the cell populations. +The last step before we start again is to propagate the post-collision values to the corresponding neighbor cells. + +Special care has to be taken for the boundary conditions at the lattice frontier and around any obstacle geometries. +Such boundary conditions are one of the major topics of LBM research with a very rich toolbox of specialized collision +steps for modeling e.g. inflow, outflow, solid or moving walls of various kinds. + +* Lattice +The cartesian grids used for the spatial discretization are commonly described as =DXQY= lattices where =X= is the number of spatial dimensions +and =Y= is the number of discrete velocities $\xi_i$. e.g. a =D2Q9= lattice is two dimensional and stores nine population values per cell. Each population has +an associated weigth $\omega_i$ that in a sense controls its impact on the collision step. Additionally we also require the lattice speed of sound $c_s$ which is +the speed of information propagation within the lattice. + +#+BEGIN_SRC python :session :results none +from fractions import Fraction + +class Descriptor: + def __init__(self, name, data): + self.name = name + self.c = [ Matrix(eval(row[0])) for row in data ] + self.w = [ Rational(f.numerator, f.denominator) + for f in [ Fraction(row[1]) for row in data ] ] + + self.d = self.c[0].shape[0] + self.q = len(self.c) + self.c_s = sqrt(Rational(1,3)) +#+END_SRC + +All of these constants have to be accessible for symbolic computations which is why we store them within a so called descriptor class. +For convenience we write out the directions and weights as plain tables that are then read into the Python structure. + +#+NAME: load-descriptor +#+BEGIN_SRC python :session :results output :var data=D2Q9 +descriptor[lattice] = Descriptor(lattice, data) +print(f"Loaded D{descriptor[lattice].d}Q{descriptor[lattice].q} lattice with a weight sum of {sum(descriptor[lattice].w)}.") +#+END_SRC + +#+RESULTS: load-descriptor +: Loaded D2Q9 lattice with a weight sum of 1. + +#+CALL: load-descriptor(lattice="D3Q19", data=D3Q19) + +#+RESULTS: +: Loaded D3Q19 lattice with a weight sum of 1. + +** D2Q9 +#+NAME: D2Q9 +| Direction | Weight | +|-----------+--------| +| (-1,-1) | 1/36 | +| (-1, 0) | 1/9 | +| (-1, 1) | 1/36 | +| ( 0,-1) | 1/9 | +| ( 0, 0) | 4/9 | +| ( 0, 1) | 1/9 | +| ( 1,-1) | 1/36 | +| ( 1, 0) | 1/9 | +| ( 1, 1) | 1/36 | + +** D3Q19 +#+NAME: D3Q19 +| Direction | Weight | +|------------+--------| +| ( 0, 1, 1) | 1/36 | +| (-1, 0, 1) | 1/36 | +| ( 0, 0, 1) | 1/18 | +| ( 1, 0, 1) | 1/36 | +| ( 0,-1, 1) | 1/36 | +| (-1, 1, 0) | 1/36 | +| ( 0, 1, 0) | 1/18 | +| ( 1, 1, 0) | 1/36 | +| (-1, 0, 0) | 1/18 | +| ( 0, 0, 0) | 1/3 | +| ( 1, 0, 0) | 1/18 | +| (-1,-1, 0) | 1/36 | +| ( 0,-1, 0) | 1/18 | +| ( 1,-1, 0) | 1/36 | +| ( 0, 1,-1) | 1/36 | +| (-1, 0,-1) | 1/36 | +| ( 0, 0,-1) | 1/18 | +| ( 1, 0,-1) | 1/36 | +| ( 0,-1,-1) | 1/36 | + +* Collision steps +While the streaming step of the LBM algorithm only propagates the populations between cells in an unform fashion, +the collision step determines the actual values those populations take. This means that the physical behaviour +modelled by a given LBM algorithm is determined primarily by the collision step. + +In this section we are going to generate the code for bulk collisions. i.e. the collisions that model fluid and other transport +phenomena apart from domain boundaries. Those will be handled at a later point by special purpose collision steps called +/boundary conditions/. + +** Code printing +Before we can get started on constructing expression trees and generating code from them we need to +setup some basics so SymPy actually generates something we can compile in our environment. + +In order to get more fine grained control we need to overload an approproate SymPy C code printer class. +This allows us to e.g. easily print indexed expressions that access the population array in the correct way +or to explicitly type float constants according to a compile-time template type. + +#+BEGIN_SRC python :session :results none +from sympy.printing.ccode import C11CodePrinter + +class CodeBlockPrinter(C11CodePrinter): + def __init__(self, custom_assignment, custom_functions): + super(CodeBlockPrinter, self).__init__() + self._default_settings['contract'] = False + self.custom_assignment = custom_assignment + for f in custom_functions: + self._kf[f] = f + + def _print_Indexed(self, expr): + assert len(expr.indices) == 1 + if expr.base.name[0] == 'f': + return f"{expr.base.name}[{expr.indices[0]}]" + else: + return f"{expr.base.name}_{expr.indices[0]}" + + def _print_Float(self, flt): + return "T{%s}" % str(flt.evalf()) + + def _print_Pow(self, expr): + if expr.exp == -1: + return "T{1} / (%s)" % self.doprint(expr.base) + else: + return super()._print_Pow(expr) + + def _print_Assignment(self, expr): + if self.custom_assignment and expr.lhs.is_Indexed and expr.lhs.base.name[0] == 'f': + return f"{self.doprint(expr.lhs)} = {self.doprint(expr.rhs.evalf())};" + else: + return f"T {self.doprint(expr.lhs)} = {self.doprint(expr.rhs.evalf())};" +#+END_SRC + +For convenience the instantiation of this class is hidden in a =printcode= function that we can use everywhere. + +#+BEGIN_SRC python :session :results none +def printcode(expr, custom_assignment=True, custom_functions=[]): + print(CodeBlockPrinter(custom_assignment, custom_functions).doprint(expr)) +#+END_SRC + +The additional assignment parameter allow us to control whether the targets of assignment expressions should be +instantiated as a new scalar variable in addition while the functions parameter allows us to make custom runtime +functions known to SymPy. If we don't do this for a function =f= that our assignment expression contains a reference +to, the printer will generate unwanted comments to reflect this. + +*** Custom expression transformations +We do not want to use the =pow= function for squares in the generated code. This can be achieved by providing +a custom =ReplaceOptim= structure during the CSE optimization step that conditionally resolves =Pow= expressions. + +#+BEGIN_SRC python :session :results none +from sympy.codegen.rewriting import ReplaceOptim + +expand_pos_square = ReplaceOptim( + lambda e: e.is_Pow and e.exp.is_integer and e.exp == 2, + lambda p: UnevaluatedExpr(Mul(p.base, p.base, evaluate = False)) +) + +custom_opti = cse_main.basic_optimizations + [ + (expand_pos_square, expand_pos_square) +] +#+END_SRC + +#+BEGIN_SRC python :session :results none +def cse(block, symbol_prefix = 'x'): + return block.cse(symbols = numbered_symbols(prefix=symbol_prefix), optimizations = custom_opti, order = 'none') +#+END_SRC + +** Moments +#+BEGIN_SRC python :session :results none +i, j = symbols('i, j') +d, q = symbols('d, q') +xi = IndexedBase('xi') +#+END_SRC + +To start we define placeholders for the spatial and discrete velocity dimensions as well as the velocity set $\xi$ of +some lattice. As the moments are constructed using the lattice populations $f$ we also require placeholders +for those in addition to the moments $\rho$ and $u$ themselves. + +#+BEGIN_SRC python :session :results none +f = IndexedBase('f') +rho = Symbol('rho') +u = IndexedBase('u |