diff options
Add LDC example for CUDA target
-rw-r--r-- | lid_driven_cavity/cuda/CMakeLists.txt | 30 | ||||
-rw-r--r-- | lid_driven_cavity/cuda/config.py | 15 | ||||
-rwxr-xr-x | lid_driven_cavity/cuda/generate.py | 43 | ||||
-rw-r--r-- | lid_driven_cavity/cuda/ldc.cuda.SSS.mako | 221 | ||||
-rw-r--r-- | shell.nix | 5 |
5 files changed, 313 insertions, 1 deletions
diff --git a/lid_driven_cavity/cuda/CMakeLists.txt b/lid_driven_cavity/cuda/CMakeLists.txt new file mode 100644 index 0000000..7c920f2 --- /dev/null +++ b/lid_driven_cavity/cuda/CMakeLists.txt @@ -0,0 +1,30 @@ +cmake_minimum_required(VERSION 3.10) +project(ldc LANGUAGES CXX) +find_package(CUDA QUIET REQUIRED) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +add_custom_command( + OUTPUT + kernel.h + ldc.cu + COMMAND + ${CMAKE_CURRENT_SOURCE_DIR}/generate.py --output ${CMAKE_CURRENT_BINARY_DIR} + WORKING_DIRECTORY + ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS + generate.py + config.py + ldc.cuda.SSS.mako +) + +include_directories( + ${CMAKE_BINARY_DIR} +) + +cuda_add_executable( + ldc + ${CMAKE_CURRENT_BINARY_DIR}/ldc.cu +) diff --git a/lid_driven_cavity/cuda/config.py b/lid_driven_cavity/cuda/config.py new file mode 100644 index 0000000..27000e3 --- /dev/null +++ b/lid_driven_cavity/cuda/config.py @@ -0,0 +1,15 @@ +from boltzgen.lbm.lattice import * +from boltzgen import Geometry + +descriptor = D2Q9 +geometry = Geometry(512, 512) +tau = 0.52 +precision = 'single' +streaming = 'SSS' + +## 3D LDC +#descriptor = D3Q19 +#geometry = Geometry(64, 64, 64) +#tau = 0.6 +#precision = 'single' +#streaming = 'SSS' diff --git a/lid_driven_cavity/cuda/generate.py b/lid_driven_cavity/cuda/generate.py new file mode 100755 index 0000000..27ce511 --- /dev/null +++ b/lid_driven_cavity/cuda/generate.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import argparse + +from boltzgen import Generator, Geometry +from boltzgen.lbm.model import BGK + +import config + +argparser = argparse.ArgumentParser( + description = 'Generate a CUDA implementation of a lid driven cavity simulation using LBM') +argparser.add_argument( + '--output', required = False, help = 'Target directory for the generated sources') + +args = argparser.parse_args() + +generator = Generator( + model = BGK(config.descriptor, tau = config.tau), + target = 'cuda', + precision = config.precision, + streaming = config.streaming, + index = 'XYZ', + layout = 'SOA') + +if args.output is None: + args.output = '.' + +functions = ['collide_and_stream', 'equilibrilize', 'collect_moments', 'momenta_boundary'] + +if config.streaming == 'SSS': + functions += ['update_sss_control_structure'] + +extras = ['cell_list_dispatch'] + +with open('%s/kernel.h' % args.output, 'w') as kernel: + kernel.write(generator.kernel(config.geometry, functions, extras)) + +ldc_src = '' +with open('ldc.cuda.%s.mako' % config.streaming, 'r') as template: + ldc_src = template.read() + +with open('%s/ldc.cu' % args.output, 'w') as app: + app.write(generator.custom(config.geometry, ldc_src)) diff --git a/lid_driven_cavity/cuda/ldc.cuda.SSS.mako b/lid_driven_cavity/cuda/ldc.cuda.SSS.mako new file mode 100644 index 0000000..2a4aed0 --- /dev/null +++ b/lid_driven_cavity/cuda/ldc.cuda.SSS.mako @@ -0,0 +1,221 @@ +#include <cstdint> +#include <memory> +#include <vector> +#include <chrono> +#include <iostream> +#include <fstream> + +#include "kernel.h" + +void write_moments_to_vtk(const std::string& path, ${float_type}* u) { + std::ofstream fout; + fout.open(path.c_str()); + + fout << "# vtk DataFile Version 3.0\n"; + fout << "lbm_output\n"; + fout << "ASCII\n"; + fout << "DATASET RECTILINEAR_GRID\n"; +% if descriptor.d == 2: + fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " 1" << "\n"; +% else: + fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " " << ${geometry.size_z-2} << "\n"; +% endif + + fout << "X_COORDINATES " << ${geometry.size_x-2} << " float\n"; + for( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) { + fout << x << " "; + } + + fout << "\nY_COORDINATES " << ${geometry.size_y-2} << " float\n"; + for( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) { + fout << y << " "; + } + +% if descriptor.d == 2: + fout << "\nZ_COORDINATES " << 1 << " float\n"; + fout << 0 << "\n"; + fout << "POINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2)} << "\n"; +% else: + fout << "\nZ_COORDINATES " << ${geometry.size_z-2} << " float\n"; + for( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) { + fout << z << " "; + } + fout << "\nPOINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2) * (geometry.size_z-2)} << "\n"; +% endif + + fout << "VECTORS velocity float\n"; +% if descriptor.d == 2: + for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) { + for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) { + const std::size_t gid = x*${geometry.size_y}+y; + fout << u[gid*${descriptor.d}+0] << " " << u[gid*${descriptor.d}+1] << " 0\n"; + } + } +% else: + for ( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) { + for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) { + for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) { + const std::size_t gid = x*${geometry.size_y*geometry.size_z}+y*${geometry.size_z}+z; + fout << u[gid*${descriptor.d}+0] << " " << u[gid*${descriptor.d}+1] << " " << u[gid*${descriptor.d}+2] << "\n"; + } + } + } +% endif + + fout.close(); +} + +void simulate(std::size_t nStep) +{ +<% + padding = (max(geometry.size_x,geometry.size_y,geometry.size_z)+1)**(descriptor.d-1) +%> + ${float_type}* f_aa; + cudaMalloc(&f_aa, ${(geometry.volume+2*padding)*descriptor.q}*sizeof(${float_type})); + + ${float_type}** f; + cudaMalloc(&f, ${descriptor.q}*sizeof(${float_type}*)); + + ${float_type}* device_moments_rho; + cudaMalloc(&device_moments_rho, ${geometry.volume} * sizeof(${float_type})); + ${float_type}* device_moments_u; + cudaMalloc(&device_moments_u, ${geometry.volume*descriptor.d} * sizeof(${float_type})); + std::vector<${float_type}> moments_u(${geometry.volume*descriptor.d}); + + init_sss_control_structure<<<1,1>>>(f_aa, f); + cudaDeviceSynchronize(); + + std::vector<std::size_t> ghost; + std::vector<std::size_t> bulk; + std::vector<std::size_t> lid_bc; + std::vector<std::size_t> box_bc; + + for (int iX = 0; iX < ${geometry.size_x}; ++iX) { + for (int iY = 0; iY < ${geometry.size_y}; ++iY) { +% if descriptor.d == 2: + const std::size_t iCell = iX*${geometry.size_y} + iY; + if (iX == 0 || iY == 0 || iX == ${geometry.size_x-1} || iY == ${geometry.size_y-1}) { + ghost.emplace_back(iCell); + } else if (iY == ${geometry.size_y-2}) { + lid_bc.emplace_back(iCell); + } else if (iX == 1 || iX == ${geometry.size_x-2} || iY == 1) { + box_bc.emplace_back(iCell); + } else { + bulk.emplace_back(iCell); + } +% elif descriptor.d == 3: + for (int iZ = 0; iZ < ${geometry.size_z}; ++iZ) { + const std::size_t iCell = iX*${geometry.size_y*geometry.size_z} + iY*${geometry.size_z} + iZ; + if ( iX == 0 || iY == 0 || iZ == 0 + || iX == ${geometry.size_x-1} + || iY == ${geometry.size_y-1} + || iZ == ${geometry.size_z-1}) { + ghost.emplace_back(iCell); + } else if (iZ == ${geometry.size_z-2}) { + lid_bc.emplace_back(iCell); + } else if ( iX == 1 || iX == ${geometry.size_x-2} + || iY == 1 || iY == ${geometry.size_y-2} + || iZ == 1) { + box_bc.emplace_back(iCell); + } else { + bulk.emplace_back(iCell); + } + } +% endif + } + } + + std::cout << "#ghost : " << ghost.size() << std::endl; + std::cout << "#bulk : " << bulk.size() << std::endl; + std::cout << "#lid : " << lid_bc.size() << std::endl; + std::cout << "#wall : " << box_bc.size() << std::endl; + std::cout << std::endl; + + std::size_t* device_ghost_cells; + std::size_t* device_bulk_cells; + std::size_t* device_lid_bc_cells; + std::size_t* device_box_bc_cells; + + cudaMalloc(&device_ghost_cells, ghost.size() * sizeof(std::size_t)); + cudaMalloc(&device_bulk_cells, bulk.size() * sizeof(std::size_t)); + cudaMalloc(&device_lid_bc_cells, lid_bc.size() * sizeof(std::size_t)); + cudaMalloc(&device_box_bc_cells, box_bc.size() * sizeof(std::size_t)); + + cudaMemcpy(device_ghost_cells, ghost.data(), ghost.size() * sizeof(std::size_t), cudaMemcpyHostToDevice); + cudaMemcpy(device_bulk_cells, bulk.data(), bulk.size() * sizeof(std::size_t), cudaMemcpyHostToDevice); + cudaMemcpy(device_lid_bc_cells, lid_bc.data(), lid_bc.size()* sizeof(std::size_t), cudaMemcpyHostToDevice); + cudaMemcpy(device_box_bc_cells, box_bc.data(), box_bc.size()* sizeof(std::size_t), cudaMemcpyHostToDevice); + + cudaDeviceSynchronize(); + + const std::size_t block_size = 32; + std::size_t block_count = 0; + + block_count = (ghost.size() + block_size - 1) / block_size; + equilibrilize<<<block_count,block_size>>>(f, device_ghost_cells, ghost.size()); + + block_count = (bulk.size() + block_size - 1) / block_size; + equilibrilize<<<block_count,block_size>>>(f, device_bulk_cells, bulk.size()); + + block_count = (box_bc.size() + block_size - 1) / block_size; + equilibrilize<<<block_count,block_size>>>(f, device_box_bc_cells, box_bc.size()); + + block_count = (lid_bc.size() + block_size - 1) / block_size; + equilibrilize<<<block_count,block_size>>>(f, device_lid_bc_cells, lid_bc.size()); + + cudaDeviceSynchronize(); + + auto start = std::chrono::high_resolution_clock::now(); + + for (std::size_t iStep = 1; iStep <= nStep; ++iStep) { + block_count = (ghost.size() + block_size - 1) / block_size; + equilibrilize<<<block_count,block_size>>>(f, device_ghost_cells, ghost.size()); + + block_count = (bulk.size() + block_size - 1) / block_size; + collide_and_stream<<<block_count,block_size>>>(f, device_bulk_cells, bulk.size()); + + block_count = (box_bc.size() + block_size - 1) / block_size; +% if descriptor.d == 2: + velocity_momenta_boundary<<<block_count,block_size>>>(f, device_box_bc_cells, box_bc.size(), 0.0, 0.0); +% else: + velocity_momenta_boundary<<<block_count,block_size>>>(f, device_box_bc_cells, box_bc.size(), 0.0, 0.0, 0.0); +% endif + + block_count = (lid_bc.size() + block_size - 1) / block_size; +% if descriptor.d == 2: + velocity_momenta_boundary<<<block_count,block_size>>>(f, device_lid_bc_cells, lid_bc.size(), 0.05, 0.0); +% else: + velocity_momenta_boundary<<<block_count,block_size>>>(f, device_lid_bc_cells, lid_bc.size(), 0.05, 0.0, 0.0); +% endif + + cudaDeviceSynchronize(); + update_sss_control_structure<<<1,1>>>(f); + cudaDeviceSynchronize(); + + if (iStep % 1000 == 0) { + auto duration = std::chrono::duration_cast<std::chrono::duration<float>>( + std::chrono::high_resolution_clock::now() - start); + std::cout << "iStep = " << iStep << "; ~" << 1000*${geometry.volume}/(1e6*duration.count()) << " MLUPS" << std::endl; + + block_count = (bulk.size() + block_size - 1) / block_size; + collect_moments<<<block_count,block_size>>>(f, device_bulk_cells, bulk.size(), device_moments_rho, device_moments_u); + cudaMemcpy(moments_u.data(), device_moments_u, ${geometry.volume*descriptor.d}*sizeof(${float_type}), cudaMemcpyDeviceToHost); + write_moments_to_vtk("result/ldc_" + std::to_string(iStep) + ".vtk", moments_u.data()); + + start = std::chrono::high_resolution_clock::now(); + } + } + + cudaFree(device_ghost_cells); + cudaFree(device_bulk_cells); + cudaFree(device_lid_bc_cells); + cudaFree(device_box_bc_cells); + cudaFree(device_moments_rho); + cudaFree(device_moments_u); + cudaFree(f); + cudaFree(f_aa); +} + +int main() { + simulate(20000); +} @@ -27,7 +27,7 @@ pkgs.stdenvNoCC.mkDerivation rec { src = builtins.fetchGit { url = "https://code.kummerlaender.eu/boltzgen/"; - rev = "d136bb30bc8a9393372ec905aea500a0b61000e3"; + rev = "4a2885ad3ae0396486d288df94339d0c45e6db8b"; }; propagatedBuildInputs = with pkgs.python37Packages; [ @@ -52,11 +52,14 @@ pkgs.stdenvNoCC.mkDerivation rec { opencl-info gcc9 cmake + cudatoolkit + linuxPackages.nvidia_x11 ]; shellHook = '' export NIX_SHELL_NAME="${name}" export PYOPENCL_COMPILER_OUTPUT=1 + export CUDA_PATH="${pkgs.cudatoolkit}" export PYTHONPATH="$PWD:$PYTHONPATH" ''; } |