From 682c46c7932f6dfe8d3874bd54e412162a81d216 Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Fri, 17 Jan 2020 21:07:49 +0100 Subject: Integrate AA in CUDA example --- lid_driven_cavity/cuda/CMakeLists.txt | 1 + lid_driven_cavity/cuda/ldc.cuda.AA.mako | 249 ++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+) create mode 100644 lid_driven_cavity/cuda/ldc.cuda.AA.mako (limited to 'lid_driven_cavity') diff --git a/lid_driven_cavity/cuda/CMakeLists.txt b/lid_driven_cavity/cuda/CMakeLists.txt index 7c920f2..fba2047 100644 --- a/lid_driven_cavity/cuda/CMakeLists.txt +++ b/lid_driven_cavity/cuda/CMakeLists.txt @@ -17,6 +17,7 @@ add_custom_command( DEPENDS generate.py config.py + ldc.cuda.AA.mako ldc.cuda.SSS.mako ) diff --git a/lid_driven_cavity/cuda/ldc.cuda.AA.mako b/lid_driven_cavity/cuda/ldc.cuda.AA.mako new file mode 100644 index 0000000..390d546 --- /dev/null +++ b/lid_driven_cavity/cuda/ldc.cuda.AA.mako @@ -0,0 +1,249 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "kernel.h" + +void write_moments_to_vtk(const std::string& path, ${float_type}* rho, ${float_type}* u) { + std::ofstream fout; + fout.open(path.c_str()); + + fout << "# vtk DataFile Version 3.0\n"; + fout << "lbm_output\n"; + fout << "ASCII\n"; + fout << "DATASET RECTILINEAR_GRID\n"; +% if descriptor.d == 2: + fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " 1" << "\n"; +% else: + fout << "DIMENSIONS " << ${geometry.size_x-2} << " " << ${geometry.size_y-2} << " " << ${geometry.size_z-2} << "\n"; +% endif + + fout << "X_COORDINATES " << ${geometry.size_x-2} << " float\n"; + for( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) { + fout << x << " "; + } + + fout << "\nY_COORDINATES " << ${geometry.size_y-2} << " float\n"; + for( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) { + fout << y << " "; + } + +% if descriptor.d == 2: + fout << "\nZ_COORDINATES " << 1 << " float\n"; + fout << 0 << "\n"; + fout << "POINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2)} << "\n"; +% else: + fout << "\nZ_COORDINATES " << ${geometry.size_z-2} << " float\n"; + for( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) { + fout << z << " "; + } + fout << "\nPOINT_DATA " << ${(geometry.size_x-2) * (geometry.size_y-2) * (geometry.size_z-2)} << "\n"; +% endif + + fout << "VECTORS velocity float\n"; +% if descriptor.d == 2: + for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) { + for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) { + const std::size_t iCell = ${index.gid('x','y')}; + fout << u[iCell*${descriptor.d}+0] << " " << u[iCell*${descriptor.d}+1] << " 0\n"; + } + } +% else: + for ( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) { + for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) { + for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) { + const std::size_t iCell = ${index.gid('x','y','z')}; + fout << u[iCell*${descriptor.d}+0] << " " << u[iCell*${descriptor.d}+1] << " " << u[iCell*${descriptor.d}+2] << "\n"; + } + } + } +% endif + + fout << "SCALARS density float 1\n"; + fout << "LOOKUP_TABLE default\n"; +% if descriptor.d == 2: + for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) { + for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) { + const std::size_t iCell = ${index.gid('x','y')}; + fout << rho[iCell] << "\n"; + } + } +% else: + for ( std::size_t z = 1; z < ${geometry.size_z-1}; ++z ) { + for ( std::size_t y = 1; y < ${geometry.size_y-1}; ++y ) { + for ( std::size_t x = 1; x < ${geometry.size_x-1}; ++x ) { + const std::size_t iCell = ${index.gid('x','y','z')}; + fout << rho[iCell] << "\n"; + } + } + } +% endif + + fout.close(); +} + +void simulate(std::size_t nStep) +{ + ${float_type}* f; + cudaMalloc(&f, ${geometry.volume*descriptor.q}*sizeof(${float_type})); + + ${float_type}* device_moments_rho; + cudaMalloc(&device_moments_rho, ${geometry.volume} * sizeof(${float_type})); + ${float_type}* device_moments_u; + cudaMalloc(&device_moments_u, ${geometry.volume*descriptor.d} * sizeof(${float_type})); + + std::vector<${float_type}> moments_rho(${geometry.volume}); + std::vector<${float_type}> moments_u(${geometry.volume*descriptor.d}); + + std::vector ghost; + std::vector bulk; + std::vector lid_bc; + std::vector box_bc; + + for (int iX = 0; iX < ${geometry.size_x}; ++iX) { + for (int iY = 0; iY < ${geometry.size_y}; ++iY) { +% if descriptor.d == 2: + const std::size_t iCell = ${index.gid('iX','iY')}; + if (iX == 0 || iY == 0 || iX == ${geometry.size_x-1} || iY == ${geometry.size_y-1}) { + ghost.emplace_back(iCell); + } else if (iY == ${geometry.size_y-2}) { + lid_bc.emplace_back(iCell); + } else if (iX == 1 || iX == ${geometry.size_x-2} || iY == 1) { + box_bc.emplace_back(iCell); + } else { + bulk.emplace_back(iCell); + } +% elif descriptor.d == 3: + for (int iZ = 0; iZ < ${geometry.size_z}; ++iZ) { + const std::size_t iCell = ${index.gid('iX','iY','iZ')}; + if ( iX == 0 || iY == 0 || iZ == 0 + || iX == ${geometry.size_x-1} + || iY == ${geometry.size_y-1} + || iZ == ${geometry.size_z-1}) { + ghost.emplace_back(iCell); + } else if (iZ == ${geometry.size_z-2}) { + lid_bc.emplace_back(iCell); + } else if ( iX == 1 || iX == ${geometry.size_x-2} + || iY == 1 || iY == ${geometry.size_y-2} + || iZ == 1) { + box_bc.emplace_back(iCell); + } else { + bulk.emplace_back(iCell); + } + } +% endif + } + } + + std::sort(ghost.begin(), ghost.end()); + std::sort(bulk.begin(), bulk.end()); + std::sort(lid_bc.begin(), lid_bc.end()); + std::sort(box_bc.begin(), box_bc.end()); + + std::cout << "#ghost : " << ghost.size() << std::endl; + std::cout << "#bulk : " << bulk.size() << std::endl; + std::cout << "#lid : " << lid_bc.size() << std::endl; + std::cout << "#wall : " << box_bc.size() << std::endl; + std::cout << std::endl; + + std::size_t* device_ghost_cells; + std::size_t* device_bulk_cells; + std::size_t* device_lid_bc_cells; + std::size_t* device_box_bc_cells; + + cudaMalloc(&device_ghost_cells, ghost.size() * sizeof(std::size_t)); + cudaMalloc(&device_bulk_cells, bulk.size() * sizeof(std::size_t)); + cudaMalloc(&device_lid_bc_cells, lid_bc.size() * sizeof(std::size_t)); + cudaMalloc(&device_box_bc_cells, box_bc.size() * sizeof(std::size_t)); + + cudaMemcpy(device_ghost_cells, ghost.data(), ghost.size() * sizeof(std::size_t), cudaMemcpyHostToDevice); + cudaMemcpy(device_bulk_cells, bulk.data(), bulk.size() * sizeof(std::size_t), cudaMemcpyHostToDevice); + cudaMemcpy(device_lid_bc_cells, lid_bc.data(), lid_bc.size()* sizeof(std::size_t), cudaMemcpyHostToDevice); + cudaMemcpy(device_box_bc_cells, box_bc.data(), box_bc.size()* sizeof(std::size_t), cudaMemcpyHostToDevice); + + cudaDeviceSynchronize(); + + const std::size_t block_size = 32; + std::size_t block_count = 0; + + block_count = (ghost.size() + block_size - 1) / block_size; + equilibrilize_tick<<>>(f, device_ghost_cells, ghost.size()); + + block_count = (bulk.size() + block_size - 1) / block_size; + equilibrilize_tick<<>>(f, device_bulk_cells, bulk.size()); + + block_count = (box_bc.size() + block_size - 1) / block_size; + equilibrilize_tick<<>>(f, device_box_bc_cells, box_bc.size()); + + block_count = (lid_bc.size() + block_size - 1) / block_size; + equilibrilize_tick<<>>(f, device_lid_bc_cells, lid_bc.size()); + + cudaDeviceSynchronize(); + + bool tick = false; + + auto start = std::chrono::high_resolution_clock::now(); + + for (std::size_t iStep = 1; iStep <= nStep; ++iStep) { + tick = !tick; + + if (!tick) { + block_count = (ghost.size() + block_size - 1) / block_size; + equilibrilize_tick<<>>(f, device_ghost_cells, ghost.size()); + } + + block_count = (bulk.size() + block_size - 1) / block_size; + tick ? collide_and_stream_tick<<>>(f, device_bulk_cells, bulk.size()) + : collide_and_stream_tock<<>>(f, device_bulk_cells, bulk.size()); + + block_count = (box_bc.size() + block_size - 1) / block_size; +% if descriptor.d == 2: + tick ? velocity_momenta_boundary_tick<<>>(f, device_box_bc_cells, box_bc.size(), 0.0, 0.0) + : velocity_momenta_boundary_tock<<>>(f, device_box_bc_cells, box_bc.size(), 0.0, 0.0); +% else: + tick ? velocity_momenta_boundary_tick<<>>(f, device_box_bc_cells, box_bc.size(), 0.0, 0.0, 0.0) + : velocity_momenta_boundary_tock<<>>(f, device_box_bc_cells, box_bc.size(), 0.0, 0.0, 0.0); +% endif + + block_count = (lid_bc.size() + block_size - 1) / block_size; +% if descriptor.d == 2: + tick ? velocity_momenta_boundary_tick<<>>(f, device_lid_bc_cells, lid_bc.size(), 0.05, 0.0) + : velocity_momenta_boundary_tock<<>>(f, device_lid_bc_cells, lid_bc.size(), 0.05, 0.0); +% else: + tick ? velocity_momenta_boundary_tick<<>>(f, device_lid_bc_cells, lid_bc.size(), 0.05, 0.0, 0.0) + : velocity_momenta_boundary_tock<<>>(f, device_lid_bc_cells, lid_bc.size(), 0.05, 0.0, 0.0); +% endif + + if (iStep % 1000 == 0) { + cudaDeviceSynchronize(); + + auto duration = std::chrono::duration_cast>( + std::chrono::high_resolution_clock::now() - start); + std::cout << "iStep = " << iStep << "; ~" << 1000*${geometry.volume}l/(1e6*duration.count()) << " MLUPS" << std::endl; + + block_count = (bulk.size() + block_size - 1) / block_size; + collect_moments_tick<<>>(f, device_bulk_cells, bulk.size(), device_moments_rho, device_moments_u); + cudaMemcpy(moments_rho.data(), device_moments_rho, ${geometry.volume}*sizeof(${float_type}), cudaMemcpyDeviceToHost); + cudaMemcpy(moments_u.data(), device_moments_u, ${geometry.volume*descriptor.d}*sizeof(${float_type}), cudaMemcpyDeviceToHost); + write_moments_to_vtk("result/ldc_" + std::to_string(iStep) + ".vtk", moments_rho.data(), moments_u.data()); + + start = std::chrono::high_resolution_clock::now(); + } + } + + cudaFree(device_ghost_cells); + cudaFree(device_bulk_cells); + cudaFree(device_lid_bc_cells); + cudaFree(device_box_bc_cells); + cudaFree(device_moments_rho); + cudaFree(device_moments_u); + cudaFree(f); +} + +int main() { + simulate(20000); +} -- cgit v1.2.3