dtfft_nvrtc_block_optimizer Module


Uses

  • module~~dtfft_nvrtc_block_optimizer~~UsesGraph module~dtfft_nvrtc_block_optimizer dtfft_nvrtc_block_optimizer iso_fortran_env iso_fortran_env module~dtfft_nvrtc_block_optimizer->iso_fortran_env module~dtfft_config dtfft_config module~dtfft_nvrtc_block_optimizer->module~dtfft_config module~dtfft_interface_cuda dtfft_interface_cuda module~dtfft_nvrtc_block_optimizer->module~dtfft_interface_cuda module~dtfft_interface_cuda_runtime dtfft_interface_cuda_runtime module~dtfft_nvrtc_block_optimizer->module~dtfft_interface_cuda_runtime module~dtfft_parameters dtfft_parameters module~dtfft_nvrtc_block_optimizer->module~dtfft_parameters module~dtfft_utils dtfft_utils module~dtfft_nvrtc_block_optimizer->module~dtfft_utils module~dtfft_config->iso_fortran_env module~dtfft_config->module~dtfft_interface_cuda_runtime module~dtfft_config->module~dtfft_parameters module~dtfft_config->module~dtfft_utils iso_c_binding iso_c_binding module~dtfft_config->iso_c_binding module~dtfft_errors dtfft_errors module~dtfft_config->module~dtfft_errors mpi_f08 mpi_f08 module~dtfft_config->mpi_f08 module~dtfft_interface_cuda->iso_fortran_env module~dtfft_interface_cuda->module~dtfft_parameters module~dtfft_interface_cuda->module~dtfft_utils module~dtfft_interface_cuda->iso_c_binding module~dtfft_interface_cuda->module~dtfft_errors module~dtfft_interface_cuda_runtime->module~dtfft_parameters module~dtfft_interface_cuda_runtime->module~dtfft_utils module~dtfft_interface_cuda_runtime->iso_c_binding module~dtfft_parameters->iso_fortran_env module~dtfft_parameters->iso_c_binding module~dtfft_parameters->mpi_f08 module~dtfft_utils->iso_fortran_env module~dtfft_utils->module~dtfft_parameters module~dtfft_utils->iso_c_binding module~dtfft_utils->module~dtfft_errors module~dtfft_utils->mpi_f08 module~dtfft_errors->iso_fortran_env

Used by

  • module~~dtfft_nvrtc_block_optimizer~~UsedByGraph module~dtfft_nvrtc_block_optimizer dtfft_nvrtc_block_optimizer module~dtfft_nvrtc_kernel dtfft_nvrtc_kernel module~dtfft_nvrtc_kernel->module~dtfft_nvrtc_block_optimizer module~dtfft_nvrtc_kernel_generator dtfft_nvrtc_kernel_generator module~dtfft_nvrtc_kernel->module~dtfft_nvrtc_kernel_generator module~dtfft_nvrtc_kernel_generator->module~dtfft_nvrtc_block_optimizer module~dtfft_abstract_backend dtfft_abstract_backend module~dtfft_abstract_backend->module~dtfft_nvrtc_kernel module~dtfft_abstract_transpose_plan dtfft_abstract_transpose_plan module~dtfft_abstract_transpose_plan->module~dtfft_nvrtc_kernel module~dtfft_abstract_transpose_plan->module~dtfft_abstract_backend module~dtfft_transpose_handle_cuda dtfft_transpose_handle_cuda module~dtfft_transpose_handle_cuda->module~dtfft_nvrtc_kernel module~dtfft_transpose_handle_cuda->module~dtfft_abstract_backend module~dtfft_backend_cufftmp_m dtfft_backend_cufftmp_m module~dtfft_transpose_handle_cuda->module~dtfft_backend_cufftmp_m module~dtfft_backend_mpi dtfft_backend_mpi module~dtfft_transpose_handle_cuda->module~dtfft_backend_mpi module~dtfft_backend_nccl_m dtfft_backend_nccl_m module~dtfft_transpose_handle_cuda->module~dtfft_backend_nccl_m module~dtfft_backend_cufftmp_m->module~dtfft_abstract_backend module~dtfft_backend_mpi->module~dtfft_abstract_backend module~dtfft_backend_nccl_m->module~dtfft_abstract_backend module~dtfft_plan dtfft_plan module~dtfft_plan->module~dtfft_abstract_transpose_plan module~dtfft_transpose_plan_cuda dtfft_transpose_plan_cuda module~dtfft_plan->module~dtfft_transpose_plan_cuda module~dtfft_transpose_plan_host dtfft_transpose_plan_host module~dtfft_plan->module~dtfft_transpose_plan_host module~dtfft_transpose_plan_cuda->module~dtfft_abstract_backend module~dtfft_transpose_plan_cuda->module~dtfft_abstract_transpose_plan module~dtfft_transpose_plan_cuda->module~dtfft_transpose_handle_cuda 
module~dtfft_transpose_plan_host->module~dtfft_abstract_transpose_plan module~dtfft dtfft module~dtfft->module~dtfft_plan module~dtfft_api dtfft_api module~dtfft_api->module~dtfft_plan

Variables

Type Visibility Attributes Name Initial
integer(kind=int32), public, parameter :: N_TILES_CANDIDATES = 5

Maximum number of tile candidates to generate

integer(kind=int32), public, parameter :: N_BLOCKS_CANDIDATES = 5

Maximum number of block candidates to generate

integer(kind=int32), public, parameter :: N_CANDIDATES = N_TILES_CANDIDATES*N_BLOCKS_CANDIDATES

Maximum number of candidates to generate

integer(kind=int32), private, parameter :: NUM_BANKS = 32

Number of banks in shared memory

integer(kind=int32), private, parameter :: WARP_SIZE = 32

Warp size in threads

integer(kind=int32), private, parameter :: BANK_WIDTH_BYTES = 4

Bank width in bytes


Derived Types

type, public ::  kernel_config

Configuration for the kernel launch

Components

Type Visibility Attributes Name Initial
type(dim3), public :: blocks

Number of blocks in the grid

type(dim3), public :: threads

Number of threads per block

integer(kind=int32), public :: padding

Padding added to the tile


Functions

public function get_ampere_architecture() result(props)

Ampere architecture (Compute Capability 8.0)

Arguments

None

Return Value type(device_props)

public function get_volta_architecture() result(props)

Volta architecture (Compute Capability 7.0)

Arguments

None

Return Value type(device_props)

public pure function count_bank_conflicts(tile_size, block_rows, base_storage, padding) result(total_conflicts)

Counts bank conflicts for a given tile size, padding, element size, and block rows.

Arguments

Type IntentOptional Attributes Name
integer(kind=int32), intent(in) :: tile_size

Size of the tile

integer(kind=int32), intent(in) :: block_rows

Number of rows in the block

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

integer(kind=int32), intent(in) :: padding

Padding added to the tile

Return Value integer(kind=int32)

Total number of bank conflicts

public function evaluate_analytical_performance(dims, transpose_type, config, props, base_storage) result(score)

This function evaluates the performance of a kernel configuration based on various architectural and problem-specific parameters.

Arguments

Type IntentOptional Attributes Name
integer(kind=int32), intent(in) :: dims(:)

Problem dimensions

type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

type(kernel_config), intent(in) :: config

Kernel configuration

type(device_props), intent(in) :: props

GPU architecture properties

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

Return Value real(kind=real32)

Performance score

private pure function estimate_optimal_padding(tile_size, block_rows, base_storage) result(padding)

Estimates the optimal padding for a given tile size, number of block rows, and element size.

Arguments

Type IntentOptional Attributes Name
integer(kind=int32), intent(in) :: tile_size

Size of the tile

integer(kind=int32), intent(in) :: block_rows

Number of rows in the block

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

Return Value integer(kind=int32)

Optimal padding to reduce bank conflicts

private pure function estimate_bank_conflict_ratio(config, base_storage) result(ratio)

Estimates the bank conflict ratio for a given kernel configuration

Arguments

Type IntentOptional Attributes Name
type(kernel_config), intent(in) :: config

Kernel configuration

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

Return Value real(kind=real32)

Estimated bank conflict ratio

private pure function estimate_occupancy(config, props, base_storage) result(occupancy)

Calculates theoretical occupancy for a given kernel configuration

Arguments

Type IntentOptional Attributes Name
type(kernel_config), intent(in) :: config

Kernel configuration

type(device_props), intent(in) :: props

GPU architecture properties

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

Return Value real(kind=real32)

Estimated occupancy

private function estimate_memory_pressure(dims, tile_dim, other_dim, base_storage, props) result(pressure)

Analytical estimation of memory pressure based on GPU architecture

Arguments

Type IntentOptional Attributes Name
integer(kind=int32), intent(in) :: dims(:)

Size of the problem

integer(kind=int32), intent(in) :: tile_dim

Tile dimension

integer(kind=int32), intent(in) :: other_dim

Other dimension (not tiled)

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

type(device_props), intent(in) :: props

GPU architecture properties

Return Value real(kind=real32)

Pressure metric

private function estimate_coalescing(dims, transpose_type, config, base_storage) result(score)

Estimate memory coalescing efficiency for a given kernel configuration and transpose type

Arguments

Type IntentOptional Attributes Name
integer(kind=int32), intent(in) :: dims(:)

Local dimensions of the input data

type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transpose operation

type(kernel_config), intent(in) :: config

Kernel configuration

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

Return Value real(kind=real32)

Coalescing score


Subroutines

public subroutine generate_candidates(dims, tile_dim, other_dim, base_storage, props, candidates, num_candidates)

Generate kernel configuration candidates for given problem

Arguments

Type IntentOptional Attributes Name
integer(kind=int32), intent(in) :: dims(:)

Local dimensions of the input data, always 3D

integer(kind=int32), intent(in) :: tile_dim

Tile dimension

integer(kind=int32), intent(in) :: other_dim

Other dimension (not tiled)

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

type(device_props), intent(in) :: props

GPU architecture properties

type(kernel_config), intent(out) :: candidates(:)

Generated kernel configurations

integer(kind=int32), intent(out) :: num_candidates

Number of generated candidates

public subroutine sort_candidates_by_score(scores, num_candidates, sorted_indices)

Sort candidates by their performance scores

Arguments

Type IntentOptional Attributes Name
real(kind=real32), intent(in) :: scores(:)

Performance scores of the candidates, as computed by evaluate_analytical_performance

integer(kind=int32), intent(in) :: num_candidates

Number of candidates

integer(kind=int32), intent(out) :: sorted_indices(:)

Sorted indices of candidates

private subroutine find_valid_combination(base_tile, base_rows)

This subroutine optimizes the tile size and number of rows for narrow matrices by adjusting them to be compatible with the warp size.

Arguments

Type IntentOptional Attributes Name
integer(kind=int32), intent(inout) :: base_tile

Tile size

integer(kind=int32), intent(inout) :: base_rows

Number of rows