dtfft_kernel_device Module

This module defines the kernel_device type and its type-bound procedures. The kernel_device type extends the abstract_kernel type and implements its type-bound procedures.


Uses

  • Dependency graph of dtfft_kernel_device (each line reads "module -> modules it uses"):
    dtfft_kernel_device -> iso_c_binding, iso_fortran_env, dtfft_abstract_kernel, dtfft_config, dtfft_interface_cuda, dtfft_interface_cuda_runtime, dtfft_interface_nvtx, dtfft_nvrtc_block_optimizer, dtfft_nvrtc_module_cache, dtfft_parameters, dtfft_utils, mpi_f08
    dtfft_abstract_kernel -> iso_fortran_env, dtfft_interface_nvtx, dtfft_parameters, dtfft_utils, mpi_f08
    dtfft_config -> iso_c_binding, iso_fortran_env, dtfft_interface_cuda_runtime, dtfft_parameters, dtfft_utils, mpi_f08, dtfft_errors
    dtfft_interface_cuda -> iso_c_binding, iso_fortran_env, dtfft_parameters, dtfft_utils, dtfft_errors
    dtfft_interface_cuda_runtime -> iso_c_binding, dtfft_parameters, dtfft_utils
    dtfft_interface_nvtx -> iso_c_binding, dtfft_utils
    dtfft_nvrtc_block_optimizer -> iso_fortran_env, dtfft_abstract_kernel, dtfft_config, dtfft_interface_cuda, dtfft_interface_cuda_runtime, dtfft_parameters, dtfft_utils
    dtfft_nvrtc_module_cache -> iso_c_binding, iso_fortran_env, dtfft_abstract_kernel, dtfft_config, dtfft_interface_cuda, dtfft_interface_cuda_runtime, dtfft_nvrtc_block_optimizer, dtfft_utils, dtfft_nvrtc_module
    dtfft_parameters -> iso_c_binding, iso_fortran_env, mpi_f08
    dtfft_utils -> iso_c_binding, iso_fortran_env, dtfft_parameters, mpi_f08, dtfft_errors
    dtfft_errors -> iso_fortran_env
    dtfft_nvrtc_module -> iso_c_binding, iso_fortran_env, dtfft_abstract_kernel, dtfft_config, dtfft_interface_cuda, dtfft_interface_cuda_runtime, dtfft_interface_nvtx, dtfft_nvrtc_block_optimizer, dtfft_parameters, dtfft_utils, mpi_f08, dtfft_interface_nvrtc
    dtfft_interface_nvrtc -> iso_c_binding, iso_fortran_env, dtfft_utils, dtfft_errors

Used by

  • Reverse-dependency graph of dtfft_kernel_device (each line reads "module -> module it uses"):
    dtfft_transpose_handle_generic -> dtfft_kernel_device
    dtfft_transpose_plan -> dtfft_kernel_device, dtfft_transpose_handle_generic
    dtfft_plan -> dtfft_transpose_plan
    dtfft -> dtfft_plan
    dtfft_api -> dtfft_plan

Variables

Type Visibility Attributes Name Initial
integer(kind=int32), public, parameter :: DEF_TILE_SIZE = 32

Default tile size


Derived Types

type, public, extends(abstract_kernel) ::  kernel_device

Device kernel class

Components

Type Visibility Attributes Name Initial
logical, public :: is_created = .false.

Flag indicating whether the kernel has been created.

logical, public :: is_dummy = .false.

Whether the kernel should do anything or not.

type(kernel_type_t), public :: kernel_type

Type of the kernel

character(len=:), public, allocatable :: kernel_string
integer(kind=int32), public, allocatable :: neighbor_data(:,:)

Neighbor data for pipelined unpacking

integer(kind=int32), public, allocatable :: dims(:)

Local dimensions to process

type(kernel_type_t), private :: internal_kernel_type

Actual kernel type used for execution, can be different from kernel_type

type(CUfunction), private :: cuda_kernel

Pointer to CUDA kernel.

integer(kind=int32), private :: tile_size

Tile size used for this kernel

integer(kind=int32), private :: block_rows

Number of rows in each block processed by each thread

integer(kind=int64), private :: copy_bytes

Number of bytes to copy for KERNEL_UNPACK_SIMPLE_COPY kernel

Type-Bound Procedures

procedure, public, pass(self) :: create

Creates kernel

procedure, public, pass(self) :: execute

Executes kernel

procedure, public, pass(self) :: destroy

Destroys kernel

procedure, public :: create_private => create

Creates kernel

procedure, public :: execute_private => execute

Executes kernel

procedure, public :: destroy_private => destroy

Destroys kernel


Subroutines

private subroutine create(self, effort, base_storage, force_effort)

Creates kernel

Arguments

Type Intent Optional Attributes Name
class(kernel_device), intent(inout) :: self

Device kernel class

type(dtfft_effort_t), intent(in) :: effort

Effort level for generating transpose kernels

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

logical, intent(in), optional :: force_effort

Whether the effort level should be forced or not

private subroutine execute(self, in, out, stream, neighbor)

Executes kernel on stream

Arguments

Type Intent Optional Attributes Name
class(kernel_device), intent(inout) :: self

Device kernel class

real(kind=real32), intent(in), target :: in(:)

Device pointer

real(kind=real32), intent(inout), target :: out(:)

Device pointer

type(dtfft_stream_t), intent(in) :: stream

Stream to execute on

integer(kind=int32), intent(in), optional :: neighbor

Source rank for pipelined unpacking

private subroutine destroy(self)

Destroys kernel

Arguments

Type Intent Optional Attributes Name
class(kernel_device), intent(inout) :: self

Device kernel class

private subroutine get_kernel_args(kernel_type, dims, nargs, args, neighbor_data)

Populates kernel arguments based on kernel type

Arguments

Type Intent Optional Attributes Name
type(kernel_type_t), intent(in) :: kernel_type

Type of kernel

integer(kind=int32), intent(in) :: dims(:)

Local dimensions to process

integer(kind=int32), intent(out) :: nargs

Number of arguments set by this subroutine

integer(kind=int32), intent(out) :: args(MAX_KERNEL_ARGS)

Kernel arguments

integer(kind=int32), intent(in), optional :: neighbor_data(:)

Neighbor data for pipelined kernels

private subroutine get_kernel_launch_params(kernel_type, dims, tile_size, block_rows, blocks, threads)

Computes kernel launch parameters based on kernel type and dimensions

Arguments

Type Intent Optional Attributes Name
type(kernel_type_t), intent(in) :: kernel_type

Type of kernel

integer(kind=int32), intent(in) :: dims(:)

Local dimensions to process

integer(kind=int32), intent(in) :: tile_size

Size of the tile in shared memory

integer(kind=int32), intent(in) :: block_rows

Number of rows in each block

type(dim3), intent(out) :: blocks

Number of blocks to launch

type(dim3), intent(out) :: threads

Number of threads per block

private subroutine get_kernel(dims, kernel_type, effort, base_storage, props, tile_size, block_rows, kernel, force_effort, neighbor_data)

Compiles kernel and caches it. Returns compiled kernel.

Arguments

Type Intent Optional Attributes Name
integer(kind=int32), intent(in) :: dims(:)

Local dimensions to process

type(kernel_type_t), intent(in) :: kernel_type

Type of kernel to build

type(dtfft_effort_t), intent(in) :: effort

How thoroughly dtFFT searches for the optimal transpose kernel

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store a single element

type(device_props), intent(in) :: props

GPU architecture properties

integer(kind=int32), intent(out) :: tile_size

Size of the tile in shared memory

integer(kind=int32), intent(out) :: block_rows

Number of rows in each block processed by each thread

type(CUfunction), intent(out) :: kernel

Compiled kernel to return

logical, intent(in), optional :: force_effort

Whether the effort level should be forced or not

integer(kind=int32), intent(in), optional :: neighbor_data(:)

Neighbor data for pipelined kernels