dtfft_nvrtc_kernel Module

This module defines nvrtc_kernel, the class that represents a kernel compiled with nvRTC.


Uses

  • Used directly by dtfft_nvrtc_kernel: iso_c_binding, iso_fortran_env, dtfft_config, dtfft_interface_cuda, dtfft_interface_cuda_runtime, dtfft_interface_nvrtc, dtfft_interface_nvtx, dtfft_nvrtc_block_optimizer, dtfft_nvrtc_kernel_cache, dtfft_nvrtc_kernel_generator, dtfft_parameters, dtfft_utils, mpi_f08
  • Transitive dependencies (module → modules it uses):
      – dtfft_config → iso_c_binding, iso_fortran_env, dtfft_interface_cuda_runtime, dtfft_parameters, dtfft_utils, mpi_f08, dtfft_errors
      – dtfft_interface_cuda → iso_c_binding, iso_fortran_env, dtfft_parameters, dtfft_utils, dtfft_errors
      – dtfft_interface_cuda_runtime → iso_c_binding, dtfft_parameters, dtfft_utils
      – dtfft_interface_nvrtc → iso_c_binding, iso_fortran_env, dtfft_utils, dtfft_errors
      – dtfft_interface_nvtx → iso_c_binding, dtfft_utils
      – dtfft_nvrtc_block_optimizer → iso_fortran_env, dtfft_config, dtfft_interface_cuda, dtfft_interface_cuda_runtime, dtfft_parameters, dtfft_utils
      – dtfft_nvrtc_kernel_cache → iso_c_binding, iso_fortran_env, dtfft_config, dtfft_interface_cuda, dtfft_interface_cuda_runtime, dtfft_parameters, dtfft_utils, mpi_f08
      – dtfft_nvrtc_kernel_generator → iso_c_binding, iso_fortran_env, dtfft_interface_cuda, dtfft_interface_cuda_runtime, dtfft_interface_nvrtc, dtfft_nvrtc_block_optimizer, dtfft_parameters, dtfft_utils, mpi_f08
      – dtfft_parameters → iso_c_binding, iso_fortran_env, mpi_f08
      – dtfft_utils → iso_c_binding, iso_fortran_env, dtfft_parameters, mpi_f08, dtfft_errors
      – dtfft_errors → iso_fortran_env

Used by

  • Modules that use dtfft_nvrtc_kernel directly: dtfft_abstract_backend, dtfft_abstract_transpose_plan, dtfft_transpose_handle_cuda
  • Indirect users (module → modules it uses):
      – dtfft_abstract_transpose_plan → dtfft_nvrtc_kernel, dtfft_abstract_backend
      – dtfft_transpose_handle_cuda → dtfft_nvrtc_kernel, dtfft_abstract_backend, dtfft_backend_cufftmp_m, dtfft_backend_mpi, dtfft_backend_nccl_m
      – dtfft_backend_cufftmp_m → dtfft_abstract_backend
      – dtfft_backend_mpi → dtfft_abstract_backend
      – dtfft_backend_nccl_m → dtfft_abstract_backend
      – dtfft_plan → dtfft_abstract_transpose_plan, dtfft_transpose_plan_cuda, dtfft_transpose_plan_host
      – dtfft_transpose_plan_cuda → dtfft_abstract_backend, dtfft_abstract_transpose_plan, dtfft_transpose_handle_cuda
      – dtfft_transpose_plan_host → dtfft_abstract_transpose_plan
      – dtfft → dtfft_plan
      – dtfft_api → dtfft_plan

Variables

Type Visibility Attributes Name Initial
integer(kind=int32), public, parameter :: DEF_TILE_SIZE = 32

Default tile size

character(len=*), private, parameter :: DEFAULT_KERNEL_NAME = "dtfft_kernel"

Basic kernel name

integer(kind=int32), private, parameter :: TARGET_THREADS_PER_BLOCK = 256

Target number of threads per block for unpacked kernels


Derived Types

type, public ::  nvrtc_kernel

nvRTC Compiled kernel class

Components

Type Visibility Attributes Name Initial
logical, private :: is_created = .false.

Kernel is created flag.

logical, private :: is_dummy = .false.

Whether the kernel is a dummy, i.e. one that should not do anything when executed.

type(CUfunction), private :: cuda_kernel

Pointer to CUDA kernel.

type(dim3), private :: blocks

Grid of blocks.

type(dim3), private :: threads

Thread block.

type(kernel_type_t), private :: kernel_type

Type of kernel to execute.

type(kernelArgs), private :: kernelParams

Kernel arguments.

integer(kind=int32), private, allocatable :: pointers(:,:)

Optional pointers that hold info about counts and displacements in KERNEL_UNPACK_PIPELINED kernel.

type(c_ptr), private :: device_pointers(3)

Device pointers for kernel arguments.

logical, private :: has_device_pointers

Flag indicating if device pointers are present

integer(kind=int64), private :: copy_bytes

Number of bytes to copy for KERNEL_UNPACK_SIMPLE_COPY kernel

Type-Bound Procedures

procedure, public, pass(self) :: create

Creates kernel

procedure, public, pass(self) :: execute

Executes kernel

procedure, public, pass(self) :: destroy

Destroys kernel


Functions

private function compile_and_cache(comm, kernel_name, kernel_type, transpose_type, code, props, base_storage, tile_size, padding) result(kernel)

Compiles kernel stored in code and caches pointer to CUfunction

Arguments

Type Intent Optional Attributes Name
type(MPI_Comm), intent(in) :: comm

MPI Communicator

character(len=*), intent(in) :: kernel_name

Kernel name

type(kernel_type_t), intent(in) :: kernel_type

Type of kernel to build

type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

type(kernel_codegen), intent(in) :: code

Kernel code to compile

type(device_props), intent(in) :: props

GPU architecture properties

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

integer(kind=int32), intent(in) :: tile_size

Tile size to use in shared memory

integer(kind=int32), intent(in) :: padding

Padding to use in shared memory

Return Value type(CUfunction)

Compiled kernel to return


Subroutines

public subroutine get_kernel_args(comm, dims, transpose_type, kernel_type, block_rows, ptrs, params)

Populates kernel arguments based on kernel type

Arguments

Type Intent Optional Attributes Name
type(MPI_Comm), intent(in) :: comm

MPI Communicator

integer(kind=int32), intent(in) :: dims(:)

Local dimensions to process

type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

type(kernel_type_t), intent(in) :: kernel_type

Type of kernel to build

integer(kind=int32), intent(in) :: block_rows

Number of rows in each block

type(c_ptr), intent(in) :: ptrs(3)

Array of device pointers required by certain kernels

type(kernelArgs), intent(out) :: params

Kernel arguments

public subroutine get_kernel(comm, dims, transpose_type, kernel_type, effort, base_storage, props, ptrs, blocks, threads, kernel, force_effort)

Compiles kernel and caches it. Returns compiled kernel.

Arguments

Type Intent Optional Attributes Name
type(MPI_Comm), intent(in) :: comm

MPI Communicator

integer(kind=int32), intent(in) :: dims(:)

Local dimensions to process

type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

type(kernel_type_t), intent(in) :: kernel_type

Type of kernel to build

type(dtfft_effort_t), intent(in) :: effort

How thoroughly dtFFT searches for the optimal transpose kernel

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

type(device_props), intent(in) :: props

GPU architecture properties

type(c_ptr), intent(in) :: ptrs(3)

Array of device pointers required by certain kernels

type(dim3), intent(out) :: blocks

Selected grid of blocks

type(dim3), intent(out) :: threads

Selected thread configuration

type(CUfunction), intent(out) :: kernel

Compiled kernel to return

logical, intent(in), optional :: force_effort

Should effort be forced or not

private subroutine create(self, comm, dims, effort, base_storage, transpose_type, kernel_type, pointers, force_effort)

Creates kernel

Arguments

Type Intent Optional Attributes Name
class(nvrtc_kernel), intent(inout) :: self

nvRTC Compiled kernel class

type(MPI_Comm), intent(in) :: comm

MPI Communicator

integer(kind=int32), intent(in) :: dims(:)

Local dimensions to process

type(dtfft_effort_t), intent(in) :: effort

Effort level for generating transpose kernels

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

type(kernel_type_t), intent(in) :: kernel_type

Type of kernel to build

integer(kind=int32), intent(in), optional :: pointers(:,:)

Optional pointers to unpack kernels

logical, intent(in), optional :: force_effort

Should effort be forced or not

private subroutine execute(self, in, out, stream, source)

Executes kernel on stream

Arguments

Type Intent Optional Attributes Name
class(nvrtc_kernel), intent(inout) :: self

nvRTC Compiled kernel class

real(kind=real32), intent(in), target :: in(:)

Source pointer

real(kind=real32), intent(in), target :: out(:)

Target pointer

type(dtfft_stream_t), intent(in) :: stream

CUDA Stream

integer(kind=int32), intent(in), optional :: source

Source rank for pipelined unpacking

private subroutine destroy(self)

Destroys kernel

Arguments

Type Intent Optional Attributes Name
class(nvrtc_kernel), intent(inout) :: self

nvRTC Compiled kernel class

private subroutine get_contiguous_execution_blocks(size, blocks, threads)

Gets the number of blocks and threads for a contiguous execution

Arguments

Type Intent Optional Attributes Name
integer(kind=int32), intent(in) :: size

Total amount of iterations required

type(dim3), intent(out) :: blocks

Grid of blocks.

type(dim3), intent(out) :: threads

Thread block.

private subroutine create_device_pointer(ptr, values)

Allocates memory on a device and copies values to it.

Arguments

Type Intent Optional Attributes Name
type(c_ptr), intent(inout) :: ptr

Device pointer

integer(kind=c_int), intent(in), target :: values(:)

Values to copy

private subroutine get_transpose_kernel(comm, kernel_name, dims, transpose_type, kernel_type, base_storage, props, config, blocks, threads, kernel)

Arguments

Type Intent Optional Attributes Name
type(MPI_Comm), intent(in) :: comm

MPI Communicator

character(len=*), intent(in) :: kernel_name

Kernel name

integer(kind=int32), intent(in) :: dims(:)

Local dimensions to process

type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

type(kernel_type_t), intent(in) :: kernel_type

Type of kernel to build

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

type(device_props), intent(in) :: props

GPU architecture properties

type(kernel_config), intent(in) :: config

Kernel configuration

type(dim3), intent(out) :: blocks

Grid of blocks.

type(dim3), intent(out) :: threads

Thread block.

type(CUfunction), intent(out) :: kernel

Compiled kernel to return