dtfft_nvrtc_kernel Module

This module describes the NVRTC kernel class nvrtc_kernel. It caches compiled kernels in nvrtc_cache to avoid recompiling similar kernels.


Uses

  • module~~dtfft_nvrtc_kernel~~UsesGraph module~dtfft_nvrtc_kernel dtfft_nvrtc_kernel iso_c_binding iso_c_binding module~dtfft_nvrtc_kernel->iso_c_binding iso_fortran_env iso_fortran_env module~dtfft_nvrtc_kernel->iso_fortran_env module~dtfft_interface_cuda dtfft_interface_cuda module~dtfft_nvrtc_kernel->module~dtfft_interface_cuda module~dtfft_interface_cuda_runtime dtfft_interface_cuda_runtime module~dtfft_nvrtc_kernel->module~dtfft_interface_cuda_runtime module~dtfft_interface_nvrtc dtfft_interface_nvrtc module~dtfft_nvrtc_kernel->module~dtfft_interface_nvrtc module~dtfft_interface_nvtx dtfft_interface_nvtx module~dtfft_nvrtc_kernel->module~dtfft_interface_nvtx module~dtfft_parameters dtfft_parameters module~dtfft_nvrtc_kernel->module~dtfft_parameters module~dtfft_utils dtfft_utils module~dtfft_nvrtc_kernel->module~dtfft_utils mpi_f08 mpi_f08 module~dtfft_nvrtc_kernel->mpi_f08 module~dtfft_interface_cuda->iso_c_binding module~dtfft_interface_cuda->iso_fortran_env module~dtfft_interface_cuda->module~dtfft_interface_cuda_runtime module~dtfft_interface_cuda->module~dtfft_parameters module~dtfft_interface_cuda->module~dtfft_utils module~dtfft_interface_cuda_runtime->iso_c_binding module~dtfft_interface_cuda_runtime->module~dtfft_parameters module~dtfft_interface_cuda_runtime->module~dtfft_utils module~dtfft_interface_nvrtc->iso_c_binding module~dtfft_interface_nvrtc->iso_fortran_env module~dtfft_interface_nvrtc->module~dtfft_interface_cuda_runtime module~dtfft_interface_nvrtc->module~dtfft_parameters module~dtfft_interface_nvrtc->module~dtfft_utils module~dtfft_interface_nvtx->iso_c_binding module~dtfft_interface_nvtx->module~dtfft_utils module~dtfft_parameters->iso_c_binding module~dtfft_parameters->iso_fortran_env module~dtfft_parameters->mpi_f08 module~dtfft_utils->iso_c_binding module~dtfft_utils->iso_fortran_env module~dtfft_utils->module~dtfft_parameters module~dtfft_utils->mpi_f08

Used by

  • module~~dtfft_nvrtc_kernel~~UsedByGraph module~dtfft_nvrtc_kernel dtfft_nvrtc_kernel module~dtfft_abstract_backend dtfft_abstract_backend module~dtfft_abstract_backend->module~dtfft_nvrtc_kernel module~dtfft_abstract_transpose_plan dtfft_abstract_transpose_plan module~dtfft_abstract_transpose_plan->module~dtfft_nvrtc_kernel module~dtfft_abstract_transpose_plan->module~dtfft_abstract_backend module~dtfft_plan dtfft_plan module~dtfft_plan->module~dtfft_nvrtc_kernel module~dtfft_plan->module~dtfft_abstract_transpose_plan module~dtfft_transpose_plan_cuda dtfft_transpose_plan_cuda module~dtfft_plan->module~dtfft_transpose_plan_cuda module~dtfft_transpose_plan_host dtfft_transpose_plan_host module~dtfft_plan->module~dtfft_transpose_plan_host module~dtfft_transpose_handle_cuda dtfft_transpose_handle_cuda module~dtfft_transpose_handle_cuda->module~dtfft_nvrtc_kernel module~dtfft_transpose_handle_cuda->module~dtfft_abstract_backend module~dtfft_backend_cufftmp_m dtfft_backend_cufftmp_m module~dtfft_transpose_handle_cuda->module~dtfft_backend_cufftmp_m module~dtfft_backend_mpi dtfft_backend_mpi module~dtfft_transpose_handle_cuda->module~dtfft_backend_mpi module~dtfft_backend_nccl_m dtfft_backend_nccl_m module~dtfft_transpose_handle_cuda->module~dtfft_backend_nccl_m module~dtfft_transpose_plan_cuda->module~dtfft_nvrtc_kernel module~dtfft_transpose_plan_cuda->module~dtfft_abstract_backend module~dtfft_transpose_plan_cuda->module~dtfft_abstract_transpose_plan module~dtfft_transpose_plan_cuda->module~dtfft_transpose_handle_cuda module~dtfft dtfft module~dtfft->module~dtfft_plan module~dtfft_api dtfft_api module~dtfft_api->module~dtfft_plan module~dtfft_backend_cufftmp_m->module~dtfft_abstract_backend module~dtfft_backend_mpi->module~dtfft_abstract_backend module~dtfft_backend_nccl_m->module~dtfft_abstract_backend module~dtfft_transpose_plan_host->module~dtfft_abstract_transpose_plan

Variables

Type Visibility Attributes Name Initial
integer(kind=int32), public, parameter :: DEF_TILE_SIZE = 16

Default tile size

integer(kind=int8), public, parameter :: KERNEL_TRANSPOSE = 1

Basic transpose kernel type.

integer(kind=int8), public, parameter :: KERNEL_TRANSPOSE_PACKED = 2

Transposes data and packs it into contiguous buffer. Should be used only in X-Y 3D plans.

integer(kind=int8), public, parameter :: KERNEL_UNPACK = 3

Unpacks contiguous buffer.

integer(kind=int8), public, parameter :: KERNEL_UNPACK_SIMPLE_COPY = 4

Doesn’t actually unpack anything. Performs a cudaMemcpyAsync call. Should be used only when backend is DTFFT_GPU_BACKEND_CUFFTMP.

integer(kind=int8), public, parameter :: KERNEL_UNPACK_PIPELINED = 5

Unpacks a pack of the contiguous buffer received from a rank.

integer(kind=int8), public, parameter :: KERNEL_UNPACK_PARTIAL = 6

Unpacks contiguous buffer received from everyone except myself.

integer(kind=int32), private, parameter :: MIN_TILE_SIZE = 8

Minimum tile size. Will launch 2 warps

integer(kind=int32), private, parameter :: TARGET_THREADS_PER_BLOCK = DEF_TILE_SIZE*DEF_TILE_SIZE

Maximum number of threads to run in a block (256)

character(len=*), private, parameter :: DEFAULT_KERNEL_NAME = "dtfft_kernel"

Basic kernel name

integer(kind=int32), private, parameter :: CACHE_PREALLOC_SIZE = 10

Number of preallocated cache entries

type(nvrtc_cache), private, allocatable, save :: cache(:)

Cache of compiled kernels

integer(kind=int32), private, save :: cache_size = 0

Number of entries in cache


Derived Types

type, public ::  nvrtc_kernel

nvRTC Compiled kernel class

Components

Type Visibility Attributes Name Initial
logical, private :: is_created = .false.

Kernel is created flag.

logical, private :: is_dummy = .false.

If kernel should do anything or not.

type(CUfunction), private :: cuda_kernel

Pointer to CUDA kernel.

type(dim3), private :: num_blocks

Grid of blocks.

type(dim3), private :: block_size

Thread block.

integer(kind=int8), private :: kernel_type

Type of kernel to execute.

type(kernelArgs), private :: args

Kernel arguments.

integer(kind=int32), private, allocatable :: pointers(:,:)

Optional pointers that hold info about counts and displacements in KERNEL_UNPACK_PIPELINED kernel.

Type-Bound Procedures

procedure, public, pass(self) :: create ../../

Creates kernel

procedure, public, pass(self) :: execute ../../

Executes kernel

procedure, public, pass(self) :: destroy ../../

Destroys kernel

type, private ::  kernel_code

Class to build CUDA kernel code

Components

Type Visibility Attributes Name Initial
character(len=:), private, allocatable :: raw

String that holds CUDA code

Type-Bound Procedures

procedure, public, pass(self) :: to_cstr ../../

Converts Fortran CUDA code to C pointer

procedure, public, pass(self) :: add_line ../../

Adds new line to CUDA code

procedure, public, pass(self) :: destroy => destroy_code ../../

Frees all memory

type, private ::  nvrtc_cache

Class to cache compiled kernels

Components

Type Visibility Attributes Name Initial
integer(kind=int32), private :: ref_count = 0

Number of references to this kernel

type(CUmodule), private :: cuda_module = CUmodule(c_null_ptr)

Pointer to CUDA Module.

type(CUfunction), private :: cuda_kernel = CUfunction(c_null_ptr)

Pointer to CUDA kernel.

integer(kind=int8), private :: kernel_type

Type of kernel to execute.

type(dtfft_transpose_t), private :: transpose_type

Type of transpose

integer(kind=int32), private :: tile_size

Tile size of transpose kernel

integer(kind=int64), private :: base_storage

Number of bytes needed to store single element

logical, private :: has_inner_loop

If kernel has inner loop


Functions

private function get_tile_size(x, y)

Returns tile size to use in a transpose kernel

Arguments

Type IntentOptional Attributes Name
integer(kind=int32), intent(in) :: x

Number of elements in x direction

integer(kind=int32), intent(in) :: y

Number of elements in y direction

Return Value integer(kind=int32)

private function get_cached_kernel(transpose_type, kernel_type, base_storage, tile_size, has_inner_loop) result(kernel)

Returns cached kernel if it exists. If not returns null pointer.

Arguments

Type IntentOptional Attributes Name
type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

integer(kind=int8), intent(in) :: kernel_type

Type of kernel to build

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

integer(kind=int32), intent(in) :: tile_size

Tile size

logical, intent(in) :: has_inner_loop

If kernel has inner loop

Return Value type(CUfunction)

Cached kernel

private function get_true_transpose_type(transpose_type) result(transpose_type_)

Returns generic transpose id. Since X-Y and Y-Z transpositions are symmetric, it returns only one of them. X-Z and Z-X are not symmetric.

Arguments

Type IntentOptional Attributes Name
type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

Return Value type(dtfft_transpose_t)

Fixed id of transposition

private function compile_and_cache(comm, dims, transpose_type, kernel_type, base_storage, tile_size, has_inner_loop) result(kernel)

Compiles kernel and caches it. Returns compiled kernel.

Arguments

Type IntentOptional Attributes Name
type(MPI_Comm), intent(in) :: comm

MPI Communicator

integer(kind=int32), intent(in), target :: dims(:)

Global dimensions to process

type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

integer(kind=int8), intent(in) :: kernel_type

Type of kernel to build

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

integer(kind=int32), intent(in) :: tile_size

Tile size

logical, intent(in) :: has_inner_loop

If kernel has inner loop

Return Value type(CUfunction)

Compiled kernel to return

private function get_transpose_kernel_code(kernel_name, ndims, base_storage, transpose_type, enable_packing, enable_multiprocess) result(code)

Generates code that will be used to locally transpose data and prepare it to be sent to other processes ndims == 2

Arguments

Type IntentOptional Attributes Name
character(len=*), intent(in) :: kernel_name

Name of CUDA kernel

integer(kind=int8), intent(in) :: ndims

Number of dimensions

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

type(dtfft_transpose_t), intent(in) :: transpose_type

Transpose id

logical, intent(in) :: enable_packing

If data should be manually packed or not

logical, intent(in) :: enable_multiprocess

If thread should process more than one element

Return Value type(kernel_code)

Resulting code

private function get_unpack_kernel_code(kernel_name, base_storage, is_partial) result(code)

Generates code that will be used to unpack data when it is received

Arguments

Type IntentOptional Attributes Name
character(len=*), intent(in) :: kernel_name

Name of CUDA kernel

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

logical, intent(in) :: is_partial

Return Value type(kernel_code)

Resulting code

private function get_unpack_pipelined_kernel_code(kernel_name, base_storage) result(code)

Generates code that will be used to partially unpack data when it is received from another process

Arguments

Type IntentOptional Attributes Name
character(len=*), intent(in) :: kernel_name

Name of CUDA kernel

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

Return Value type(kernel_code)

Resulting code


Subroutines

public subroutine clean_unused_cache()

Removes unused modules from cuda context

Arguments

None

private subroutine to_cstr(self, c_code)

Converts Fortran CUDA code to C pointer

Arguments

Type IntentOptional Attributes Name
class(kernel_code), intent(in) :: self

Kernel code

character(len=c_char), intent(out), allocatable :: c_code(:)

C pointer to code

private subroutine add_line(self, line)

Adds new line to CUDA code

Arguments

Type IntentOptional Attributes Name
class(kernel_code), intent(inout) :: self

Kernel code

character(len=*), intent(in) :: line

Line to add

private subroutine destroy_code(self)

Frees all memory

Arguments

Type IntentOptional Attributes Name
class(kernel_code), intent(inout) :: self

Kernel code

private subroutine create_device_pointer(ptr, values)

Allocates memory on a device and copies values to it.

Arguments

Type IntentOptional Attributes Name
type(c_ptr), intent(inout) :: ptr

Device pointer

integer(kind=c_int), intent(in), target :: values(:)

Values to copy

private subroutine get_contiguous_execution_blocks(size, num_blocks, block_sizes)

Arguments

Type IntentOptional Attributes Name
integer(kind=int32), intent(in) :: size

Total amount of iterations required

type(dim3), intent(out) :: num_blocks

Grid of blocks.

type(dim3), intent(out) :: block_sizes

Thread block.

private subroutine create(self, comm, dims, base_storage, transpose_type, kernel_type, pointers)

Creates kernel

Arguments

Type IntentOptional Attributes Name
class(nvrtc_kernel), intent(inout) :: self

nvRTC Compiled kernel class

type(MPI_Comm), intent(in) :: comm

MPI Communicator

integer(kind=int32), intent(in), target :: dims(0:)

Global dimensions to process

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

type(dtfft_transpose_t), intent(in) :: transpose_type

Type of transposition to perform

integer(kind=int8), intent(in) :: kernel_type

Type of kernel to build

integer(kind=int32), intent(in), optional :: pointers(:,:)

Optional pointers to unpack kernels

private subroutine execute(self, in, out, stream, source)

Executes kernel on stream

Arguments

Type IntentOptional Attributes Name
class(nvrtc_kernel), intent(inout) :: self

nvRTC Compiled kernel class

real(kind=real32), intent(in), target :: in(:)

Source pointer

real(kind=real32), intent(in), target :: out(:)

Target pointer

type(dtfft_stream_t), intent(in) :: stream

CUDA Stream

integer(kind=int32), intent(in), optional :: source

Source rank for pipelined unpacking

private subroutine destroy(self)

Destroys kernel

Arguments

Type IntentOptional Attributes Name
class(nvrtc_kernel), intent(inout) :: self

nvRTC Compiled kernel class

private subroutine mark_unused(kernel)

Takes a CUDA kernel as an argument and searches for it in the cache. If the kernel is found, then reduces ref_count and returns a null pointer.

Arguments

Type IntentOptional Attributes Name
type(CUfunction), intent(inout) :: kernel

CUDA kernel to search for

private subroutine get_neighbor_function_code(code)

Generates a device function that is used to determine the id of the process to which data is being sent, or from which data has been received, based on the local element coordinate

Arguments

Type IntentOptional Attributes Name
type(kernel_code), intent(inout) :: code

Resulting code

private subroutine get_code_init(kernel_name, base_storage, code, buffer_type)

Generates basic code that is used in all other kernels

Arguments

Type IntentOptional Attributes Name
character(len=*), intent(in) :: kernel_name

Name of CUDA kernel

integer(kind=int64), intent(in) :: base_storage

Number of bytes needed to store single element

type(kernel_code), intent(inout) :: code

Resulting code

character(len=:), intent(out), optional, allocatable :: buffer_type

Type of buffer that should be used