This module describes the NVRTC kernel class: nvrtc_kernel. It caches compiled kernels to avoid recompiling similar kernels: nvrtc_cache.
Type | Visibility | Attributes | Name | Initial | |||
---|---|---|---|---|---|---|---|
integer(kind=int32), | public, | parameter | :: | DEF_TILE_SIZE | = | 16 |
Default tile size |
integer(kind=int8), | public, | parameter | :: | KERNEL_TRANSPOSE | = | 1 |
Basic transpose kernel type. |
integer(kind=int8), | public, | parameter | :: | KERNEL_TRANSPOSE_PACKED | = | 2 |
Transposes data and packs it into contiguous buffer. Should be used only in X-Y 3D plans. |
integer(kind=int8), | public, | parameter | :: | KERNEL_UNPACK | = | 3 |
Unpacks contiguous buffer. |
integer(kind=int8), | public, | parameter | :: | KERNEL_UNPACK_SIMPLE_COPY | = | 4 |
Doesn’t actually unpack anything. Performs a simple copy instead. |
integer(kind=int8), | public, | parameter | :: | KERNEL_UNPACK_PIPELINED | = | 5 |
Unpacks the part of the contiguous buffer received from a single rank. |
integer(kind=int8), | public, | parameter | :: | KERNEL_UNPACK_PARTIAL | = | 6 |
Unpacks contiguous buffer received from everyone except myself. |
integer(kind=int32), | private, | parameter | :: | MIN_TILE_SIZE | = | 8 |
Minimum tile size. Will launch 2 warps |
integer(kind=int32), | private, | parameter | :: | TARGET_THREADS_PER_BLOCK | = | DEF_TILE_SIZE*DEF_TILE_SIZE |
Maximum number of threads to run in a block (256) |
character(len=*), | private, | parameter | :: | DEFAULT_KERNEL_NAME | = | "dtfft_kernel" |
Basic kernel name |
integer(kind=int32), | private, | parameter | :: | CACHE_PREALLOC_SIZE | = | 10 |
Number of preallocated cache entries |
type(nvrtc_cache), | private, | allocatable, save | :: | cache(:) |
Cache of compiled kernels |
||
integer(kind=int32), | private, | save | :: | cache_size | = | 0 |
Number of entries in cache |
nvRTC Compiled kernel class
Type | Visibility | Attributes | Name | Initial | |||
---|---|---|---|---|---|---|---|
logical, | private | :: | is_created | = | .false. |
Kernel is created flag. |
|
logical, | private | :: | is_dummy | = | .false. |
If kernel should do anything or not. |
|
type(CUfunction), | private | :: | cuda_kernel |
Pointer to CUDA kernel. |
|||
type(dim3), | private | :: | num_blocks |
Grid of blocks. |
|||
type(dim3), | private | :: | block_size |
Thread block. |
|||
integer(kind=int8), | private | :: | kernel_type |
Type of kernel to execute. |
|||
type(kernelArgs), | private | :: | args |
Kernel arguments. |
|||
integer(kind=int32), | private, | allocatable | :: | pointers(:,:) |
Optional pointers that hold info about counts and displacements
in |
procedure, public, pass(self) :: create | ../../ Creates kernel |
procedure, public, pass(self) :: execute | ../../ Executes kernel |
procedure, public, pass(self) :: destroy | ../../ Destroys kernel |
Class to build CUDA kernel code
Type | Visibility | Attributes | Name | Initial | |||
---|---|---|---|---|---|---|---|
character(len=:), | private, | allocatable | :: | raw |
String that holds CUDA code |
procedure, public, pass(self) :: to_cstr | ../../ Converts Fortran CUDA code to C pointer |
procedure, public, pass(self) :: add_line | ../../ Adds new line to CUDA code |
procedure, public, pass(self) :: destroy => destroy_code | ../../ Frees all memory |
Class to cache compiled kernels
Type | Visibility | Attributes | Name | Initial | |||
---|---|---|---|---|---|---|---|
integer(kind=int32), | private | :: | ref_count | = | 0 |
Number of references to this kernel |
|
type(CUmodule), | private | :: | cuda_module | = | CUmodule(c_null_ptr) |
Pointer to CUDA Module. |
|
type(CUfunction), | private | :: | cuda_kernel | = | CUfunction(c_null_ptr) |
Pointer to CUDA kernel. |
|
integer(kind=int8), | private | :: | kernel_type |
Type of kernel to execute. |
|||
type(dtfft_transpose_t), | private | :: | transpose_type |
Type of transpose |
|||
integer(kind=int32), | private | :: | tile_size |
Tile size of transpose kernel |
|||
integer(kind=int64), | private | :: | base_storage |
Number of bytes needed to store single element |
|||
logical, | private | :: | has_inner_loop |
If kernel has inner loop |
Returns tile size to use in a transpose kernel
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
integer(kind=int32), | intent(in) | :: | x |
Number of elements in x direction |
||
integer(kind=int32), | intent(in) | :: | y |
Number of elements in y direction |
Returns cached kernel if it exists. If not returns null pointer.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(dtfft_transpose_t), | intent(in) | :: | transpose_type |
Type of transposition to perform |
||
integer(kind=int8), | intent(in) | :: | kernel_type |
Type of kernel to build |
||
integer(kind=int64), | intent(in) | :: | base_storage |
Number of bytes needed to store single element |
||
integer(kind=int32), | intent(in) | :: | tile_size |
Tile size |
||
logical, | intent(in) | :: | has_inner_loop |
If kernel has inner loop |
Cached kernel
Returns generic transpose id. Since X-Y and Y-Z transpositions are symmetric, it returns only one of them. X-Z and Z-X are not symmetric.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(dtfft_transpose_t), | intent(in) | :: | transpose_type |
Type of transposition to perform |
Fixed id of transposition
Compiles kernel and caches it. Returns compiled kernel.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(MPI_Comm), | intent(in) | :: | comm |
MPI Communicator |
||
integer(kind=int32), | intent(in), | target | :: | dims(:) |
Global dimensions to process |
|
type(dtfft_transpose_t), | intent(in) | :: | transpose_type |
Type of transposition to perform |
||
integer(kind=int8), | intent(in) | :: | kernel_type |
Type of kernel to build |
||
integer(kind=int64), | intent(in) | :: | base_storage |
Number of bytes needed to store single element |
||
integer(kind=int32), | intent(in) | :: | tile_size |
Tile size |
||
logical, | intent(in) | :: | has_inner_loop |
If kernel has inner loop |
Compiled kernel to return
Generates code that will be used to locally transpose data and prepare it for sending to other processes ndims == 2
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
character(len=*), | intent(in) | :: | kernel_name |
Name of CUDA kernel |
||
integer(kind=int8), | intent(in) | :: | ndims |
Number of dimensions |
||
integer(kind=int64), | intent(in) | :: | base_storage |
Number of bytes needed to store single element |
||
type(dtfft_transpose_t), | intent(in) | :: | transpose_type |
Transpose id |
||
logical, | intent(in) | :: | enable_packing |
If data should be manually packed or not |
||
logical, | intent(in) | :: | enable_multiprocess |
If thread should process more than one element |
Resulting code
Generates code that will be used to unpack data when it is received
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
character(len=*), | intent(in) | :: | kernel_name |
Name of CUDA kernel |
||
integer(kind=int64), | intent(in) | :: | base_storage |
Number of bytes needed to store single element |
||
logical, | intent(in) | :: | is_partial |
Resulting code
Generates code that will be used to partially unpack data when it is received from other process
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
character(len=*), | intent(in) | :: | kernel_name |
Name of CUDA kernel |
||
integer(kind=int64), | intent(in) | :: | base_storage |
Number of bytes needed to store single element |
Resulting code
Removes unused modules from cuda context
Converts Fortran CUDA code to C pointer
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
class(kernel_code), | intent(in) | :: | self |
Kernel code |
||
character(len=c_char), | intent(out), | allocatable | :: | c_code(:) |
C pointer to code |
Adds new line to CUDA code
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
class(kernel_code), | intent(inout) | :: | self |
Kernel code |
||
character(len=*), | intent(in) | :: | line |
Line to add |
Frees all memory
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
class(kernel_code), | intent(inout) | :: | self |
Kernel code |
Allocates memory on a device and copies values
to it.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(c_ptr), | intent(inout) | :: | ptr |
Device pointer |
||
integer(kind=c_int), | intent(in), | target | :: | values(:) |
Values to copy |
Creates kernel
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
class(nvrtc_kernel), | intent(inout) | :: | self |
nvRTC Compiled kernel class |
||
type(MPI_Comm), | intent(in) | :: | comm |
MPI Communicator |
||
integer(kind=int32), | intent(in), | target | :: | dims(0:) |
Global dimensions to process |
|
integer(kind=int64), | intent(in) | :: | base_storage |
Number of bytes needed to store single element |
||
type(dtfft_transpose_t), | intent(in) | :: | transpose_type |
Type of transposition to perform |
||
integer(kind=int8), | intent(in) | :: | kernel_type |
Type of kernel to build |
||
integer(kind=int32), | intent(in), | optional | :: | pointers(:,:) |
Optional pointers to unpack kernels |
Executes kernel on stream
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
class(nvrtc_kernel), | intent(inout) | :: | self |
nvRTC Compiled kernel class |
||
real(kind=real32), | intent(in), | target | :: | in(:) |
Source pointer |
|
real(kind=real32), | intent(in), | target | :: | out(:) |
Target pointer |
|
type(dtfft_stream_t), | intent(in) | :: | stream |
CUDA Stream |
||
integer(kind=int32), | intent(in), | optional | :: | source |
Source rank for pipelined unpacking |
Destroys kernel
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
class(nvrtc_kernel), | intent(inout) | :: | self |
nvRTC Compiled kernel class |
Takes CUDA kernel as an argument and searches for it in cache
If the kernel is found, then reduces ref_count
and returns null pointer
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(CUfunction), | intent(inout) | :: | kernel |
CUDA kernel to search for |
Generates a device function that is used to determine the id of the process to which data is being sent, or from which data has been received, based on the local element coordinate
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(kernel_code), | intent(inout) | :: | code |
Resulting code |
Generates basic code that is used in all other kernels
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
character(len=*), | intent(in) | :: | kernel_name |
Name of CUDA kernel |
||
integer(kind=int64), | intent(in) | :: | base_storage |
Number of bytes needed to store single element |
||
type(kernel_code), | intent(inout) | :: | code |
Resulting code |
||
character(len=:), | intent(out), | optional, | allocatable | :: | buffer_type |
Type of buffer that should be used |