dtfft_interface_cuda.F90 Source File


This file depends on

dtfft_interface_cuda.F90 depends on: dtfft_errors.F90, dtfft_parameters.F90, dtfft_utils.F90
dtfft_utils.F90 depends on: dtfft_errors.F90, dtfft_parameters.F90

Files dependent on this one

Dependency edges (each file is followed by the files it depends on):

- dtfft_nvrtc_block_optimizer.F90: dtfft_interface_cuda.F90
- dtfft_nvrtc_kernel.F90: dtfft_interface_cuda.F90, dtfft_nvrtc_block_optimizer.F90, dtfft_nvrtc_kernel_cache.F90, dtfft_nvrtc_kernel_generator.F90
- dtfft_nvrtc_kernel_cache.F90: dtfft_interface_cuda.F90
- dtfft_nvrtc_kernel_generator.F90: dtfft_interface_cuda.F90, dtfft_nvrtc_block_optimizer.F90
- dtfft_transpose_plan_cuda.F90: dtfft_interface_cuda.F90, dtfft_nvrtc_kernel_cache.F90, dtfft_abstract_backend.F90, dtfft_abstract_transpose_plan.F90, dtfft_transpose_handle_cuda.F90
- dtfft_abstract_backend.F90: dtfft_nvrtc_kernel.F90
- dtfft_abstract_transpose_plan.F90: dtfft_nvrtc_kernel.F90, dtfft_abstract_backend.F90
- dtfft_plan.F90: dtfft_transpose_plan_cuda.F90, dtfft_abstract_transpose_plan.F90, dtfft_transpose_plan_host.F90
- dtfft_transpose_handle_cuda.F90: dtfft_nvrtc_kernel.F90, dtfft_abstract_backend.F90, dtfft_backend_cufftmp.F90, dtfft_backend_mpi.F90, dtfft_backend_nccl.F90
- dtfft.F90: dtfft_plan.F90
- dtfft_api.F90: dtfft_plan.F90
- dtfft_backend_cufftmp.F90: dtfft_abstract_backend.F90
- dtfft_backend_mpi.F90: dtfft_abstract_backend.F90
- dtfft_backend_nccl.F90: dtfft_abstract_backend.F90
- dtfft_transpose_plan_host.F90: dtfft_abstract_transpose_plan.F90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021 - 2025, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
module dtfft_interface_cuda
!! CUDA Driver Interfaces
!!
!! CUDA Driver is loaded at runtime via dynamic loading.
use iso_c_binding
use iso_fortran_env,              only: int32
use dtfft_errors,                 only: DTFFT_SUCCESS
use dtfft_parameters,             only: dtfft_stream_t
use dtfft_utils,                  only: string, dynamic_load, destroy_strings
implicit none
private
#include "_dtfft_private.h"
public :: load_cuda
public :: cuLaunchKernel

public :: dim3, kernelArgs, CUmodule, CUfunction

  type, bind(C) :: dim3
  !! Triple of C integers; `cuLaunchKernel` in this module reads `x`, `y`, `z`
  !! as the grid and thread-block extents of a launch.
    integer(c_int) :: x   !! Extent along X
    integer(c_int) :: y   !! Extent along Y
    integer(c_int) :: z   !! Extent along Z
  end type dim3

  type, bind(C) :: kernelArgs
  !! Extra arguments handed to nvrtc-compiled kernels.
  !!
  !! Only the first `n_ints` entries of `ints` and the first `n_ptrs` entries
  !! of `ptrs` are forwarded by `cuLaunchKernel`.
    ! integer(c_int)    :: n_longs = 0
    ! integer(c_size_t) :: longs(1)
    integer(c_int)               :: n_ints = 0   !! Count of valid entries in `ints`
    integer(c_int), dimension(5) :: ints         !! Integer arguments
    integer(c_int)               :: n_ptrs = 0   !! Count of valid entries in `ptrs`
    type(c_ptr),    dimension(3) :: ptrs         !! Pointer arguments
  end type kernelArgs

  type, bind(C) :: CUmodule
  !! Handle of a CUDA module, wrapped so it cannot be confused with other pointers
    type(c_ptr) :: ptr  !! Underlying opaque pointer
  end type CUmodule

  type, bind(C) :: CUfunction
  !! Handle of a CUDA function, wrapped so it cannot be confused with other pointers
    type(c_ptr) :: ptr  !! Underlying opaque pointer
  end type CUfunction

  abstract interface
  ! These interfaces describe C functions of the CUDA Driver API that are bound
  ! at runtime through `c_f_procpointer`, so they must use the C calling
  ! convention: `bind(C)` guarantees that no hidden character-length arguments
  ! are passed and that `value` dummies follow the C ABI.
    function cuModuleLoadData_interface(mod, image)                                                   &
      result(cuResult) bind(C)
    !! Load a module's data.
    !!
    !! Takes a pointer ``image`` and loads the corresponding module ``mod`` into the current context.
    !! The image may be a cubin or fatbin as output by nvcc, or a NULL-terminated PTX,
    !! either as output by nvcc or hand-written.
    import
      type(CUmodule)        :: mod          !! Returned module
      character(c_char)     :: image(*)     !! Module data to load
      integer(c_int)        :: cuResult     !! Driver result code
    end function cuModuleLoadData_interface

    function cuModuleUnload_interface(hmod)                                                           &
      result(cuResult) bind(C)
    !! Unloads a module.
    !!
    !! Unloads a module ``hmod`` from the current context.
    !! Attempting to unload a module which was obtained from the Library Management API
    !! such as ``cuLibraryGetModule`` will return ``CUDA_ERROR_NOT_PERMITTED``.
    import
      type(CUmodule), value :: hmod         !! Module to unload
      integer(c_int)        :: cuResult     !! Driver result code
    end function cuModuleUnload_interface

    function cuModuleGetFunction_interface(hfunc, hmod, name)                                         &
      result(cuResult) bind(C)
    !! Returns a function handle.
    !!
    !! Returns in ``hfunc`` the handle of the function of name ``name`` located in module ``hmod``.
    !! If no function of that name exists, ``cuModuleGetFunction`` returns ``CUDA_ERROR_NOT_FOUND``.
    import
      type(CUfunction)      :: hfunc        !! Returned function handle
      type(CUmodule), value :: hmod         !! Module to retrieve function from
      character(c_char)     :: name(*)      !! NULL-terminated name of function to retrieve
      integer(c_int)        :: cuResult     !! Driver result code
    end function cuModuleGetFunction_interface

    function cuLaunchKernel_interface(func, gridDimX, gridDimY, gridDimZ,                             &
                                      blockDimX, blockDimY, blockDimZ,                                &
                                      sharedMemBytes, stream, kernelParams, extra)                    &
      result(cuResult) bind(C)
    !! Launches a CUDA function CUfunction.
    !!
    !! Kernel parameters are supplied either through ``kernelParams`` or through ``extra``;
    !! the driver documentation states that passing both as non-NULL is an error.
    import
      type(CUfunction),     value :: func               !! CUDA function to launch
      integer(c_int),       value :: gridDimX           !! Grid dimensions in X
      integer(c_int),       value :: gridDimY           !! Grid dimensions in Y
      integer(c_int),       value :: gridDimZ           !! Grid dimensions in Z
      integer(c_int),       value :: blockDimX          !! Block dimensions in X
      integer(c_int),       value :: blockDimY          !! Block dimensions in Y
      integer(c_int),       value :: blockDimZ          !! Block dimensions in Z
      integer(c_int),       value :: sharedMemBytes     !! Dynamic shared-memory size per thread block in bytes
      type(dtfft_stream_t), value :: stream             !! Stream identifier
      type(c_ptr)                 :: kernelParams(*)    !! Array of pointers to kernel parameters
      type(c_ptr),          value :: extra              !! Extra options (C `void **`); passed by value so that
                                                        !! `c_null_ptr` reaches the driver as a NULL pointer
      integer(c_int)              :: cuResult           !! Driver result code
    end function cuLaunchKernel_interface
  end interface

  ! All handles start in a well-defined "not loaded" state so that
  ! `associated(...)` / `c_associated(...)` may be used on them before
  ! `load_cuda` has succeeded.
  logical,        save :: is_loaded = .false.
    !! Flag indicating whether the library is loaded
  type(c_ptr),    save :: libcuda = c_null_ptr
    !! Handle to the loaded library
  type(c_funptr), save :: cuFunctions(4) = c_null_funptr
    !! Array of pointers to the CUDA functions

  procedure(cuModuleLoadData_interface),     pointer, public  :: cuModuleLoadData => null()
    !! Fortran pointer to the cuModuleLoadData function
  procedure(cuModuleUnload_interface),       pointer, public  :: cuModuleUnload => null()
    !! Fortran pointer to the cuModuleUnload function
  procedure(cuModuleGetFunction_interface),  pointer, public  :: cuModuleGetFunction => null()
    !! Fortran pointer to the cuModuleGetFunction function
  procedure(cuLaunchKernel_interface),       pointer          :: cuLaunchKernel_ => null()
    !! Fortran pointer to the cuLaunchKernel function
contains

  function load_cuda() result(error_code)
  !! Binds the CUDA Driver entry points used by this module.
  !!
  !! Does nothing and reports `DTFFT_SUCCESS` when a previous call already succeeded.
  !! Otherwise asks `dynamic_load` to open "libcuda.so" and resolve the required symbols;
  !! on failure its error code is returned and the module stays in the "not loaded" state.
    integer(int32)            :: error_code   !! `DTFFT_SUCCESS` or the code returned by `dynamic_load`
    type(string), allocatable :: symbols(:)   !! Names of the driver functions to resolve

    error_code = DTFFT_SUCCESS

    if ( .not. is_loaded ) then
      ! Entry `k` of `symbols` is bound below from entry `k` of `cuFunctions`
      ! (presumably `dynamic_load` fills `cuFunctions` in the order of the names).
      allocate( symbols(size(cuFunctions)) )
      symbols(1) = string("cuModuleLoadData")
      symbols(2) = string("cuModuleUnload")
      symbols(3) = string("cuModuleGetFunction")
      symbols(4) = string("cuLaunchKernel")

      error_code = dynamic_load("libcuda.so", symbols, libcuda, cuFunctions)
      ! The names are no longer needed whether or not loading succeeded
      call destroy_strings(symbols)

      if ( error_code == DTFFT_SUCCESS ) then
        call c_f_procpointer(cuFunctions(1), cuModuleLoadData)
        call c_f_procpointer(cuFunctions(2), cuModuleUnload)
        call c_f_procpointer(cuFunctions(3), cuModuleGetFunction)
        call c_f_procpointer(cuFunctions(4), cuLaunchKernel_)

        is_loaded = .true.
      endif
    endif
  end function load_cuda

  function cuLaunchKernel(func, in, out, blocks, threads, stream, kernelParams) result(cuResult)
  !! Launches a CUDA kernel
  !!
  !! Builds the array of parameter addresses expected by the driver and forwards the
  !! launch to the dynamically loaded `cuLaunchKernel`. Parameters are passed to the
  !! kernel in this order: `out`, `in`, the first `n_ints` integers and then the
  !! first `n_ptrs` pointers of `kernelParams`. No dynamic shared memory is requested.
  !!
  !! Returns `CUDA_ERROR_INVALID_VALUE` without launching anything when the counts
  !! stored in `kernelParams` do not fit the arrays they describe.
    type(CUfunction),         intent(in)  :: func             !! Function CUfunction or Kernel CUkernel to launch
    type(c_ptr),      target, intent(in)  :: in               !! Input pointer
    type(c_ptr),      target, intent(in)  :: out              !! Output pointer
    type(dim3),               intent(in)  :: blocks           !! Grid in blocks
    type(dim3),               intent(in)  :: threads          !! Thread block
    type(dtfft_stream_t),     intent(in)  :: stream           !! Stream identifier
    type(kernelArgs), target, intent(in)  :: kernelParams     !! Input parameters of kernel `func`
    integer(c_int)                        :: cuResult         !! Driver result code
    integer(c_int), parameter :: CUDA_ERROR_INVALID_VALUE = 1_c_int
      !! Value of `CUDA_ERROR_INVALID_VALUE` in the driver's `CUresult` enumeration
    integer(int32), parameter :: N_FIXED_ARGS = 2
      !! Number of leading parameters every kernel receives (`out` and `in`)
    type(c_ptr)                           :: args(10)         !! Addresses of the kernel parameters
    integer(int32) :: i, offset
    logical        :: counts_ok

    ! Reject counts that would read past `ints`/`ptrs` or write outside `args`.
    ! The checks are chained so the sum is only formed from already bounded values.
    counts_ok = kernelParams%n_ints >= 0 .and. kernelParams%n_ints <= size(kernelParams%ints)
    if ( counts_ok ) counts_ok = kernelParams%n_ptrs >= 0 .and. kernelParams%n_ptrs <= size(kernelParams%ptrs)
    if ( counts_ok ) counts_ok = N_FIXED_ARGS + kernelParams%n_ints + kernelParams%n_ptrs <= size(args)
    if ( .not. counts_ok ) then
      cuResult = CUDA_ERROR_INVALID_VALUE
      return
    endif

    args(:) = c_null_ptr
    ! Addresses of pointers are required, not the pointers themselves
    args(1) = c_loc(out)
    args(2) = c_loc(in)

    offset = N_FIXED_ARGS
    ! do i = 1, kernelParams%n_longs
    !   args(offset + i) = c_loc(kernelParams%longs(i))
    ! enddo

    ! offset = offset + kernelParams%n_longs
    do i = 1, kernelParams%n_ints
      args(offset + i) = c_loc(kernelParams%ints(i))
    enddo

    offset = offset + kernelParams%n_ints
    do i = 1, kernelParams%n_ptrs
      args(offset + i) = c_loc(kernelParams%ptrs(i))
    enddo

    ! Parameters are supplied through `args`, so no `extra` options are passed
    cuResult = cuLaunchKernel_(func,                                      &
                               blocks%x,  blocks%y,  blocks%z,            &
                               threads%x, threads%y, threads%z,           &
                               0, stream, args, c_null_ptr)
  end function cuLaunchKernel
end module dtfft_interface_cuda