dtfft_interface_cuda.F90 Source File


This file depends on

sourcefile~~dtfft_interface_cuda.f90~~EfferentGraph sourcefile~dtfft_interface_cuda.f90 dtfft_interface_cuda.F90 sourcefile~dtfft_errors.f90 dtfft_errors.F90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_errors.f90 sourcefile~dtfft_parameters.f90 dtfft_parameters.F90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_utils.f90 dtfft_utils.F90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_errors.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_parameters.f90

Files dependent on this one

sourcefile~~dtfft_interface_cuda.f90~~AfferentGraph sourcefile~dtfft_interface_cuda.f90 dtfft_interface_cuda.F90 sourcefile~dtfft_kernel_device.f90 dtfft_kernel_device.F90 sourcefile~dtfft_kernel_device.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_nvrtc_module_cache.f90 dtfft_nvrtc_module_cache.F90 sourcefile~dtfft_kernel_device.f90->sourcefile~dtfft_nvrtc_module_cache.f90 sourcefile~dtfft_nvrtc_module.f90 dtfft_nvrtc_module.F90 sourcefile~dtfft_nvrtc_module.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_nvrtc_module_cache.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_nvrtc_module_cache.f90->sourcefile~dtfft_nvrtc_module.f90 sourcefile~dtfft_reshape_plan_base.f90 dtfft_reshape_plan_base.F90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_reshape_handle_generic.f90 dtfft_reshape_handle_generic.F90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_reshape_handle_generic.f90 sourcefile~test_device_kernels.f90 test_device_kernels.F90 sourcefile~test_device_kernels.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~test_device_kernels.f90->sourcefile~dtfft_kernel_device.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_kernel_device.f90 sourcefile~dtfft_reshape_plan.f90 dtfft_reshape_plan.F90 sourcefile~dtfft_reshape_plan.f90->sourcefile~dtfft_reshape_plan_base.f90 sourcefile~dtfft_transpose_plan.f90 dtfft_transpose_plan.F90 sourcefile~dtfft_transpose_plan.f90->sourcefile~dtfft_reshape_plan_base.f90 sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_reshape_plan.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan.f90 sourcefile~dtfft.f90 dtfft.F90 sourcefile~dtfft.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_api.f90 dtfft_api.F90 sourcefile~dtfft_api.f90->sourcefile~dtfft_plan.f90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021 - 2025, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
#include "dtfft_config.h"
module dtfft_interface_cuda
!! CUDA Driver Interfaces
!!
!! CUDA Driver is loaded at runtime via dynamic loading.
use iso_c_binding
use iso_fortran_env
use dtfft_errors,                 only: DTFFT_SUCCESS
use dtfft_parameters,             only: dtfft_stream_t
use dtfft_utils
#include "_dtfft_mpi.h"
implicit none
private
#include "_dtfft_private.h"
public :: load_cuda
public :: cuLaunchKernel

public :: dim3
  type, bind(C) :: dim3
  !! Dimension specification type
    integer(c_int) :: x,y,z
  end type

  integer(int32), parameter, public :: MAX_KERNEL_ARGS = 9

public :: CUmodule
  type, bind(C) :: CUmodule
  !! CUDA module
    type(c_ptr) :: ptr  !! Actual pointer
  end type CUmodule

public :: CUfunction
#ifdef DTFFT_WITH_MOCK_ENABLED
  abstract interface
#ifndef DTFFT_WITH_OPENMP
    pure &
#endif
    subroutine simple_interface_r32(in, out, dims)
    import
      real(real32),     intent(in)    :: in(BUFFER_SPEC)            !! Source host-allocated buffer
      real(real32),     intent(inout) :: out(BUFFER_SPEC)           !! Target host-allocated buffer
      integer(int32),   intent(in)    :: dims(:)                    !! Dimensions of the array
    end subroutine simple_interface_r32

#ifndef DTFFT_WITH_OPENMP
    pure &
#endif
    subroutine simple_interface_r64(in, out, dims)
    import
      real(real64),     intent(in)    :: in(BUFFER_SPEC)            !! Source host-allocated buffer
      real(real64),     intent(inout) :: out(BUFFER_SPEC)           !! Target host-allocated buffer
      integer(int32),   intent(in)    :: dims(:)                    !! Dimensions of the array
    end subroutine simple_interface_r64

#ifndef DTFFT_WITH_OPENMP
    pure &
#endif
    subroutine simple_interface_r128(in, out, dims)
    import
      complex(real64),  intent(in)    :: in(BUFFER_SPEC)            !! Source host-allocated buffer
      complex(real64),  intent(inout) :: out(BUFFER_SPEC)           !! Target host-allocated buffer
      integer(int32),   intent(in)    :: dims(:)                    !! Dimensions of the array
    end subroutine simple_interface_r128

#ifndef DTFFT_WITH_OPENMP
    pure &
#endif
    subroutine pipe_interface_r32(in, out, dims, locals)
    import
      real(real32),     intent(in)    :: in(BUFFER_SPEC)            !! Source host-allocated buffer
      real(real32),     intent(inout) :: out(BUFFER_SPEC)           !! Target host-allocated buffer
      integer(int32),   intent(in)    :: dims(:)                    !! Dimensions of the array
      integer(int32),   intent(in)    :: locals(:)                  !! Local memory size specification
    end subroutine pipe_interface_r32

#ifndef DTFFT_WITH_OPENMP
    pure &
#endif
    subroutine pipe_interface_r64(in, out, dims, locals)
    import
      real(real64),     intent(in)    :: in(BUFFER_SPEC)            !! Source host-allocated buffer
      real(real64),     intent(inout) :: out(BUFFER_SPEC)           !! Target host-allocated buffer
      integer(int32),   intent(in)    :: dims(:)                    !! Dimensions of the array
      integer(int32),   intent(in)    :: locals(:)                  !! Local memory size specification
    end subroutine pipe_interface_r64

#ifndef DTFFT_WITH_OPENMP
    pure &
#endif
    subroutine pipe_interface_r128(in, out, dims, locals)
    import
      complex(real64),  intent(in)    :: in(BUFFER_SPEC)            !! Source host-allocated buffer
      complex(real64),  intent(inout) :: out(BUFFER_SPEC)           !! Target host-allocated buffer
      integer(int32),   intent(in)    :: dims(:)                    !! Dimensions of the array
      integer(int32),   intent(in)    :: locals(:)                  !! Local memory size specification
    end subroutine pipe_interface_r128
  end interface

  type :: CUfunction
  !! CUDA function (mock)
    procedure(simple_interface_r32), pointer,  nopass :: sfun_r32 => null()  !! Pointer to the Fortran subroutine implementing the kernel
    procedure(simple_interface_r64), pointer,  nopass :: sfun_r64 => null()  !! Pointer to the Fortran subroutine implementing the kernel
    procedure(simple_interface_r128), pointer, nopass :: sfun_r128 => null()  !! Pointer to the Fortran subroutine implementing the kernel
    procedure(pipe_interface_r32), pointer,  nopass :: pfun_r32 => null()  !! Pointer to the Fortran subroutine implementing the pipelined kernel
    procedure(pipe_interface_r64), pointer,  nopass :: pfun_r64 => null()  !! Pointer to the Fortran subroutine implementing the pipelined kernel
    procedure(pipe_interface_r128), pointer, nopass :: pfun_r128 => null()  !! Pointer to the Fortran subroutine implementing the pipelined kernel
  end type CUfunction
#else
  type, bind(C) :: CUfunction
  !! CUDA function
    type(c_ptr) :: ptr  !! Actual pointer
  end type CUfunction
#endif

  logical,        save :: is_loaded = .false.
    !! Flag indicating whether the library is loaded
  type(c_ptr),    save :: libcuda
    !! Handle to the loaded library
  type(c_funptr), save :: cuFunctions(4)
    !! Array of pointers to the CUDA functions

#ifndef DTFFT_WITH_MOCK_ENABLED
! Real CUDA Driver interfaces with dynamic loading

  abstract interface
    function cuModuleLoadData_interface(mod, image)                                                   &
      result(cuResult)
    !! Load a module's data with options.
    !!
    !! Takes a pointer image and loads the corresponding module module into the current context.
    !! The image may be a cubin or fatbin as output by nvcc, or a NULL-terminated PTX, either as output by nvcc or hand-written.
    import
      type(CUmodule)        :: mod          !! Returned module
      type(c_ptr),    value :: image        !! Module data to load
      integer(c_int)        :: cuResult     !! Driver result code
    end function cuModuleLoadData_interface

    function cuModuleUnload_interface(hmod)                                                           &
      result(cuResult)
    !! Unloads a module.
    !!
    !! Unloads a module ``hmod`` from the current context.
    !! Attempting to unload a module which was obtained from the Library Management API
    !! such as ``cuLibraryGetModule`` will return ``CUDA_ERROR_NOT_PERMITTED``.
    import
      type(CUmodule), value :: hmod         !! Module to unload
      integer(c_int)        :: cuResult     !! Driver result code
    end function cuModuleUnload_interface

    function cuModuleGetFunction_interface(hfunc, hmod, name)                                          &
      result(cuResult)
    !! Returns a function handle.
    !!
    !! Returns in ``hfunc`` the handle of the function of name name located in module hmod.
    !! If no function of that name exists, ``cuModuleGetFunction`` returns ``CUDA_ERROR_NOT_FOUND``.
    import
      type(CUfunction)      :: hfunc        !! Returns a function handle.
      type(CUmodule), value :: hmod         !! Module to retrieve function from
      type(c_ptr),    value :: name         !! Name of function to retrieve
      integer(c_int)        :: cuResult     !! Driver result code
    end function cuModuleGetFunction_interface

    function cuLaunchKernel_interface(func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, stream, kernelParams, extra)                                          &
      result(cuResult)
    !! Launches a CUDA function CUfunction.
    import
      type(CUfunction),     value :: func               !! CUDA function to launch
      integer(c_int),       value :: gridDimX           !! Grid dimensions in X
      integer(c_int),       value :: gridDimY           !! Grid dimensions in Y
      integer(c_int),       value :: gridDimZ           !! Grid dimensions in Z
      integer(c_int),       value :: blockDimX          !! Block dimensions in X
      integer(c_int),       value :: blockDimY          !! Block dimensions in Y
      integer(c_int),       value :: blockDimZ          !! Block dimensions in Z
      integer(c_int),       value :: sharedMemBytes     !! Dynamic shared memory size
      type(dtfft_stream_t), value :: stream             !! Stream identifier
      type(c_ptr)                 :: kernelParams(*)    !! Array of pointers to kernel parameters
      type(c_ptr)                 :: extra              !! Dynamic shared-memory size per thread block in bytes
      integer(c_int)              :: cuResult           !! Driver result code
    end function cuLaunchKernel_interface
  end interface

  procedure(cuModuleLoadData_interface),     pointer, public  :: cuModuleLoadData
    !! Fortran pointer to the cuModuleLoadData function
  procedure(cuModuleUnload_interface),       pointer, public  :: cuModuleUnload
    !! Fortran pointer to the cuModuleUnload function
  procedure(cuModuleGetFunction_interface),  pointer, public  :: cuModuleGetFunction
    !! Fortran pointer to the cuModuleGetFunction function
  procedure(cuLaunchKernel_interface),       pointer          :: cuLaunchKernel_
    !! Fortran pointer to the cuLaunchKernel function

#else
! Mock CUDA Driver interfaces for CPU testing
public :: cuModuleLoadData, cuModuleUnload, cuModuleGetFunction
#endif

contains

#ifdef DTFFT_WITH_MOCK_ENABLED
  ! Mock implementations for CPU testing

  function cuModuleLoadData(mod, image) result(cuResult)
  !! Mock: Creates dummy module
    type(CUmodule)         :: mod
    type(c_ptr), intent(in) :: image
    integer(c_int)         :: cuResult
    mod%ptr = image
    cuResult = 0  ! CUDA_SUCCESS
  end function cuModuleLoadData

  function cuModuleUnload(hmod) result(cuResult)
  !! Mock: Does nothing
    type(CUmodule), intent(in) :: hmod
    integer(c_int)             :: cuResult
    cuResult = 0  ! CUDA_SUCCESS
  end function cuModuleUnload

  function cuModuleGetFunction(hfunc, hmod, name) result(cuResult)
  !! Mock: Returns dummy function handle
    type(CUfunction)           :: hfunc
    type(CUmodule),  intent(in) :: hmod
    type(c_ptr),     intent(in) :: name
    integer(c_int)             :: cuResult
    ! hfunc%ptr = name
    cuResult = 0  ! CUDA_SUCCESS
  end function cuModuleGetFunction

  function load_cuda() result(error_code)
  !! Mock: Does nothing, always returns success
    integer(int32)  :: error_code
    type(string), allocatable :: func_names(:)
    integer(int32)  :: ierr

    error_code = DTFFT_SUCCESS
    if ( is_loaded ) return
    allocate(func_names(1))
    func_names(1) = string("dtfft_execute")

    ! Just try loading both libs
    ierr = dynamic_load("libdtfft.so", func_names, libcuda, cuFunctions)
    if ( ierr /= DTFFT_SUCCESS ) then
      ierr = dynamic_load("libdtfft.dylib", func_names, libcuda, cuFunctions)
    endif

    call destroy_strings(func_names)
    if ( ierr == DTFFT_SUCCESS ) then
      call unload_library(libcuda)
    endif
    is_loaded = .true.
  end function load_cuda

#else
  ! Real CUDA Driver implementation with dynamic loading

  function load_cuda() result(error_code)
  !! Loads the CUDA Driver library and needed symbols
    integer(int32)  :: error_code !! Error code
    type(string), allocatable :: func_names(:)

    error_code = DTFFT_SUCCESS
    if ( is_loaded ) return

    allocate(func_names(4))
    func_names(1) = string("cuModuleLoadData")
    func_names(2) = string("cuModuleUnload")
    func_names(3) = string("cuModuleGetFunction")
    func_names(4) = string("cuLaunchKernel")

    error_code = dynamic_load("libcuda.so", func_names, libcuda, cuFunctions)
    call destroy_strings(func_names)
    if ( error_code /= DTFFT_SUCCESS ) return

    call c_f_procpointer(cuFunctions(1), cuModuleLoadData)
    call c_f_procpointer(cuFunctions(2), cuModuleUnload)
    call c_f_procpointer(cuFunctions(3), cuModuleGetFunction)
    call c_f_procpointer(cuFunctions(4), cuLaunchKernel_)

    is_loaded = .true.
  end function load_cuda

#endif

  function cuLaunchKernel(func, in, out, blocks, threads, stream, nargs, args) result(cuResult)
  !! Launches a CUDA kernel
    type(CUfunction),         intent(in)  :: func             !! Function CUfunction or Kernel CUkernel to launch
    type(c_ptr),      target, intent(in)  :: in               !! Input pointer
    type(c_ptr),      target, intent(in)  :: out              !! Output pointer
    type(dim3),               intent(in)  :: blocks           !! Grid in blocks
    type(dim3),               intent(in)  :: threads          !! Thread block
    type(dtfft_stream_t),     intent(in)  :: stream           !! Stream identifier
    integer(int32),           intent(in)  :: nargs
    integer(int32),   target, intent(in)  :: args(MAX_KERNEL_ARGS)     !! Input parameters of kernel `func`
    integer(c_int)                        :: cuResult         !! Driver result code
    type(c_ptr)                           :: params(15)
    integer(int32) :: i, temp
    integer(int32) :: dims(3), locals(5)

#ifndef DTFFT_WITH_MOCK_ENABLED
    params(:) = c_null_ptr
    ! Addresses of pointers are required, not the pointers themselves
    params(1) = c_loc(out)
    params(2) = c_loc(in)

    temp = 2
    do i = 1, nargs
      params(temp + i) = c_loc(args(i))
    enddo
    cuResult = cuLaunchKernel_(func, blocks%x, blocks%y, blocks%z, threads%x, threads%y, threads%z, 0, stream, params, c_null_ptr)
#else
    dims(:) = 1
    locals(:) = 0

    if ( associated(func%sfun_r32) .or. associated(func%sfun_r64) .or. associated(func%sfun_r128)) then
      do i = 1, nargs
        dims(i) = args(i)
      enddo

      if ( associated(func%sfun_r32) ) then
        block
          real(real32), pointer, contiguous :: in_ptr(:), out_ptr(:)

          call c_f_pointer(in, in_ptr, [product(dims)])
          call c_f_pointer(out, out_ptr, [product(dims)])
          call func%sfun_r32(in_ptr, out_ptr, dims)
        endblock
      else if ( associated(func%sfun_r64) ) then
        block
          real(real64), pointer, contiguous :: in_ptr(:), out_ptr(:)

          call c_f_pointer(in, in_ptr, [product(dims)])
          call c_f_pointer(out, out_ptr, [product(dims)])
          call func%sfun_r64(in_ptr, out_ptr, dims)
        endblock
      else if ( associated(func%sfun_r128) ) then
        block
          complex(real64), pointer, contiguous :: in_ptr(:), out_ptr(:)

          call c_f_pointer(in, in_ptr, [product(dims)])
          call c_f_pointer(out, out_ptr, [product(dims)])
          call func%sfun_r128(in_ptr, out_ptr, dims)
        endblock
      endif
    else if ( associated(func%pfun_r32) .or. associated(func%pfun_r64) .or. associated(func%pfun_r128)) then
      if ( args(3) == -1 )then
        dims(1:2) = args(1:2)
        locals(1:5) = args(3:7)
      else
        dims(1:3) = args(1:3)
        locals(1:5) = args(4:8)
      endif
      if ( associated(func%pfun_r32) ) then
        block
          real(real32), pointer, contiguous :: in_ptr(:), out_ptr(:)

          call c_f_pointer(in, in_ptr, [product(dims)])
          call c_f_pointer(out, out_ptr, [product(dims)])
          
          call func%pfun_r32(in_ptr, out_ptr, dims, locals)
        endblock
      else if ( associated(func%pfun_r64) ) then
        block
          real(real64), pointer, contiguous :: in_ptr(:), out_ptr(:)

          call c_f_pointer(in, in_ptr, [product(dims)])
          call c_f_pointer(out, out_ptr, [product(dims)])
          call func%pfun_r64(in_ptr, out_ptr, dims, locals)
        endblock
      else if ( associated(func%pfun_r128) ) then
        block
          complex(real64), pointer, contiguous :: in_ptr(:), out_ptr(:)

          call c_f_pointer(in, in_ptr, [product(dims)])
          call c_f_pointer(out, out_ptr, [product(dims)])
          call func%pfun_r128(in_ptr, out_ptr, dims, locals)
        endblock
      endif
    else
      INTERNAL_ERROR("cuLaunchKernel: invalid function handle")
    endif

    cuResult = DTFFT_SUCCESS
#endif
  end function cuLaunchKernel
end module dtfft_interface_cuda