dtfft_interface_cuda_runtime.F90 Source File


This file depends on

sourcefile~~dtfft_interface_cuda_runtime.f90~~EfferentGraph sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_parameters.f90 dtfft_parameters.F90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_utils.f90 dtfft_utils.F90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_errors.f90 dtfft_errors.F90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_errors.f90

Files dependent on this one

sourcefile~~dtfft_interface_cuda_runtime.f90~~AfferentGraph sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_abstract_backend.f90 dtfft_abstract_backend.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_config.f90 dtfft_config.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_nvrtc_kernel.f90 dtfft_nvrtc_kernel.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_pencil.f90 dtfft_pencil.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_transpose_plan.f90 dtfft_abstract_transpose_plan.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_backend_cufftmp.f90 dtfft_backend_cufftmp.F90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_backend_mpi.f90 dtfft_backend_mpi.F90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_nccl.f90 dtfft_backend_nccl.F90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_executor_cufft_m.f90 dtfft_executor_cufft_m.F90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_executor.f90 dtfft_abstract_executor.F90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_nvrtc_block_optimizer.f90 dtfft_nvrtc_block_optimizer.F90 sourcefile~dtfft_nvrtc_block_optimizer.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_block_optimizer.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_nvrtc_block_optimizer.f90 sourcefile~dtfft_nvrtc_kernel_cache.f90 dtfft_nvrtc_kernel_cache.F90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_nvrtc_kernel_cache.f90 sourcefile~dtfft_nvrtc_kernel_generator.f90 dtfft_nvrtc_kernel_generator.F90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_nvrtc_kernel_generator.f90 sourcefile~dtfft_nvrtc_kernel_cache.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_kernel_cache.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_nvrtc_kernel_generator.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_kernel_generator.f90->sourcefile~dtfft_nvrtc_block_optimizer.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_cufft_m.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_cuda.f90 dtfft_transpose_plan_cuda.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan_cuda.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_fftw_m.f90 dtfft_executor_fftw_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_fftw_m.f90 sourcefile~dtfft_executor_vkfft_m.f90 dtfft_executor_vkfft_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_vkfft_m.f90 sourcefile~dtfft_transpose_plan_host.f90 dtfft_transpose_plan_host.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan_host.f90 sourcefile~dtfft_executor_mkl_m.f90 dtfft_executor_mkl_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_mkl_m.f90 sourcefile~dtfft_transpose_handle_cuda.f90 dtfft_transpose_handle_cuda.F90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_cufftmp.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_mpi.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_nccl.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_nvrtc_kernel_cache.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_transpose_handle_cuda.f90 sourcefile~dtfft.f90 dtfft.F90 sourcefile~dtfft.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_abstract_executor.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_api.f90 dtfft_api.F90 sourcefile~dtfft_api.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_api.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_api.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_transpose_handle_host.f90 dtfft_transpose_handle_host.F90 sourcefile~dtfft_transpose_handle_host.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_transpose_handle_host.f90 sourcefile~dtfft_executor_mkl_m.f90->sourcefile~dtfft_abstract_executor.f90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021 - 2025, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
module dtfft_interface_cuda_runtime
!! CUDA Runtime Interfaces
use iso_c_binding
use dtfft_parameters, only: dtfft_stream_t
use dtfft_utils,      only: string_c2f
implicit none
private
public :: cudaGetErrorString

public :: cudaSuccess, cudaErrorNotReady
  enum, bind(c)
    enumerator :: cudaSuccess = 0
    enumerator :: cudaErrorInvalidValue = 1
    enumerator :: cudaErrorMemoryAllocation = 2
    enumerator :: cudaErrorInitializationError = 3
    enumerator :: cudaErrorCudartUnloading = 4
    enumerator :: cudaErrorProfilerDisabled = 5
    enumerator :: cudaErrorProfilerNotInitialized = 6
    enumerator :: cudaErrorProfilerAlreadyStarted = 7
    enumerator :: cudaErrorProfilerAlreadyStopped = 8
    enumerator :: cudaErrorInvalidConfiguration = 9
    enumerator :: cudaErrorInvalidPitchValue = 12
    enumerator :: cudaErrorInvalidSymbol = 13
    enumerator :: cudaErrorInvalidHostPointer = 16
    enumerator :: cudaErrorInvalidDevicePointer = 17
    enumerator :: cudaErrorInvalidTexture = 18
    enumerator :: cudaErrorInvalidTextureBinding = 19
    enumerator :: cudaErrorInvalidChannelDescriptor = 20
    enumerator :: cudaErrorInvalidMemcpyDirection = 21
    enumerator :: cudaErrorAddressOfConstant = 22
    enumerator :: cudaErrorTextureFetchFailed = 23
    enumerator :: cudaErrorTextureNotBound = 24
    enumerator :: cudaErrorSynchronizationError = 25
    enumerator :: cudaErrorInvalidFilterSetting = 26
    enumerator :: cudaErrorInvalidNormSetting = 27
    enumerator :: cudaErrorMixedDeviceExecution = 28
    enumerator :: cudaErrorNotYetImplemented = 31
    enumerator :: cudaErrorMemoryValueTooLarge = 32
    enumerator :: cudaErrorInsufficientDriver = 35
    enumerator :: cudaErrorInvalidSurface = 37
    enumerator :: cudaErrorDuplicateVariableName = 43
    enumerator :: cudaErrorDuplicateTextureName = 44
    enumerator :: cudaErrorDuplicateSurfaceName = 45
    enumerator :: cudaErrorDevicesUnavailable = 46
    enumerator :: cudaErrorIncompatibleDriverContext = 49
    enumerator :: cudaErrorMissingConfiguration = 52
    enumerator :: cudaErrorPriorLaunchFailure = 53
    enumerator :: cudaErrorLaunchMaxDepthExceeded = 65
    enumerator :: cudaErrorLaunchFileScopedTex = 66
    enumerator :: cudaErrorLaunchFileScopedSurf = 67
    enumerator :: cudaErrorSyncDepthExceeded = 68
    enumerator :: cudaErrorLaunchPendingCountExceeded = 69
    enumerator :: cudaErrorInvalidDeviceFunction = 98
    enumerator :: cudaErrorNoDevice = 100
    enumerator :: cudaErrorInvalidDevice = 101
    enumerator :: cudaErrorStartupFailure = 127
    enumerator :: cudaErrorInvalidKernelImage = 200
    enumerator :: cudaErrorDeviceUninitialized = 201
    enumerator :: cudaErrorMapBufferObjectFailed = 205
    enumerator :: cudaErrorUnmapBufferObjectFailed = 206
    enumerator :: cudaErrorArrayIsMapped = 207
    enumerator :: cudaErrorAlreadyMapped = 208
    enumerator :: cudaErrorNoKernelImageForDevice = 209
    enumerator :: cudaErrorAlreadyAcquired = 210
    enumerator :: cudaErrorNotMapped = 211
    enumerator :: cudaErrorNotMappedAsArray = 212
    enumerator :: cudaErrorNotMappedAsPointer = 213
    enumerator :: cudaErrorECCUncorrectable = 214
    enumerator :: cudaErrorUnsupportedLimit = 215
    enumerator :: cudaErrorDeviceAlreadyInUse = 216
    enumerator :: cudaErrorPeerAccessUnsupported = 217
    enumerator :: cudaErrorInvalidPtx = 218
    enumerator :: cudaErrorInvalidGraphicsContext = 219
    enumerator :: cudaErrorNvlinkUncorrectable = 220
    enumerator :: cudaErrorJitCompilerNotFound = 221
    enumerator :: cudaErrorInvalidSource = 300
    enumerator :: cudaErrorFileNotFound = 301
    enumerator :: cudaErrorSharedObjectSymbolNotFound = 302
    enumerator :: cudaErrorSharedObjectInitFailed = 303
    enumerator :: cudaErrorOperatingSystem = 304
    enumerator :: cudaErrorInvalidResourceHandle = 400
    enumerator :: cudaErrorIllegalState = 401
    enumerator :: cudaErrorSymbolNotFound = 500
    enumerator :: cudaErrorNotReady = 600
    enumerator :: cudaErrorIllegalAddress = 700
    enumerator :: cudaErrorLaunchOutOfResources = 701
    enumerator :: cudaErrorLaunchTimeout = 702
    enumerator :: cudaErrorLaunchIncompatibleTexturing = 703
    enumerator :: cudaErrorPeerAccessAlreadyEnabled = 704
    enumerator :: cudaErrorPeerAccessNotEnabled = 705
    enumerator :: cudaErrorSetOnActiveProcess = 708
    enumerator :: cudaErrorContextIsDestroyed = 709
    enumerator :: cudaErrorAssert = 710
    enumerator :: cudaErrorTooManyPeers = 711
    enumerator :: cudaErrorHostMemoryAlreadyRegistered = 712
    enumerator :: cudaErrorHostMemoryNotRegistered = 713
    enumerator :: cudaErrorHardwareStackError = 714
    enumerator :: cudaErrorIllegalInstruction = 715
    enumerator :: cudaErrorMisalignedAddress = 716
    enumerator :: cudaErrorInvalidAddressSpace = 717
    enumerator :: cudaErrorInvalidPc = 718
    enumerator :: cudaErrorLaunchFailure = 719
    enumerator :: cudaErrorCooperativeLaunchTooLarge = 720
    enumerator :: cudaErrorNotPermitted = 800
    enumerator :: cudaErrorNotSupported = 801
    enumerator :: cudaErrorSystemNotReady = 802
    enumerator :: cudaErrorSystemDriverMismatch = 803
    enumerator :: cudaErrorCompatNotSupportedOnDevice = 804
    enumerator :: cudaErrorStreamCaptureUnsupported = 900
    enumerator :: cudaErrorStreamCaptureInvalidated = 901
    enumerator :: cudaErrorStreamCaptureMerge = 902
    enumerator :: cudaErrorStreamCaptureUnmatched = 903
    enumerator :: cudaErrorStreamCaptureUnjoined = 904
    enumerator :: cudaErrorStreamCaptureIsolation = 905
    enumerator :: cudaErrorStreamCaptureImplicit = 906
    enumerator :: cudaErrorCapturedEvent = 907
    enumerator :: cudaErrorStreamCaptureWrongThread = 908
    enumerator :: cudaErrorTimeout = 909
    enumerator :: cudaErrorGraphExecUpdateFailure = 910
    enumerator :: cudaErrorUnknown = 999
    enumerator :: cudaErrorApiFailureBase = 10000
  end enum

public :: cudaMemcpyHostToHost,     &
          cudaMemcpyHostToDevice,   &
          cudaMemcpyDeviceToHost,   &
          cudaMemcpyDeviceToDevice, &
          cudaMemcpyDefault
  enum, bind(C)
    enumerator :: cudaMemcpyHostToHost = 0
    enumerator :: cudaMemcpyHostToDevice = 1
    enumerator :: cudaMemcpyDeviceToHost = 2
    enumerator :: cudaMemcpyDeviceToDevice = 3
    enumerator :: cudaMemcpyDefault = 4
  end enum



public :: cudaEvent
  type, bind(C) :: cudaEvent
  !! CUDA event types
    type(c_ptr) :: event  !! Handle
  end type cudaEvent

  integer(c_int), parameter, public :: cudaEventDisableTiming = 2


public :: cudaStreamQuery
  interface
    function cudaStreamQuery(stream)                              &
      result(cudaError_t)                                         &
      bind(C, name="cudaStreamQuery")
    !! Queries an asynchronous stream for completion status.
    import
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if all operations in stream have completed,
                                                  !! or `cudaErrorNotReady` if not.
    end function cudaStreamQuery
  endinterface

public :: cudaStreamCreate
  interface
    function cudaStreamCreate(stream)                              &
      result(cudaError_t)                                          &
      bind(C, name="cudaStreamCreate")
    !! Creates an asynchronous stream.
    import
      type(dtfft_stream_t)        :: stream       !! Pointer to the created stream
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream was created successfully,
                                                  !! or an error code if there was an issue.
    end function cudaStreamCreate
  end interface

public :: cudaStreamDestroy
  interface
    function cudaStreamDestroy(stream)                             &
      result(cudaError_t)                                          &
      bind(C, name="cudaStreamDestroy")
    !! Destroys an asynchronous stream.
    import
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream was destroyed successfully,
                                                  !! or an error code if there was an issue.
    end function cudaStreamDestroy
  end interface

public :: cudaStreamSynchronize
  interface
    function cudaStreamSynchronize(stream)                         &
      result(cudaError_t)                                          &
      bind(C, name="cudaStreamSynchronize")
    !! Waits for stream tasks to complete.
    import
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream tasks completed successfully,
                                                  !! or an error code if there was an issue.
    end function cudaStreamSynchronize
  end interface

  interface
    function cudaGetErrorString_c(errcode)                         &
      result(string)                                               &
      bind(C, name="cudaGetErrorString")
    !! Returns the string representation of an error code.
    import
      integer(c_int), value :: errcode  !! Error code
      type(c_ptr)           :: string   !! Pointer to the error string
    end function cudaGetErrorString_c
  end interface

public :: cudaMalloc
  interface
    function cudaMalloc(ptr, count)                                &
      result(cudaError_t)                                          &
      bind(C, name="cudaMalloc")
    !! Allocates memory on the device.
    import
      type(c_ptr)               :: ptr          !! Pointer to allocated device memory
      integer(c_size_t), value  :: count        !! Requested allocation size in bytes
      integer(c_int)            :: cudaError_t  !! Returns `cudaSuccess` if memory was allocated successfully,
                                                !! or `cudaErrorMemoryAllocation` if the memory could not be allocated.
    end function cudaMalloc
  end interface

public :: cudaFree
  interface
    function cudaFree(ptr)                                         &
      result(cudaError_t)                                          &
      bind(C, name="cudaFree")
    !! Frees memory on the device.
    import
      type(c_ptr), value :: ptr         !! Pointer to memory to free
      integer(c_int)     :: cudaError_t !! Returns `cudaSuccess` if memory was freed successfully,
                                        !! or an error code if there was an issue.
    end function cudaFree
  end interface

public :: cudaMemset
  interface
    function cudaMemset(ptr, val, count)                           &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemset")
    !! Initializes or sets device memory to a value.
    import
      type(c_ptr),        value :: ptr          !! Pointer to device memory
      integer(c_int),     value :: val          !! Value to set
      integer(c_size_t),  value :: count        !! Size in bytes to set
      integer(c_int)            :: cudaError_t  !! Returns `cudaSuccess` if the memory was set successfully,
                                                !! or an error code if there was an issue.
    end function cudaMemset
  end interface

public :: cudaEventCreateWithFlags
  interface
    function cudaEventCreateWithFlags(event, flags)                &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventCreateWithFlags")
    !! Creates an event with the specified flags.
    import
      type(cudaEvent)             :: event        !! Event identifier
      integer(c_int),   value     :: flags        !! Flags for event creation
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the event was created successfully,
                                                  !! or an error code if there was an issue.
    end function cudaEventCreateWithFlags
  end interface

public :: cudaEventRecord
  interface
    function cudaEventRecord(event, stream)                        &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventRecord")
    !! Records an event in a stream.
    import
      type(cudaEvent),      value :: event        !! Event identifier
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the event was recorded successfully,
                                                  !! or an error code if there was an issue.
    end function cudaEventRecord
  end interface

public :: cudaStreamWaitEvent
  interface
    function cudaStreamWaitEvent(stream, event, flags)             &
      result(cudaError_t)                                          &
      bind(C, name="cudaStreamWaitEvent")
    !! Makes a stream wait on an event.
    import
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      type(cudaEvent),      value :: event        !! Event identifier
      integer(c_int),       value :: flags        !! Flags for the wait operation
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream is waiting successfully,
                                                  !! or an error code if there was an issue.
    end function cudaStreamWaitEvent
  end interface

public :: cudaEventDestroy
  interface
    function cudaEventDestroy(event)                               &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventDestroy")
    !! Destroys an event.
    import
      type(cudaEvent), value :: event       !! Event identifier
      integer(c_int)         :: cudaError_t !! Returns `cudaSuccess` if the event was destroyed successfully,
                                            !! or an error code if there was an issue.
    end function cudaEventDestroy
  end interface

public :: cudaEventCreate
  interface
    function cudaEventCreate(event)                                &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventCreate")
    !! Creates an event.
    import
      type(cudaEvent) :: event        !! Event identifier
      integer(c_int)  :: cudaError_t  !! Returns `cudaSuccess` if the event was created successfully,
                                      !! or an error code if there was an issue.
    end function cudaEventCreate
  end interface

public :: cudaEventSynchronize
  interface
    function cudaEventSynchronize(event)                           &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventSynchronize")
    !! Waits for an event to complete.
    import
      type(cudaEvent), value :: event       !! Event identifier
      integer(c_int)         :: cudaError_t !! Returns `cudaSuccess` if the event completed successfully,
                                            !! or an error code if there was an issue.
    end function cudaEventSynchronize
  end interface

public :: cudaEventElapsedTime
  interface
    function cudaEventElapsedTime(time, start, end)                &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventElapsedTime")
    !! Computes the elapsed time between two events.
    import
      real(c_float)          :: time        !! Elapsed time in milliseconds
      type(cudaEvent), value :: start       !! Starting event
      type(cudaEvent), value :: end         !! Ending event
      integer(c_int)         :: cudaError_t !! Returns `cudaSuccess` if the elapsed time was computed successfully,
                                            !! or an error code if there was an issue.
    end function cudaEventElapsedTime
  end interface

public :: cudaMemcpyAsync
  interface cudaMemcpyAsync
    !! Copies data asynchronously between host and device.
    function cudaMemcpyAsync_ptr(dst, src, count, kdir, stream)    &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpyAsync")
    import
      type(c_ptr),          value :: dst          !! Destination pointer
      type(c_ptr),          value :: src          !! Source pointer
      integer(c_size_t),    value :: count        !! Size in bytes to copy
      integer(c_int),       value :: kdir         !! Direction of copy (host-to-device, device-to-host, etc.)
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was initiated successfully,
                                                  !! or an error code if there was an issue.
    end function cudaMemcpyAsync_ptr

    function cudaMemcpyAsync_r32(dst, src, count, kdir, stream)    &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpyAsync")
    import
      real(c_float)               :: dst          !! Destination array (32-bit float)
      real(c_float)               :: src          !! Source array (32-bit float)
      integer(c_size_t),    value :: count        !! Number of elements to copy
      integer(c_int),       value :: kdir         !! Direction of copy
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was initiated successfully,
                                                  !! or an error code if there was an issue.
    end function cudaMemcpyAsync_r32
  end interface

public :: cudaMemcpy
  interface cudaMemcpy
    !! Copies data synchronously between host and device.
    function cudaMemcpy_ptr(dst, src, count, kdir)                 &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpy")
    import
      type(c_ptr),          value :: dst          !! Destination pointer
      type(c_ptr),          value :: src          !! Source pointer
      integer(c_size_t),    value :: count        !! Size in bytes to copy
      integer(c_int),       value :: kdir         !! Direction of copy
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was completed successfully,
                                                  !! or an error code if there was an issue.
    end function cudaMemcpy_ptr

    function cudaMemcpy_r64(dst, src, count, kdir)                 &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpy")
    import
      real(c_double)              :: dst(*)       !! Destination array (64-bit float)
      real(c_double)              :: src(*)       !! Source array (64-bit float)
      integer(c_size_t),    value :: count        !! Number of bytes to copy
      integer(c_int),       value :: kdir         !! Direction of copy
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was completed successfully,
                                                  !! or an error code if there was an issue.
    end function cudaMemcpy_r64

    function cudaMemcpy_r32(dst, src, count, kdir)                 &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpy")
    import
      real(c_float)               :: dst(*)       !! Destination array (32-bit float)
      real(c_float)               :: src(*)       !! Source array (32-bit float)
      integer(c_size_t),    value :: count        !! Number of bytes to copy
      integer(c_int),       value :: kdir         !! Direction of copy
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was completed successfully,
                                                  !! or an error code if there was an issue.
    end function cudaMemcpy_r32
  end interface

public :: cudaGetDevice
  interface
    function cudaGetDevice(num)                                    &
      result(cudaError_t)                                          &
      bind(C, name="cudaGetDevice")
    !! Returns the current device.
    import
      integer(c_int) :: num         !! Device number
      integer(c_int) :: cudaError_t !! Returns `cudaSuccess` if the device was retrieved successfully,
                                    !! or an error code if there was an issue.
    end function cudaGetDevice
  end interface

public :: cudaGetDeviceCount
  interface
    function cudaGetDeviceCount(num)                               &
      result(cudaError_t)                                          &
      bind(C, name="cudaGetDeviceCount")
    !! Returns the number of available devices.
    import
      integer(c_int) :: num         !! Number of devices
      integer(c_int) :: cudaError_t !! Returns `cudaSuccess` if the device count was retrieved successfully,
                                    !! or an error code if there was an issue.
    end function cudaGetDeviceCount
  end interface

public :: cudaSetDevice
  interface
    function cudaSetDevice(num)                                    &
      result(cudaError_t)                                          &
      bind(C, name="cudaSetDevice")
    !! Sets the current device.
    import
      integer(c_int), value :: num          !! Device number
      integer(c_int)        :: cudaError_t  !! Returns `cudaSuccess` if the device was set successfully,
                                            !! or an error code if there was an issue.
    end function cudaSetDevice
  end interface

public :: cudaMemGetInfo
  interface
    function cudaMemGetInfo(free, total)                           &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemGetInfo")
    !! Returns the amount of free and total memory on the device.
    import
      integer(c_size_t)   :: free         !! Free memory in bytes
      integer(c_size_t)   :: total        !! Total memory in bytes
      integer(c_int)      :: cudaError_t  !! Returns `cudaSuccess` if the memory information was retrieved successfully,
                                          !! or an error code if there was an issue.
    end function cudaMemGetInfo
  end interface

public :: cudaDeviceSynchronize
    interface
      function cudaDeviceSynchronize()                             &
        result(cudaError_t)                                        &
        bind(C, name="cudaDeviceSynchronize")
      !! Synchronizes the device, blocking until all preceding tasks in all streams have completed.
      import
        integer(c_int)    :: cudaError_t  !! Returns `cudaSuccess` if syncronization was
                                          !! or an error code if there was an issue.
      end function cudaDeviceSynchronize
    end interface

public :: cudaGetLastError
    interface
      function cudaGetLastError()                                  &
        result(cudaError_t)                                        &
        bind(C, name="cudaGetLastError")
      !! Returns the last error from a runtime call.
      import
        integer(c_int)    :: cudaError_t  !! Returns `cudaSuccess` if no error was detected
                                          !! or an error code if there was an issue.
      end function cudaGetLastError
    end interface

public :: device_props
  type, bind(C) :: device_props
  !! GPU device properties obtained from cudaDeviceProp
    integer(c_int)    :: sm_count                   !! Number of multiprocessors on device (cudaDeviceProp.multiProcessorCount)
    integer(c_int)    :: max_threads_per_sm         !! Maximum resident threads per multiprocessor (cudaDeviceProp.maxThreadsPerMultiProcessor)
    integer(c_int)    :: max_blocks_per_sm          !! Maximum number of resident blocks per multiprocessor (cudaDeviceProp.maxBlocksPerMultiProcessor)
    integer(c_size_t) :: shared_mem_per_sm          !! Shared memory per multiprocessor (cudaDeviceProp.sharedMemPerMultiprocessor)
    integer(c_int)    :: max_threads_per_block      !! Maximum number of threads per block (cudaDeviceProp.maxThreadsPerBlock)
    integer(c_size_t) :: shared_mem_per_block       !! Shared memory available per block in bytes (cudaDeviceProp.sharedMemPerBlock)
    integer(c_int)    :: l2_cache_size              !! Size of L2 cache in bytes (cudaDeviceProp.l2CacheSize)
    integer(c_int)    :: compute_capability_major   !! Major compute capability (cudaDeviceProp.major)
    integer(c_int)    :: compute_capability_minor   !! Minor compute capability (cudaDeviceProp.minor)
  end type device_props

public :: get_device_props
  interface
    subroutine get_device_props(device, props)         &
      bind(C, name="get_device_props_cuda")
    !! Returns the CUDA device properties for a given device number.
    import
      integer(c_int), value   :: device   !! Device number
      type(device_props)      :: props    !! GPU Properties
    end subroutine get_device_props
  end interface


contains

  function cudaGetErrorString(errcode) result(string)
  !! Helper function that returns a string describing the given nvrtcResult code
  !! If the error code is not recognized, "unrecognized error code" is returned.
    integer(c_int),   intent(in)  :: errcode        !! CUDA Runtime Compilation API result code.
    character(len=:), allocatable :: string         !! Result string

    call string_c2f(cudaGetErrorString_c(errcode), string)
  end function cudaGetErrorString

end module dtfft_interface_cuda_runtime