dtfft_interface_cuda_runtime.F90 Source File


This file depends on

sourcefile~~dtfft_interface_cuda_runtime.f90~~EfferentGraph sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_parameters.f90 dtfft_parameters.F90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_utils.f90 dtfft_utils.F90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_parameters.f90

Files dependent on this one

sourcefile~~dtfft_interface_cuda_runtime.f90~~AfferentGraph sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_abstract_backend.f90 dtfft_abstract_backend.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_kernel.f90 dtfft_nvrtc_kernel.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_pencil.f90 dtfft_pencil.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_transpose_plan.f90 dtfft_abstract_transpose_plan.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_config.f90 dtfft_config.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_backend_cufftmp.f90 dtfft_backend_cufftmp.F90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_backend_mpi.f90 dtfft_backend_mpi.F90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_nccl.f90 dtfft_backend_nccl.F90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_executor_cufft_m.f90 dtfft_executor_cufft_m.F90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_executor.f90 dtfft_abstract_executor.F90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_interface_cuda.f90 dtfft_interface_cuda.F90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_nvrtc.f90 dtfft_interface_nvrtc.F90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_nvrtc.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_cufft_m.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_cuda.f90 dtfft_transpose_plan_cuda.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan_cuda.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_fftw_m.f90 dtfft_executor_fftw_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_fftw_m.f90 sourcefile~dtfft_executor_vkfft_m.f90 dtfft_executor_vkfft_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_vkfft_m.f90 sourcefile~dtfft_transpose_plan_host.f90 dtfft_transpose_plan_host.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan_host.f90 sourcefile~dtfft_executor_mkl_m.f90 dtfft_executor_mkl_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_mkl_m.f90 sourcefile~dtfft_transpose_handle_cuda.f90 dtfft_transpose_handle_cuda.F90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_cufftmp.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_mpi.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_nccl.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_nvrtc.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_transpose_handle_cuda.f90 sourcefile~dtfft.f90 dtfft.F90 sourcefile~dtfft.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_abstract_executor.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_api.f90 dtfft_api.F90 sourcefile~dtfft_api.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_api.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_api.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_transpose_handle_host.f90 dtfft_transpose_handle_host.F90 sourcefile~dtfft_transpose_handle_host.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_transpose_handle_host.f90 sourcefile~dtfft_executor_mkl_m.f90->sourcefile~dtfft_abstract_executor.f90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
module dtfft_interface_cuda_runtime
!! CUDA Runtime Interfaces
use iso_c_binding
use dtfft_parameters, only: dtfft_stream_t
use dtfft_utils
implicit none
private
public :: cudaGetErrorString

public :: dim3
  type, bind(C) :: dim3
  !! Dimension specification type
    integer(c_int) :: x,y,z
  end type

public :: cudaSuccess, cudaErrorNotReady
  enum, bind(c)
    enumerator :: cudaSuccess = 0
    enumerator :: cudaErrorInvalidValue = 1
    enumerator :: cudaErrorMemoryAllocation = 2
    enumerator :: cudaErrorInitializationError = 3
    enumerator :: cudaErrorCudartUnloading = 4
    enumerator :: cudaErrorProfilerDisabled = 5
    enumerator :: cudaErrorProfilerNotInitialized = 6
    enumerator :: cudaErrorProfilerAlreadyStarted = 7
    enumerator :: cudaErrorProfilerAlreadyStopped = 8
    enumerator :: cudaErrorInvalidConfiguration = 9
    enumerator :: cudaErrorInvalidPitchValue = 12
    enumerator :: cudaErrorInvalidSymbol = 13
    enumerator :: cudaErrorInvalidHostPointer = 16
    enumerator :: cudaErrorInvalidDevicePointer = 17
    enumerator :: cudaErrorInvalidTexture = 18
    enumerator :: cudaErrorInvalidTextureBinding = 19
    enumerator :: cudaErrorInvalidChannelDescriptor = 20
    enumerator :: cudaErrorInvalidMemcpyDirection = 21
    enumerator :: cudaErrorAddressOfConstant = 22
    enumerator :: cudaErrorTextureFetchFailed = 23
    enumerator :: cudaErrorTextureNotBound = 24
    enumerator :: cudaErrorSynchronizationError = 25
    enumerator :: cudaErrorInvalidFilterSetting = 26
    enumerator :: cudaErrorInvalidNormSetting = 27
    enumerator :: cudaErrorMixedDeviceExecution = 28
    enumerator :: cudaErrorNotYetImplemented = 31
    enumerator :: cudaErrorMemoryValueTooLarge = 32
    enumerator :: cudaErrorInsufficientDriver = 35
    enumerator :: cudaErrorInvalidSurface = 37
    enumerator :: cudaErrorDuplicateVariableName = 43
    enumerator :: cudaErrorDuplicateTextureName = 44
    enumerator :: cudaErrorDuplicateSurfaceName = 45
    enumerator :: cudaErrorDevicesUnavailable = 46
    enumerator :: cudaErrorIncompatibleDriverContext = 49
    enumerator :: cudaErrorMissingConfiguration = 52
    enumerator :: cudaErrorPriorLaunchFailure = 53
    enumerator :: cudaErrorLaunchMaxDepthExceeded = 65
    enumerator :: cudaErrorLaunchFileScopedTex = 66
    enumerator :: cudaErrorLaunchFileScopedSurf = 67
    enumerator :: cudaErrorSyncDepthExceeded = 68
    enumerator :: cudaErrorLaunchPendingCountExceeded = 69
    enumerator :: cudaErrorInvalidDeviceFunction = 98
    enumerator :: cudaErrorNoDevice = 100
    enumerator :: cudaErrorInvalidDevice = 101
    enumerator :: cudaErrorStartupFailure = 127
    enumerator :: cudaErrorInvalidKernelImage = 200
    enumerator :: cudaErrorDeviceUninitialized = 201
    enumerator :: cudaErrorMapBufferObjectFailed = 205
    enumerator :: cudaErrorUnmapBufferObjectFailed = 206
    enumerator :: cudaErrorArrayIsMapped = 207
    enumerator :: cudaErrorAlreadyMapped = 208
    enumerator :: cudaErrorNoKernelImageForDevice = 209
    enumerator :: cudaErrorAlreadyAcquired = 210
    enumerator :: cudaErrorNotMapped = 211
    enumerator :: cudaErrorNotMappedAsArray = 212
    enumerator :: cudaErrorNotMappedAsPointer = 213
    enumerator :: cudaErrorECCUncorrectable = 214
    enumerator :: cudaErrorUnsupportedLimit = 215
    enumerator :: cudaErrorDeviceAlreadyInUse = 216
    enumerator :: cudaErrorPeerAccessUnsupported = 217
    enumerator :: cudaErrorInvalidPtx = 218
    enumerator :: cudaErrorInvalidGraphicsContext = 219
    enumerator :: cudaErrorNvlinkUncorrectable = 220
    enumerator :: cudaErrorJitCompilerNotFound = 221
    enumerator :: cudaErrorInvalidSource = 300
    enumerator :: cudaErrorFileNotFound = 301
    enumerator :: cudaErrorSharedObjectSymbolNotFound = 302
    enumerator :: cudaErrorSharedObjectInitFailed = 303
    enumerator :: cudaErrorOperatingSystem = 304
    enumerator :: cudaErrorInvalidResourceHandle = 400
    enumerator :: cudaErrorIllegalState = 401
    enumerator :: cudaErrorSymbolNotFound = 500
    enumerator :: cudaErrorNotReady = 600
    enumerator :: cudaErrorIllegalAddress = 700
    enumerator :: cudaErrorLaunchOutOfResources = 701
    enumerator :: cudaErrorLaunchTimeout = 702
    enumerator :: cudaErrorLaunchIncompatibleTexturing = 703
    enumerator :: cudaErrorPeerAccessAlreadyEnabled = 704
    enumerator :: cudaErrorPeerAccessNotEnabled = 705
    enumerator :: cudaErrorSetOnActiveProcess = 708
    enumerator :: cudaErrorContextIsDestroyed = 709
    enumerator :: cudaErrorAssert = 710
    enumerator :: cudaErrorTooManyPeers = 711
    enumerator :: cudaErrorHostMemoryAlreadyRegistered = 712
    enumerator :: cudaErrorHostMemoryNotRegistered = 713
    enumerator :: cudaErrorHardwareStackError = 714
    enumerator :: cudaErrorIllegalInstruction = 715
    enumerator :: cudaErrorMisalignedAddress = 716
    enumerator :: cudaErrorInvalidAddressSpace = 717
    enumerator :: cudaErrorInvalidPc = 718
    enumerator :: cudaErrorLaunchFailure = 719
    enumerator :: cudaErrorCooperativeLaunchTooLarge = 720
    enumerator :: cudaErrorNotPermitted = 800
    enumerator :: cudaErrorNotSupported = 801
    enumerator :: cudaErrorSystemNotReady = 802
    enumerator :: cudaErrorSystemDriverMismatch = 803
    enumerator :: cudaErrorCompatNotSupportedOnDevice = 804
    enumerator :: cudaErrorStreamCaptureUnsupported = 900
    enumerator :: cudaErrorStreamCaptureInvalidated = 901
    enumerator :: cudaErrorStreamCaptureMerge = 902
    enumerator :: cudaErrorStreamCaptureUnmatched = 903
    enumerator :: cudaErrorStreamCaptureUnjoined = 904
    enumerator :: cudaErrorStreamCaptureIsolation = 905
    enumerator :: cudaErrorStreamCaptureImplicit = 906
    enumerator :: cudaErrorCapturedEvent = 907
    enumerator :: cudaErrorStreamCaptureWrongThread = 908
    enumerator :: cudaErrorTimeout = 909
    enumerator :: cudaErrorGraphExecUpdateFailure = 910
    enumerator :: cudaErrorUnknown = 999
    enumerator :: cudaErrorApiFailureBase = 10000
  end enum

public :: cudaMemcpyHostToHost,     &
          cudaMemcpyHostToDevice,   &
          cudaMemcpyDeviceToHost,   &
          cudaMemcpyDeviceToDevice, &
          cudaMemcpyDefault
  enum, bind(C)
    enumerator :: cudaMemcpyHostToHost = 0
    enumerator :: cudaMemcpyHostToDevice = 1
    enumerator :: cudaMemcpyDeviceToHost = 2
    enumerator :: cudaMemcpyDeviceToDevice = 3
    enumerator :: cudaMemcpyDefault = 4
  end enum



public :: cudaEvent
  type, bind(C) :: cudaEvent
  !! CUDA event types
    type(c_ptr) :: event  !! Handle
  end type cudaEvent

  integer(c_int), parameter, public :: cudaEventDisableTiming = 2


public :: cudaStreamQuery
  interface
  !! Queries an asynchronous stream for completion status.
    function cudaStreamQuery(stream)                              &
      result(cudaError_t)                                         &
      bind(C, name="cudaStreamQuery")
    import
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if all operations in stream have completed, 
                                                  !! or `cudaErrorNotReady` if not.
    end function cudaStreamQuery
  endinterface

public :: cudaStreamCreate
  interface
  !! Creates an asynchronous stream.
    function cudaStreamCreate(stream)                              &
      result(cudaError_t)                                          &
      bind(C, name="cudaStreamCreate")
    import
      type(dtfft_stream_t)        :: stream       !! Pointer to the created stream
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream was created successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaStreamCreate
  end interface

public :: cudaStreamDestroy
  interface
    !! Destroys an asynchronous stream.
    function cudaStreamDestroy(stream)                             &
      result(cudaError_t)                                          &
      bind(C, name="cudaStreamDestroy")
    import
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream was destroyed successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaStreamDestroy
  end interface

public :: cudaStreamSynchronize
  interface
    !! Waits for stream tasks to complete.
    function cudaStreamSynchronize(stream)                         &
      result(cudaError_t)                                          &
      bind(C, name="cudaStreamSynchronize")
    import
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream tasks completed successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaStreamSynchronize
  end interface

  interface
  !! Returns the string representation of an error code.
    function cudaGetErrorString_c(errcode)                           &
      result(string)                                               &
      bind(C, name="cudaGetErrorString")
    import
      integer(c_int), value :: errcode  !! Error code
      type(c_ptr)           :: string   !! Pointer to the error string
    end function cudaGetErrorString_c
  end interface

public :: cudaMalloc
  interface
    !! Allocates memory on the device.
    function cudaMalloc(ptr, count)                                &
      result(cudaError_t)                                          &
      bind(C, name="cudaMalloc")
    import
      type(c_ptr)               :: ptr          !! Pointer to allocated device memory
      integer(c_size_t), value  :: count        !! Requested allocation size in bytes
      integer(c_int)            :: cudaError_t  !! Returns `cudaSuccess` if memory was allocated successfully, 
                                                !! or `cudaErrorMemoryAllocation` if the memory could not be allocated.
    end function cudaMalloc
  end interface

public :: cudaFree
  interface
  !! Frees memory on the device.
    function cudaFree(ptr)                                         &
      result(cudaError_t)                                          &
      bind(C, name="cudaFree")
    import
      type(c_ptr), value :: ptr         !! Pointer to memory to free
      integer(c_int)     :: cudaError_t !! Returns `cudaSuccess` if memory was freed successfully, 
                                        !! or an error code if there was an issue.
    end function cudaFree
  end interface

public :: cudaMemset
  interface
  !! Initializes or sets device memory to a value.
    function cudaMemset(ptr, val, count)                           &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemset")
    import
      type(c_ptr),        value :: ptr          !! Pointer to device memory
      integer(c_int),     value :: val          !! Value to set
      integer(c_size_t),  value :: count        !! Size in bytes to set
      integer(c_int)            :: cudaError_t  !! Returns `cudaSuccess` if the memory was set successfully, 
                                                !! or an error code if there was an issue.
    end function cudaMemset
  end interface

public :: cudaEventCreateWithFlags
  interface
  !! Creates an event with the specified flags.
    function cudaEventCreateWithFlags(event, flags)                &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventCreateWithFlags")
    import
      type(cudaEvent)             :: event        !! Event identifier
      integer(c_int),   value     :: flags        !! Flags for event creation
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the event was created successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaEventCreateWithFlags
  end interface

public :: cudaEventRecord
  interface
  !! Records an event in a stream.
    function cudaEventRecord(event, stream)                        &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventRecord")
    import
      type(cudaEvent),      value :: event        !! Event identifier
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the event was recorded successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaEventRecord
  end interface

public :: cudaStreamWaitEvent
  interface
  !! Makes a stream wait on an event.
    function cudaStreamWaitEvent(stream, event, flags)             &
      result(cudaError_t)                                          &
      bind(C, name="cudaStreamWaitEvent")
    import
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      type(cudaEvent),      value :: event        !! Event identifier
      integer(c_int),       value :: flags        !! Flags for the wait operation
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream is waiting successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaStreamWaitEvent
  end interface

public :: cudaEventDestroy
  interface
  !! Destroys an event.
    function cudaEventDestroy(event)                               &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventDestroy")
    import
      type(cudaEvent), value :: event       !! Event identifier
      integer(c_int)         :: cudaError_t !! Returns `cudaSuccess` if the event was destroyed successfully, 
                                            !! or an error code if there was an issue.
    end function cudaEventDestroy
  end interface

public :: cudaEventCreate
  interface
  !! Creates an event.
    function cudaEventCreate(event)                                &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventCreate")
    import
      type(cudaEvent) :: event        !! Event identifier
      integer(c_int)  :: cudaError_t  !! Returns `cudaSuccess` if the event was created successfully, 
                                      !! or an error code if there was an issue.
    end function cudaEventCreate
  end interface

public :: cudaEventSynchronize
  interface
  !! Waits for an event to complete.
    function cudaEventSynchronize(event)                           &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventSynchronize")
    import
      type(cudaEvent), value :: event       !! Event identifier
      integer(c_int)         :: cudaError_t !! Returns `cudaSuccess` if the event completed successfully, 
                                            !! or an error code if there was an issue.
    end function cudaEventSynchronize
  end interface

public :: cudaEventElapsedTime
  interface
  !! Computes the elapsed time between two events.
    function cudaEventElapsedTime(time, start, end)                &
      result(cudaError_t)                                          &
      bind(C, name="cudaEventElapsedTime")
    import
      real(c_float)          :: time        !! Elapsed time in milliseconds
      type(cudaEvent), value :: start       !! Starting event
      type(cudaEvent), value :: end         !! Ending event
      integer(c_int)         :: cudaError_t !! Returns `cudaSuccess` if the elapsed time was computed successfully, 
                                            !! or an error code if there was an issue.
    end function cudaEventElapsedTime
  end interface

public :: cudaMemcpyAsync
  interface cudaMemcpyAsync
    !! Copies data asynchronously between host and device.
    function cudaMemcpyAsync_ptr(dst, src, count, kdir, stream)    &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpyAsync")
    import
      type(c_ptr),          value :: dst          !! Destination pointer
      type(c_ptr),          value :: src          !! Source pointer
      integer(c_size_t),    value :: count        !! Size in bytes to copy
      integer(c_int),       value :: kdir         !! Direction of copy (host-to-device, device-to-host, etc.)
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was initiated successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaMemcpyAsync_ptr

    function cudaMemcpyAsync_r32(dst, src, count, kdir, stream)    &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpyAsync")
    import
      real(c_float)               :: dst          !! Destination array (32-bit float)
      real(c_float)               :: src          !! Source array (32-bit float)
      integer(c_size_t),    value :: count        !! Number of elements to copy
      integer(c_int),       value :: kdir         !! Direction of copy
      type(dtfft_stream_t), value :: stream       !! Stream identifier
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was initiated successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaMemcpyAsync_r32
  end interface

public :: cudaMemcpy
  interface cudaMemcpy
    !! Copies data synchronously between host and device.
    function cudaMemcpy_ptr(dst, src, count, kdir)                 &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpy")
    import
      type(c_ptr),          value :: dst          !! Destination pointer
      type(c_ptr),          value :: src          !! Source pointer
      integer(c_size_t),    value :: count        !! Size in bytes to copy
      integer(c_int),       value :: kdir         !! Direction of copy
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was completed successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaMemcpy_ptr

    function cudaMemcpy_r64(dst, src, count, kdir)                 &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpy")
    import
      real(c_double)              :: dst(*)       !! Destination array (64-bit float)
      real(c_double)              :: src(*)       !! Source array (64-bit float)
      integer(c_size_t),    value :: count        !! Number of bytes to copy
      integer(c_int),       value :: kdir         !! Direction of copy
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was completed successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaMemcpy_r64

    function cudaMemcpy_r32(dst, src, count, kdir)                 &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemcpy")
    import
      real(c_float)               :: dst(*)       !! Destination array (32-bit float)
      real(c_float)               :: src(*)       !! Source array (32-bit float)
      integer(c_size_t),    value :: count        !! Number of bytes to copy
      integer(c_int),       value :: kdir         !! Direction of copy
      integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was completed successfully, 
                                                  !! or an error code if there was an issue.
    end function cudaMemcpy_r32
  end interface

public :: cudaGetDevice
  interface
    !! Returns the current device.
    function cudaGetDevice(num)                                    &
      result(cudaError_t)                                          &
      bind(C, name="cudaGetDevice")
    import
      integer(c_int) :: num         !! Device number
      integer(c_int) :: cudaError_t !! Returns `cudaSuccess` if the device was retrieved successfully, 
                                    !! or an error code if there was an issue.
    end function cudaGetDevice
  end interface

public :: cudaGetDeviceCount
  interface
    !! Returns the number of available devices.
    function cudaGetDeviceCount(num)                               &
      result(cudaError_t)                                          &
      bind(C, name="cudaGetDeviceCount")
    import
      integer(c_int) :: num         !! Number of devices
      integer(c_int) :: cudaError_t !! Returns `cudaSuccess` if the device count was retrieved successfully, 
                                    !! or an error code if there was an issue.
    end function cudaGetDeviceCount
  end interface

public :: cudaSetDevice
  interface
    !! Sets the current device.
    function cudaSetDevice(num)                                    &
      result(cudaError_t)                                          &
      bind(C, name="cudaSetDevice")
    import
      integer(c_int), value :: num          !! Device number
      integer(c_int)        :: cudaError_t  !! Returns `cudaSuccess` if the device was set successfully, 
                                            !! or an error code if there was an issue.
    end function cudaSetDevice
  end interface

public :: get_cuda_architecture
  interface
    !! Returns the CUDA architecture for a given device.
    subroutine get_cuda_architecture(device, major, minor)         &
      bind(C, name="get_cuda_architecture")
    import
      integer(c_int), value :: device  !! Device number
      integer(c_int)        :: major   !! Major compute capability
      integer(c_int)        :: minor   !! Minor compute capability
    end subroutine get_cuda_architecture
  end interface

public :: cudaMemGetInfo
  interface
    !! Returns the amount of free and total memory on the device.
    function cudaMemGetInfo(free, total)                           &
      result(cudaError_t)                                          &
      bind(C, name="cudaMemGetInfo")
    import
      integer(c_size_t)   :: free         !! Free memory in bytes
      integer(c_size_t)   :: total        !! Total memory in bytes
      integer(c_int)      :: cudaError_t  !! Returns `cudaSuccess` if the memory information was retrieved successfully, 
                                          !! or an error code if there was an issue.
    end function cudaMemGetInfo
  end interface

public :: cudaDeviceSynchronize
    interface
    !! Synchronizes the device, blocking until all preceding tasks in all streams have completed.
      function cudaDeviceSynchronize()                               &
        result(cudaError_t)                                          &
        bind(C, name="cudaDeviceSynchronize")
      import
        integer(c_int)    :: cudaError_t  !! Returns `cudaSuccess` if the device was set successfully, 
                                          !! or an error code if there was an issue.
      end function cudaDeviceSynchronize
    end interface


contains

  function cudaGetErrorString(errcode) result(string)
  !! Helper function that returns a string describing the given nvrtcResult code
  !! If the error code is not recognized, "unrecognized error code" is returned.
    integer(c_int),   intent(in)  :: errcode        !! CUDA Runtime Compilation API result code.
    character(len=:), allocatable :: string         !! Result string

    call string_c2f(cudaGetErrorString_c(errcode), string)
  end function cudaGetErrorString

end module dtfft_interface_cuda_runtime