dtfft_interface_cuda_runtime.F90 Source File


This file depends on

sourcefile~~dtfft_interface_cuda_runtime.f90~~EfferentGraph sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_parameters.f90 dtfft_parameters.F90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_utils.f90 dtfft_utils.F90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_errors.f90 dtfft_errors.F90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_errors.f90

Files dependent on this one

sourcefile~~dtfft_interface_cuda_runtime.f90~~AfferentGraph sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_abstract_backend.f90 dtfft_abstract_backend.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_pencil.f90 dtfft_pencil.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_kernel.f90 dtfft_abstract_kernel.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_backend_cufftmp.f90 dtfft_backend_cufftmp.F90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_backend_mpi.f90 dtfft_backend_mpi.F90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_nccl.f90 dtfft_backend_nccl.F90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_compressor_zfp.f90 dtfft_compressor_zfp.F90 sourcefile~dtfft_compressor_zfp.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_config.f90 dtfft_config.F90 sourcefile~dtfft_compressor_zfp.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_executor_cufft_m.f90 dtfft_executor_cufft_m.F90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_executor.f90 dtfft_abstract_executor.F90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_kernel_device.f90 dtfft_kernel_device.F90 sourcefile~dtfft_kernel_device.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_kernel_device.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_nvrtc_block_optimizer.f90 dtfft_nvrtc_block_optimizer.F90 sourcefile~dtfft_kernel_device.f90->sourcefile~dtfft_nvrtc_block_optimizer.f90 sourcefile~dtfft_nvrtc_module_cache.f90 dtfft_nvrtc_module_cache.F90 sourcefile~dtfft_kernel_device.f90->sourcefile~dtfft_nvrtc_module_cache.f90 sourcefile~dtfft_kernel_device.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_nvrtc_block_optimizer.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_block_optimizer.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_nvrtc_block_optimizer.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_nvrtc_module.f90 dtfft_nvrtc_module.F90 sourcefile~dtfft_nvrtc_module.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_module.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_nvrtc_module.f90->sourcefile~dtfft_nvrtc_block_optimizer.f90 sourcefile~dtfft_nvrtc_module.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_nvrtc_module_cache.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_module_cache.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_nvrtc_module_cache.f90->sourcefile~dtfft_nvrtc_block_optimizer.f90 sourcefile~dtfft_nvrtc_module_cache.f90->sourcefile~dtfft_nvrtc_module.f90 sourcefile~dtfft_nvrtc_module_cache.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_cufft_m.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_fftw_m.f90 dtfft_executor_fftw_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_fftw_m.f90 sourcefile~dtfft_executor_vkfft_m.f90 dtfft_executor_vkfft_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_vkfft_m.f90 sourcefile~dtfft_reshape_plan.f90 dtfft_reshape_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_reshape_plan.f90 sourcefile~dtfft_transpose_plan.f90 dtfft_transpose_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan.f90 sourcefile~dtfft_executor_mkl_m.f90 dtfft_executor_mkl_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_mkl_m.f90 sourcefile~dtfft_reshape_plan_base.f90 dtfft_reshape_plan_base.F90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_reshape_handle.f90 dtfft_abstract_reshape_handle.F90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_abstract_reshape_handle.f90 sourcefile~dtfft_reshape_handle_datatype.f90 dtfft_reshape_handle_datatype.F90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_reshape_handle_datatype.f90 sourcefile~dtfft_reshape_handle_generic.f90 dtfft_reshape_handle_generic.F90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_reshape_handle_generic.f90 sourcefile~test_device_kernels.f90 test_device_kernels.F90 sourcefile~test_device_kernels.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~test_device_kernels.f90->sourcefile~dtfft_config.f90 sourcefile~test_device_kernels.f90->sourcefile~dtfft_kernel_device.f90 sourcefile~test_device_kernels.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_kernel_host.f90 dtfft_kernel_host.F90 sourcefile~test_device_kernels.f90->sourcefile~dtfft_kernel_host.f90 sourcefile~dtfft.f90 dtfft.F90 sourcefile~dtfft.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_abstract_executor.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_kernel.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_reshape_handle.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_abstract_reshape_handle.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_reshape_handle.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_api.f90 dtfft_api.F90 sourcefile~dtfft_api.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_api.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_api.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_reshape_handle_datatype.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_reshape_handle_datatype.f90->sourcefile~dtfft_abstract_reshape_handle.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_backend_cufftmp.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_backend_mpi.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_backend_nccl.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_kernel_device.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_abstract_reshape_handle.f90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_kernel_host.f90 sourcefile~dtfft_reshape_plan.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_reshape_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_reshape_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_reshape_plan.f90->sourcefile~dtfft_reshape_plan_base.f90 sourcefile~dtfft_reshape_plan.f90->sourcefile~dtfft_abstract_reshape_handle.f90 sourcefile~dtfft_transpose_plan.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_transpose_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_transpose_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan.f90->sourcefile~dtfft_reshape_plan_base.f90 sourcefile~dtfft_transpose_plan.f90->sourcefile~dtfft_abstract_reshape_handle.f90 sourcefile~test_compression.f90 test_compression.F90 sourcefile~test_compression.f90->sourcefile~dtfft_compressor_zfp.f90 sourcefile~test_compression.f90->sourcefile~dtfft_config.f90 sourcefile~test_compression.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~test_compression.f90->sourcefile~dtfft_kernel_host.f90 sourcefile~dtfft_executor_mkl_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~test_host_kernels.f90 test_host_kernels.F90 sourcefile~test_host_kernels.f90->sourcefile~dtfft_kernel_host.f90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021 - 2025, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
#include "dtfft_config.h"
module dtfft_interface_cuda_runtime
!! CUDA Runtime Interfaces
use iso_c_binding
use iso_fortran_env
use dtfft_parameters
use dtfft_utils
#ifdef DTFFT_WITH_MOCK_ENABLED
#include "_dtfft_mpi.h"
#include "_dtfft_private.h"
#endif
implicit none
private
public :: cudaGetErrorString
public :: cudaStreamQuery, cudaStreamCreate, cudaStreamDestroy, cudaStreamSynchronize
public :: cudaMalloc, cudaFree, cudaMemset
public :: cudaEventCreateWithFlags, cudaEventRecord, cudaStreamWaitEvent
public :: cudaEventDestroy, cudaEventCreate, cudaEventSynchronize, cudaEventElapsedTime
public :: cudaMemcpyAsync, cudaMemcpy
public :: cudaGetDevice, cudaGetDeviceCount, cudaSetDevice
public :: cudaMemGetInfo, cudaDeviceSynchronize, cudaGetLastError
public :: get_device_props

public :: cudaSuccess, cudaErrorNotReady
    enum, bind(c)
        enumerator :: cudaSuccess = 0
        enumerator :: cudaErrorInvalidValue = 1
        enumerator :: cudaErrorMemoryAllocation = 2
        enumerator :: cudaErrorInitializationError = 3
        enumerator :: cudaErrorCudartUnloading = 4
        enumerator :: cudaErrorProfilerDisabled = 5
        enumerator :: cudaErrorProfilerNotInitialized = 6
        enumerator :: cudaErrorProfilerAlreadyStarted = 7
        enumerator :: cudaErrorProfilerAlreadyStopped = 8
        enumerator :: cudaErrorInvalidConfiguration = 9
        enumerator :: cudaErrorInvalidPitchValue = 12
        enumerator :: cudaErrorInvalidSymbol = 13
        enumerator :: cudaErrorInvalidHostPointer = 16
        enumerator :: cudaErrorInvalidDevicePointer = 17
        enumerator :: cudaErrorInvalidTexture = 18
        enumerator :: cudaErrorInvalidTextureBinding = 19
        enumerator :: cudaErrorInvalidChannelDescriptor = 20
        enumerator :: cudaErrorInvalidMemcpyDirection = 21
        enumerator :: cudaErrorAddressOfConstant = 22
        enumerator :: cudaErrorTextureFetchFailed = 23
        enumerator :: cudaErrorTextureNotBound = 24
        enumerator :: cudaErrorSynchronizationError = 25
        enumerator :: cudaErrorInvalidFilterSetting = 26
        enumerator :: cudaErrorInvalidNormSetting = 27
        enumerator :: cudaErrorMixedDeviceExecution = 28
        enumerator :: cudaErrorNotYetImplemented = 31
        enumerator :: cudaErrorMemoryValueTooLarge = 32
        enumerator :: cudaErrorInsufficientDriver = 35
        enumerator :: cudaErrorInvalidSurface = 37
        enumerator :: cudaErrorDuplicateVariableName = 43
        enumerator :: cudaErrorDuplicateTextureName = 44
        enumerator :: cudaErrorDuplicateSurfaceName = 45
        enumerator :: cudaErrorDevicesUnavailable = 46
        enumerator :: cudaErrorIncompatibleDriverContext = 49
        enumerator :: cudaErrorMissingConfiguration = 52
        enumerator :: cudaErrorPriorLaunchFailure = 53
        enumerator :: cudaErrorLaunchMaxDepthExceeded = 65
        enumerator :: cudaErrorLaunchFileScopedTex = 66
        enumerator :: cudaErrorLaunchFileScopedSurf = 67
        enumerator :: cudaErrorSyncDepthExceeded = 68
        enumerator :: cudaErrorLaunchPendingCountExceeded = 69
        enumerator :: cudaErrorInvalidDeviceFunction = 98
        enumerator :: cudaErrorNoDevice = 100
        enumerator :: cudaErrorInvalidDevice = 101
        enumerator :: cudaErrorStartupFailure = 127
        enumerator :: cudaErrorInvalidKernelImage = 200
        enumerator :: cudaErrorDeviceUninitialized = 201
        enumerator :: cudaErrorMapBufferObjectFailed = 205
        enumerator :: cudaErrorUnmapBufferObjectFailed = 206
        enumerator :: cudaErrorArrayIsMapped = 207
        enumerator :: cudaErrorAlreadyMapped = 208
        enumerator :: cudaErrorNoKernelImageForDevice = 209
        enumerator :: cudaErrorAlreadyAcquired = 210
        enumerator :: cudaErrorNotMapped = 211
        enumerator :: cudaErrorNotMappedAsArray = 212
        enumerator :: cudaErrorNotMappedAsPointer = 213
        enumerator :: cudaErrorECCUncorrectable = 214
        enumerator :: cudaErrorUnsupportedLimit = 215
        enumerator :: cudaErrorDeviceAlreadyInUse = 216
        enumerator :: cudaErrorPeerAccessUnsupported = 217
        enumerator :: cudaErrorInvalidPtx = 218
        enumerator :: cudaErrorInvalidGraphicsContext = 219
        enumerator :: cudaErrorNvlinkUncorrectable = 220
        enumerator :: cudaErrorJitCompilerNotFound = 221
        enumerator :: cudaErrorInvalidSource = 300
        enumerator :: cudaErrorFileNotFound = 301
        enumerator :: cudaErrorSharedObjectSymbolNotFound = 302
        enumerator :: cudaErrorSharedObjectInitFailed = 303
        enumerator :: cudaErrorOperatingSystem = 304
        enumerator :: cudaErrorInvalidResourceHandle = 400
        enumerator :: cudaErrorIllegalState = 401
        enumerator :: cudaErrorSymbolNotFound = 500
        enumerator :: cudaErrorNotReady = 600
        enumerator :: cudaErrorIllegalAddress = 700
        enumerator :: cudaErrorLaunchOutOfResources = 701
        enumerator :: cudaErrorLaunchTimeout = 702
        enumerator :: cudaErrorLaunchIncompatibleTexturing = 703
        enumerator :: cudaErrorPeerAccessAlreadyEnabled = 704
        enumerator :: cudaErrorPeerAccessNotEnabled = 705
        enumerator :: cudaErrorSetOnActiveProcess = 708
        enumerator :: cudaErrorContextIsDestroyed = 709
        enumerator :: cudaErrorAssert = 710
        enumerator :: cudaErrorTooManyPeers = 711
        enumerator :: cudaErrorHostMemoryAlreadyRegistered = 712
        enumerator :: cudaErrorHostMemoryNotRegistered = 713
        enumerator :: cudaErrorHardwareStackError = 714
        enumerator :: cudaErrorIllegalInstruction = 715
        enumerator :: cudaErrorMisalignedAddress = 716
        enumerator :: cudaErrorInvalidAddressSpace = 717
        enumerator :: cudaErrorInvalidPc = 718
        enumerator :: cudaErrorLaunchFailure = 719
        enumerator :: cudaErrorCooperativeLaunchTooLarge = 720
        enumerator :: cudaErrorNotPermitted = 800
        enumerator :: cudaErrorNotSupported = 801
        enumerator :: cudaErrorSystemNotReady = 802
        enumerator :: cudaErrorSystemDriverMismatch = 803
        enumerator :: cudaErrorCompatNotSupportedOnDevice = 804
        enumerator :: cudaErrorStreamCaptureUnsupported = 900
        enumerator :: cudaErrorStreamCaptureInvalidated = 901
        enumerator :: cudaErrorStreamCaptureMerge = 902
        enumerator :: cudaErrorStreamCaptureUnmatched = 903
        enumerator :: cudaErrorStreamCaptureUnjoined = 904
        enumerator :: cudaErrorStreamCaptureIsolation = 905
        enumerator :: cudaErrorStreamCaptureImplicit = 906
        enumerator :: cudaErrorCapturedEvent = 907
        enumerator :: cudaErrorStreamCaptureWrongThread = 908
        enumerator :: cudaErrorTimeout = 909
        enumerator :: cudaErrorGraphExecUpdateFailure = 910
        enumerator :: cudaErrorUnknown = 999
        enumerator :: cudaErrorApiFailureBase = 10000
    end enum

public :: cudaMemcpyHostToHost,     &
          cudaMemcpyHostToDevice,   &
          cudaMemcpyDeviceToHost,   &
          cudaMemcpyDeviceToDevice, &
          cudaMemcpyDefault
    enum, bind(C)
        enumerator :: cudaMemcpyHostToHost = 0
        enumerator :: cudaMemcpyHostToDevice = 1
        enumerator :: cudaMemcpyDeviceToHost = 2
        enumerator :: cudaMemcpyDeviceToDevice = 3
        enumerator :: cudaMemcpyDefault = 4
    end enum



public :: cudaEvent
#ifdef DTFFT_WITH_MOCK_ENABLED
    type :: cudaEvent
        real(real64) :: t
    end type cudaEvent
#else
    type, bind(C) :: cudaEvent
    !! CUDA event types
        type(c_ptr) :: event  !! Handle
    end type cudaEvent
#endif

    integer(c_int), parameter, public :: cudaEventDisableTiming = 2

public :: device_props
    type, bind(C) :: device_props
    !! GPU device properties obtained from cudaDeviceProp
        integer(c_int)    :: sm_count                   !! Number of multiprocessors on device (cudaDeviceProp.multiProcessorCount)
        integer(c_int)    :: max_threads_per_sm         !! Maximum resident threads per multiprocessor (cudaDeviceProp.maxThreadsPerMultiProcessor)
        integer(c_int)    :: max_blocks_per_sm          !! Maximum number of resident blocks per multiprocessor (cudaDeviceProp.maxBlocksPerMultiProcessor)
        integer(c_size_t) :: shared_mem_per_sm          !! Shared memory per multiprocessor (cudaDeviceProp.sharedMemPerMultiprocessor)
        integer(c_int)    :: max_threads_per_block      !! Maximum number of threads per block (cudaDeviceProp.maxThreadsPerBlock)
        integer(c_size_t) :: shared_mem_per_block       !! Shared memory available per block in bytes (cudaDeviceProp.sharedMemPerBlock)
        integer(c_int)    :: l2_cache_size              !! Size of L2 cache in bytes (cudaDeviceProp.l2CacheSize)
        integer(c_int)    :: compute_capability_major   !! Major compute capability (cudaDeviceProp.major)
        integer(c_int)    :: compute_capability_minor   !! Minor compute capability (cudaDeviceProp.minor)
    end type device_props

#ifndef DTFFT_WITH_MOCK_ENABLED
! Real CUDA Runtime interfaces

    interface
        function cudaStreamQuery(stream)                                    &
            result(cudaError_t)                                             &
            bind(C, name="cudaStreamQuery")
        !! Queries an asynchronous stream for completion status.
        import
            type(dtfft_stream_t), value :: stream       !! Stream identifier
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if all operations in stream have completed,
                                                        !! or `cudaErrorNotReady` if not.
        end function cudaStreamQuery
    endinterface

    interface
        function cudaStreamCreate(stream)                                   &
            result(cudaError_t)                                             &
            bind(C, name="cudaStreamCreate")
        !! Creates an asynchronous stream.
        import
            type(dtfft_stream_t)        :: stream       !! Pointer to the created stream
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream was created successfully,
                                                        !! or an error code if there was an issue.
        end function cudaStreamCreate
    end interface

    interface
        function cudaStreamDestroy(stream)                                  &
            result(cudaError_t)                                             &
            bind(C, name="cudaStreamDestroy")
        !! Destroys an asynchronous stream.
        import
            type(dtfft_stream_t), value :: stream       !! Stream identifier
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream was destroyed successfully,
                                                        !! or an error code if there was an issue.
        end function cudaStreamDestroy
    end interface

    interface
        function cudaStreamSynchronize(stream)                              &
            result(cudaError_t)                                             &
            bind(C, name="cudaStreamSynchronize")
        !! Waits for stream tasks to complete.
        import
            type(dtfft_stream_t), value :: stream       !! Stream identifier
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream tasks completed successfully,
                                                        !! or an error code if there was an issue.
        end function cudaStreamSynchronize
    end interface

    interface
        function cudaGetErrorString_c(errcode)                              &
            result(string)                                                  &
            bind(C, name="cudaGetErrorString")
        !! Returns the string representation of an error code.
        import
            integer(c_int), value :: errcode  !! Error code
            type(c_ptr)           :: string   !! Pointer to the error string
        end function cudaGetErrorString_c
    end interface

    interface
        function cudaMalloc(ptr, count)                                     &
            result(cudaError_t)                                             &
            bind(C, name="cudaMalloc")
        !! Allocates memory on the device.
        import
            type(c_ptr)               :: ptr          !! Pointer to allocated device memory
            integer(c_size_t), value  :: count        !! Requested allocation size in bytes
            integer(c_int)            :: cudaError_t  !! Returns `cudaSuccess` if memory was allocated successfully,
                                                        !! or `cudaErrorMemoryAllocation` if the memory could not be allocated.
        end function cudaMalloc
    end interface

    interface
        function cudaFree(ptr)                                              &
            result(cudaError_t)                                             &
            bind(C, name="cudaFree")
        !! Frees memory on the device.
        import
            type(c_ptr), value :: ptr         !! Pointer to memory to free
            integer(c_int)     :: cudaError_t !! Returns `cudaSuccess` if memory was freed successfully,
                                                !! or an error code if there was an issue.
        end function cudaFree
    end interface

    interface
        function cudaMemset(ptr, val, count)                                &
            result(cudaError_t)                                             &
            bind(C, name="cudaMemset")
        !! Initializes or sets device memory to a value.
        import
            type(c_ptr),        value :: ptr          !! Pointer to device memory
            integer(c_int),     value :: val          !! Value to set
            integer(c_size_t),  value :: count        !! Size in bytes to set
            integer(c_int)            :: cudaError_t  !! Returns `cudaSuccess` if the memory was set successfully,
                                                        !! or an error code if there was an issue.
        end function cudaMemset
    end interface

    interface
        function cudaEventCreateWithFlags(event, flags)                     &
            result(cudaError_t)                                             &
            bind(C, name="cudaEventCreateWithFlags")
        !! Creates an event with the specified flags.
        import
            type(cudaEvent)             :: event        !! Event identifier
            integer(c_int),   value     :: flags        !! Flags for event creation
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the event was created successfully,
                                                        !! or an error code if there was an issue.
        end function cudaEventCreateWithFlags
    end interface

    interface
        function cudaEventRecord(event, stream)                             &
            result(cudaError_t)                                             &
            bind(C, name="cudaEventRecord")
        !! Records an event in a stream.
        import
            type(cudaEvent),      value :: event        !! Event identifier
            type(dtfft_stream_t), value :: stream       !! Stream identifier
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the event was recorded successfully,
                                                        !! or an error code if there was an issue.
        end function cudaEventRecord
    end interface

    interface
        function cudaStreamWaitEvent(stream, event, flags)                  &
            result(cudaError_t)                                             &
            bind(C, name="cudaStreamWaitEvent")
        !! Makes a stream wait on an event.
        import
            type(dtfft_stream_t), value :: stream       !! Stream identifier
            type(cudaEvent),      value :: event        !! Event identifier
            integer(c_int),       value :: flags        !! Flags for the wait operation
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the stream is waiting successfully,
                                                        !! or an error code if there was an issue.
        end function cudaStreamWaitEvent
    end interface

    interface
        function cudaEventDestroy(event)                                    &
            result(cudaError_t)                                             &
            bind(C, name="cudaEventDestroy")
        !! Destroys an event.
        import
            type(cudaEvent), value :: event       !! Event identifier
            integer(c_int)         :: cudaError_t !! Returns `cudaSuccess` if the event was destroyed successfully,
                                                    !! or an error code if there was an issue.
        end function cudaEventDestroy
    end interface

    interface
        function cudaEventCreate(event)                                     &
            result(cudaError_t)                                             &
            bind(C, name="cudaEventCreate")
        !! Creates an event.
        import
            type(cudaEvent) :: event        !! Event identifier
            integer(c_int)  :: cudaError_t  !! Returns `cudaSuccess` if the event was created successfully,
                                            !! or an error code if there was an issue.
        end function cudaEventCreate
    end interface

    interface
        function cudaEventSynchronize(event)                                &
            result(cudaError_t)                                             &
            bind(C, name="cudaEventSynchronize")
        !! Waits for an event to complete.
        import
            type(cudaEvent), value :: event         !! Event identifier
            integer(c_int)         :: cudaError_t   !! Returns `cudaSuccess` if the event completed successfully,
                                                    !! or an error code if there was an issue.
        end function cudaEventSynchronize
    end interface

    interface
        function cudaEventElapsedTime(time, start, end)                     &
            result(cudaError_t)                                             &
            bind(C, name="cudaEventElapsedTime")
        !! Computes the elapsed time between two events.
        import
            real(c_float)          :: time        !! Elapsed time in milliseconds
            type(cudaEvent), value :: start       !! Starting event
            type(cudaEvent), value :: end         !! Ending event
            integer(c_int)         :: cudaError_t !! Returns `cudaSuccess` if the elapsed time was computed successfully,
                                                    !! or an error code if there was an issue.
        end function cudaEventElapsedTime
    end interface

    interface cudaMemcpyAsync
        !! Copies data asynchronously between host and device.
        function cudaMemcpyAsync_ptr(dst, src, count, kdir, stream)         &
            result(cudaError_t)                                             &
            bind(C, name="cudaMemcpyAsync")
        import
            type(c_ptr),          value :: dst          !! Destination pointer
            type(c_ptr),          value :: src          !! Source pointer
            integer(c_size_t),    value :: count        !! Size in bytes to copy
            integer(c_int),       value :: kdir         !! Direction of copy (host-to-device, device-to-host, etc.)
            type(dtfft_stream_t), value :: stream       !! Stream identifier
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was initiated successfully,
                                                        !! or an error code if there was an issue.
        end function cudaMemcpyAsync_ptr

        function cudaMemcpyAsync_r32(dst, src, count, kdir, stream)         &
            result(cudaError_t)                                             &
            bind(C, name="cudaMemcpyAsync")
        import
            real(c_float)               :: dst          !! Destination array (32-bit float)
            real(c_float)               :: src          !! Source array (32-bit float)
            integer(c_size_t),    value :: count        !! Number of elements to copy
            integer(c_int),       value :: kdir         !! Direction of copy
            type(dtfft_stream_t), value :: stream       !! Stream identifier
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was initiated successfully,
                                                        !! or an error code if there was an issue.
        end function cudaMemcpyAsync_r32
    end interface

    interface cudaMemcpy
        !! Copies data synchronously between host and device.
        function cudaMemcpy_ptr(dst, src, count, kdir)                      &
            result(cudaError_t)                                             &
            bind(C, name="cudaMemcpy")
        import
            type(c_ptr),          value :: dst          !! Destination pointer
            type(c_ptr),          value :: src          !! Source pointer
            integer(c_size_t),    value :: count        !! Size in bytes to copy
            integer(c_int),       value :: kdir         !! Direction of copy
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was completed successfully,
                                                        !! or an error code if there was an issue.
        end function cudaMemcpy_ptr

        function cudaMemcpy_r64(dst, src, count, kdir)                      &
            result(cudaError_t)                                             &
            bind(C, name="cudaMemcpy")
        import
            real(c_double)              :: dst(*)       !! Destination array (64-bit float)
            real(c_double)              :: src(*)       !! Source array (64-bit float)
            integer(c_size_t),    value :: count        !! Number of bytes to copy
            integer(c_int),       value :: kdir         !! Direction of copy
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was completed successfully,
                                                        !! or an error code if there was an issue.
        end function cudaMemcpy_r64

        function cudaMemcpy_r32(dst, src, count, kdir)                      &
            result(cudaError_t)                                             &
            bind(C, name="cudaMemcpy")
        import
            real(c_float)               :: dst(*)       !! Destination array (32-bit float)
            real(c_float)               :: src(*)       !! Source array (32-bit float)
            integer(c_size_t),    value :: count        !! Number of bytes to copy
            integer(c_int),       value :: kdir         !! Direction of copy
            integer(c_int)              :: cudaError_t  !! Returns `cudaSuccess` if the copy was completed successfully,
                                                        !! or an error code if there was an issue.
        end function cudaMemcpy_r32
    end interface

    interface
        function cudaGetDevice(num)                                         &
            result(cudaError_t)                                             &
            bind(C, name="cudaGetDevice")
        !! Returns the current device.
        import
            integer(c_int) :: num         !! Device number
            integer(c_int) :: cudaError_t !! Returns `cudaSuccess` if the device was retrieved successfully,
                                            !! or an error code if there was an issue.
        end function cudaGetDevice
    end interface

    interface
        function cudaGetDeviceCount(num)                                    &
            result(cudaError_t)                                             &
            bind(C, name="cudaGetDeviceCount")
        !! Returns the number of available devices.
        import
            integer(c_int) :: num         !! Number of devices
            integer(c_int) :: cudaError_t !! Returns `cudaSuccess` if the device count was retrieved successfully,
                                            !! or an error code if there was an issue.
        end function cudaGetDeviceCount
    end interface

    interface
        function cudaSetDevice(num)                                         &
            result(cudaError_t)                                             &
            bind(C, name="cudaSetDevice")
        !! Sets the current device.
        import
            integer(c_int), value :: num          !! Device number
            integer(c_int)        :: cudaError_t  !! Returns `cudaSuccess` if the device was set successfully,
                                                    !! or an error code if there was an issue.
        end function cudaSetDevice
    end interface

    interface
        function cudaMemGetInfo(free, total)                                &
            result(cudaError_t)                                             &
            bind(C, name="cudaMemGetInfo")
        !! Returns the amount of free and total memory on the device.
        import
            integer(c_size_t)   :: free         !! Free memory in bytes
            integer(c_size_t)   :: total        !! Total memory in bytes
            integer(c_int)      :: cudaError_t  !! Returns `cudaSuccess` if the memory information was retrieved successfully,
                                                !! or an error code if there was an issue.
        end function cudaMemGetInfo
    end interface

    interface
        function cudaDeviceSynchronize()                                    &
            result(cudaError_t)                                             &
            bind(C, name="cudaDeviceSynchronize")
        !! Synchronizes the device, blocking until all preceding tasks in all streams have completed.
        import
            integer(c_int)    :: cudaError_t  !! Returns `cudaSuccess` if syncronization was
                                                !! or an error code if there was an issue.
        end function cudaDeviceSynchronize
    end interface

    interface
        function cudaGetLastError()                                         &
            result(cudaError_t)                                             &
            bind(C, name="cudaGetLastError")
        !! Returns the last error from a runtime call.
        import
            integer(c_int)    :: cudaError_t  !! Returns `cudaSuccess` if no error was detected
                                                !! or an error code if there was an issue.
        end function cudaGetLastError
    end interface

    interface
        subroutine get_device_props(device, props)                          &
            bind(C, name="get_device_props_cuda")
        !! Returns the CUDA device properties for a given device number.
        import
            integer(c_int), value   :: device   !! Device number
            type(device_props)      :: props    !! GPU Properties
        end subroutine get_device_props
    end interface

#else
! Mock CUDA Runtime interfaces for CPU testing

    interface cudaMemcpyAsync
        module procedure cudaMemcpyAsync_ptr
        module procedure cudaMemcpyAsync_r32
    end interface cudaMemcpyAsync

    interface cudaMemcpy
        module procedure cudaMemcpy_ptr
        module procedure cudaMemcpy_r64
        module procedure cudaMemcpy_r32
    end interface cudaMemcpy

    ! C-wrappers for mock functions (public for C interoperability)
    public :: cudaDeviceSynchronize_c
    public :: cudaStreamCreate_c
    public :: cudaStreamSynchronize_c
    public :: cudaStreamDestroy_c
    public :: cudaMallocManaged_c
    public :: cudaFree_c
    public :: cudaMemset_c
    public :: cudaGetErrorString_c_wrapper

#endif


contains

#ifdef DTFFT_WITH_MOCK_ENABLED
  ! Mock implementations for CPU testing
  
    function cudaStreamQuery(stream) result(cudaError_t)
    !! Mock: Always returns cudaSuccess
        type(dtfft_stream_t), value :: stream
        integer(c_int)              :: cudaError_t
        cudaError_t = cudaSuccess
    end function cudaStreamQuery

    function cudaStreamCreate(stream) result(cudaError_t)
    !! Mock: Creates a dummy stream
        type(dtfft_stream_t)        :: stream
        integer(c_int)              :: cudaError_t
        stream%stream = c_null_ptr
        cudaError_t = cudaSuccess
    end function cudaStreamCreate

    function cudaStreamDestroy(stream) result(cudaError_t)
    !! Mock: Does nothing
        type(dtfft_stream_t), intent(in) :: stream
        integer(c_int)                   :: cudaError_t
        cudaError_t = cudaSuccess
    end function cudaStreamDestroy

    function cudaStreamSynchronize(stream) result(cudaError_t)
    !! Mock: Does nothing, just returns success
        type(dtfft_stream_t), intent(in) :: stream
        integer(c_int)                   :: cudaError_t
        cudaError_t = cudaSuccess
    end function cudaStreamSynchronize

    function cudaGetErrorString_c(errcode) result(string)
    !! Mock: Returns error string
        integer(c_int), intent(in) :: errcode
        type(c_ptr)                :: string
        character(len=20), target, save :: error_msg = "Mock CUDA Error"
        string = c_loc(error_msg)
    end function cudaGetErrorString_c

    function cudaMalloc(ptr, count) result(cudaError_t)
    !! Mock: Allocates memory on CPU using allocate
        use, intrinsic :: iso_c_binding
        type(c_ptr)                  :: ptr
        integer(c_size_t), intent(in) :: count
        integer(c_int)               :: cudaError_t

        ptr = mem_alloc_host(count)
        if( is_null_ptr(ptr) ) then
        cudaError_t = cudaErrorMemoryAllocation
        else
        cudaError_t = cudaSuccess
        end if
    end function cudaMalloc

    function cudaFree(ptr) result(cudaError_t)
    !! Mock: Frees memory allocated on CPU
        type(c_ptr), intent(in) :: ptr
        integer(c_int)          :: cudaError_t

        if ( is_null_ptr(ptr) ) then
        cudaError_t = cudaErrorInvalidValue
        return
        end if
        call mem_free_host(ptr)
        cudaError_t = cudaSuccess
    end function cudaFree

    function cudaMemset(ptr, val, count) result(cudaError_t)
    !! Mock: Sets memory on CPU
        type(c_ptr),        intent(in) :: ptr
        integer(c_int),     intent(in) :: val
        integer(c_size_t),  intent(in) :: count
        integer(c_int)                 :: cudaError_t
        integer(c_int8_t), pointer :: temp_array(:)
        integer(c_size_t) :: i

        if ( is_null_ptr(ptr) ) then
            cudaError_t = cudaErrorInvalidValue
            return
        end if

        call c_f_pointer(ptr, temp_array, [count])
        do i = 1, count
            temp_array(i) = int(val, c_int8_t)
        end do
        cudaError_t = cudaSuccess
    end function cudaMemset

    function cudaEventCreateWithFlags(event, flags) result(cudaError_t)
    !! Mock: Creates dummy event
        type(cudaEvent)            :: event
        integer(c_int), intent(in) :: flags
        integer(c_int)             :: cudaError_t
        event%t = 0.0_real64
        cudaError_t = cudaSuccess
    end function cudaEventCreateWithFlags

    function cudaEventRecord(event, stream) result(cudaError_t)
    !! Mock: Does nothing
        type(cudaEvent),      intent(inout) :: event
        type(dtfft_stream_t), intent(in)    :: stream
        integer(c_int)                      :: cudaError_t

        event%t = MPI_Wtime()
        cudaError_t = cudaSuccess
    end function cudaEventRecord

    function cudaStreamWaitEvent(stream, event, flags) result(cudaError_t)
    !! Mock: Does nothing
        type(dtfft_stream_t), intent(in) :: stream
        type(cudaEvent),      intent(in) :: event
        integer(c_int),       intent(in) :: flags
        integer(c_int)                   :: cudaError_t
        cudaError_t = cudaSuccess
    end function cudaStreamWaitEvent

    function cudaEventDestroy(event) result(cudaError_t)
    !! Mock: Does nothing
        type(cudaEvent), intent(in) :: event
        integer(c_int)              :: cudaError_t
        cudaError_t = cudaSuccess
    end function cudaEventDestroy

    function cudaEventCreate(event) result(cudaError_t)
    !! Mock: Creates dummy event
        type(cudaEvent) :: event
        integer(c_int)  :: cudaError_t
        event%t = 0.0_real64
        cudaError_t = cudaSuccess
    end function cudaEventCreate

    function cudaEventSynchronize(event) result(cudaError_t)
    !! Mock: Does nothing
        type(cudaEvent), intent(in) :: event
        integer(c_int)              :: cudaError_t
        cudaError_t = cudaSuccess
    end function cudaEventSynchronize

    function cudaEventElapsedTime(time, start, end) result(cudaError_t)
    !! Mock: Returns zero time
        real(c_float),   intent(out) :: time
        type(cudaEvent), intent(in)  :: start
        type(cudaEvent), intent(in)  :: end
        integer(c_int)               :: cudaError_t
        time = 1000.0_c_float * real(end%t - start%t, real32)
        cudaError_t = cudaSuccess
    end function cudaEventElapsedTime

    function cudaMemcpyAsync_ptr(dst, src, count, kdir, stream) result(cudaError_t)
    !! Mock: Synchronous copy on CPU
        type(c_ptr),          intent(in) :: dst
        type(c_ptr),          intent(in) :: src
        integer(c_size_t),    intent(in) :: count
        integer(c_int),       intent(in) :: kdir
        type(dtfft_stream_t), intent(in) :: stream
        integer(c_int)                   :: cudaError_t
        integer(c_int8_t), pointer :: src_array(:), dst_array(:)

        if ( is_null_ptr(src) .or. is_null_ptr(dst)) then
            cudaError_t = cudaErrorInvalidValue
            return
        endif

        call c_f_pointer(src, src_array, [count])
        call c_f_pointer(dst, dst_array, [count])
        dst_array(:) = src_array(:)
        cudaError_t = cudaSuccess
    end function cudaMemcpyAsync_ptr

    function cudaMemcpyAsync_r32(dst, src, count, kdir, stream) result(cudaError_t)
    !! Mock: Synchronous copy on CPU for r32
        real(c_float), target,  intent(out) :: dst
        real(c_float), target,  intent(in)  :: src
        integer(c_size_t),      intent(in)  :: count
        integer(c_int),         intent(in)  :: kdir
        type(dtfft_stream_t),   intent(in)  :: stream
        integer(c_int)                    :: cudaError_t

        cudaError_t = cudaMemcpyAsync(c_loc(dst), c_loc(src), count, kdir, stream)
    end function cudaMemcpyAsync_r32

    function cudaMemcpy_ptr(dst, src, count, kdir) result(cudaError_t)
    !! Mock: Synchronous copy on CPU
        type(c_ptr),       intent(in) :: dst
        type(c_ptr),       intent(in) :: src
        integer(c_size_t), intent(in) :: count
        integer(c_int),    intent(in) :: kdir
        integer(c_int)                :: cudaError_t

        cudaError_t = cudaMemcpyAsync(dst, src, count, kdir, NULL_STREAM)
    end function cudaMemcpy_ptr

    function cudaMemcpy_r64(dst, src, count, kdir) result(cudaError_t)
    !! Mock: Synchronous copy on CPU for r64
        real(c_double),    target,  intent(out) :: dst(*)
        real(c_double),    target,  intent(in)  :: src(*)
        integer(c_size_t),          intent(in)  :: count
        integer(c_int),             intent(in)  :: kdir
        integer(c_int)                          :: cudaError_t

        cudaError_t = cudaMemcpy(c_loc(dst), c_loc(src), count, kdir)
    end function cudaMemcpy_r64

    function cudaMemcpy_r32(dst, src, count, kdir) result(cudaError_t)
    !! Mock: Synchronous copy on CPU for r32
        real(c_float),    target,   intent(out) :: dst(*)
        real(c_float),    target,   intent(in)  :: src(*)
        integer(c_size_t),          intent(in)  :: count
        integer(c_int),             intent(in)  :: kdir
        integer(c_int)                          :: cudaError_t

        cudaError_t = cudaMemcpy(c_loc(dst), c_loc(src), count, kdir)
    end function cudaMemcpy_r32

    function cudaGetDevice(num) result(cudaError_t)
    !! Mock: Returns rank of MPI_COMM_WORLD
        integer(c_int), intent(out) :: num
        integer(c_int)              :: cudaError_t
        integer(int32) :: mpi_err

        call MPI_Comm_rank(MPI_COMM_WORLD, num, mpi_err)
        cudaError_t = cudaSuccess
    end function cudaGetDevice

    function cudaGetDeviceCount(num) result(cudaError_t)
    !! Mock: Returns size of MPI_COMM_TYPE_SHARED
        integer(c_int), intent(out) :: num
        integer(c_int)              :: cudaError_t
        TYPE_MPI_COMM  :: local_comm
        integer(int32) :: ierr

        call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr)
        call MPI_Comm_size(local_comm, num, ierr)
        call MPI_Comm_free(local_comm, ierr)
        cudaError_t = cudaSuccess
    end function cudaGetDeviceCount

    function cudaSetDevice(num) result(cudaError_t)
    !! Mock: Does nothing
        integer(c_int), intent(in) :: num
        integer(c_int)             :: cudaError_t
        cudaError_t = cudaSuccess
    end function cudaSetDevice

    function cudaMemGetInfo(free, total) result(cudaError_t)
    !! Mock: Returns dummy memory info
        integer(c_size_t)   :: free, total
        integer(c_int)      :: cudaError_t
        total = 8_c_size_t * 1024_c_size_t * 1024_c_size_t * 1024_c_size_t  ! 8 GB
        free = 4_c_size_t * 1024_c_size_t * 1024_c_size_t * 1024_c_size_t   ! 4 GB
        cudaError_t = cudaSuccess
    end function cudaMemGetInfo

    function cudaDeviceSynchronize() result(cudaError_t)
    !! Mock: Does nothing
        integer(c_int)    :: cudaError_t
        cudaError_t = cudaSuccess
    end function cudaDeviceSynchronize

    function cudaGetLastError() result(cudaError_t)
    !! Mock: Returns success
        integer(c_int)    :: cudaError_t
        cudaError_t = cudaSuccess
    end function cudaGetLastError

    subroutine get_device_props(device, props)
    !! Mock: Returns dummy device properties
        integer(c_int),    intent(in)  :: device
        type(device_props), intent(out) :: props

        props%sm_count                   = 108
        props%max_threads_per_sm         = 2048
        props%max_blocks_per_sm          = 32
        props%shared_mem_per_sm          = 164 * 1024
        props%max_threads_per_block      = 1024
        props%shared_mem_per_block       = 48 * 1024
        props%l2_cache_size              = 40960 * 1024
        props%compute_capability_major   = 8
        props%compute_capability_minor   = 0
    end subroutine get_device_props

#endif

    function cudaGetErrorString(errcode) result(string)
    !! Helper function that returns a string describing the given nvrtcResult code
    !! If the error code is not recognized, "unrecognized error code" is returned.
        integer(c_int),   intent(in)  :: errcode        !! CUDA Runtime Compilation API result code.
        character(len=:), allocatable :: string         !! Result string

#ifndef DTFFT_WITH_MOCK_ENABLED
        call string_c2f(cudaGetErrorString_c(errcode), string)
#else
        if (errcode == cudaSuccess) then
            allocate(string, source="cudaSuccess (mock)")
        else
            allocate(string, source="cudaError (mock)")
        end if
#endif
    end function cudaGetErrorString

#ifdef DTFFT_WITH_MOCK_ENABLED
    ! C-wrapper functions for mock CUDA functions
    function cudaDeviceSynchronize_c() result(cudaError_t) bind(C, name="cudaDeviceSynchronize")
        integer(c_int) :: cudaError_t
        cudaError_t = cudaDeviceSynchronize()
    end function cudaDeviceSynchronize_c

    function cudaStreamCreate_c(stream) result(cudaError_t) bind(C, name="cudaStreamCreate")
        type(dtfft_stream_t) :: stream
        integer(c_int)       :: cudaError_t
        cudaError_t = cudaStreamCreate(stream)
    end function cudaStreamCreate_c

    function cudaStreamSynchronize_c(stream) result(cudaError_t) bind(C, name="cudaStreamSynchronize")
        type(dtfft_stream_t), value :: stream
        integer(c_int)              :: cudaError_t
        cudaError_t = cudaStreamSynchronize(stream)
    end function cudaStreamSynchronize_c

    function cudaStreamDestroy_c(stream) result(cudaError_t) bind(C, name="cudaStreamDestroy")
        type(dtfft_stream_t), value :: stream
        integer(c_int)              :: cudaError_t
        cudaError_t = cudaStreamDestroy(stream)
    end function cudaStreamDestroy_c

    function cudaMallocManaged_c(ptr, count, flags) result(cudaError_t) bind(C, name="cudaMallocManaged")
        type(c_ptr)              :: ptr
        integer(c_size_t), value :: count
        integer(c_int), value    :: flags
        integer(c_int)           :: cudaError_t
        ! For mock version, just call regular cudaMalloc (ignore flags)
        cudaError_t = cudaMalloc(ptr, count)
    end function cudaMallocManaged_c

    function cudaFree_c(ptr) result(cudaError_t) bind(C, name="cudaFree")
        type(c_ptr), value :: ptr
        integer(c_int)     :: cudaError_t
        cudaError_t = cudaFree(ptr)
    end function cudaFree_c

    function cudaMemset_c(ptr, val, count) result(cudaError_t) bind(C, name="cudaMemset")
        type(c_ptr), value       :: ptr
        integer(c_int), value    :: val
        integer(c_size_t), value :: count
        integer(c_int)           :: cudaError_t
        cudaError_t = cudaMemset(ptr, val, count)
    end function cudaMemset_c

    function cudaGetErrorString_c_wrapper(errcode) result(string) bind(C, name="cudaGetErrorString")
        integer(c_int), value :: errcode
        type(c_ptr)           :: string
        character(len=50), target, save :: error_msg

        if (errcode == cudaSuccess) then
            error_msg = "cudaSuccess (mock)" // c_null_char
        else
            error_msg = "cudaError (mock)" // c_null_char
        end if
        string = c_loc(error_msg)
    end function cudaGetErrorString_c_wrapper
#endif

end module dtfft_interface_cuda_runtime