dtfft_kernel_device.F90 Source File


This file depends on

dtfft_abstract_kernel.F90
dtfft_config.F90
dtfft_interface_cuda.F90
dtfft_interface_cuda_runtime.F90
dtfft_interface_nvtx.F90
dtfft_nvrtc_block_optimizer.F90
dtfft_nvrtc_module_cache.F90
dtfft_parameters.F90
dtfft_utils.F90

Files dependent on this one

dtfft_transpose_handle_generic.F90
dtfft_transpose_plan.F90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021 - 2025, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
#include "dtfft_config.h"
module dtfft_kernel_device
!! This module defines the [[kernel_device]] type and its type-bound procedures.
!! It extends the [[abstract_kernel]] type and implements its deferred procedures.
use iso_c_binding
use iso_fortran_env
use dtfft_abstract_kernel
use dtfft_config
use dtfft_interface_cuda,         only: MAX_KERNEL_ARGS, dim3, CUfunction, cuLaunchKernel
use dtfft_interface_cuda_runtime
use dtfft_nvrtc_block_optimizer,  only: N_CANDIDATES,                       &
                                        kernel_config,                      &
                                        generate_candidates,                &
                                        evaluate_analytical_performance,    &
                                        sort_candidates_by_score
use dtfft_nvrtc_module_cache
use dtfft_parameters
use dtfft_utils
#include "_dtfft_mpi.h"
#include "_dtfft_cuda.h"
#include "_dtfft_profile.h"
#include "_dtfft_private.h"
implicit none
private
public :: kernel_device

  integer(int32),   parameter, public :: DEF_TILE_SIZE = 32
    !! Default tile size

  type, extends(abstract_kernel) :: kernel_device
  !! Device kernel class
  private
    type(kernel_type_t)   :: internal_kernel_type     !! Actual kernel type used for execution; may differ from `kernel_type`
    type(CUfunction)      :: cuda_kernel              !! Pointer to CUDA kernel.
    integer(int32)        :: tile_size                !! Tile size used for this kernel
    integer(int32)        :: block_rows               !! Number of rows in each block processed by each thread
    integer(int64)        :: copy_bytes               !! Number of bytes to copy for `KERNEL_UNPACK_SIMPLE_COPY` kernel
  contains
    procedure :: create_private => create   !! Creates kernel
    procedure :: execute_private => execute !! Executes kernel
    procedure :: destroy_private => destroy !! Destroys kernel
  end type kernel_device

contains

  subroutine create(self, effort, base_storage, force_effort)
  !! Creates kernel
    class(kernel_device),     intent(inout) :: self             !! Device kernel class
    type(dtfft_effort_t),     intent(in)    :: effort           !! Effort level for generating transpose kernels
    integer(int64),           intent(in)    :: base_storage     !! Number of bytes needed to store a single element
    logical,        optional, intent(in)    :: force_effort     !! Should effort be forced or not
    type(device_props)                      :: props              !! GPU architecture properties
    integer(int32)                          :: device_id          !! Device ID

    call self%destroy()

    if ( self%kernel_type == KERNEL_UNPACK_SIMPLE_COPY ) then
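      ! No compiled kernel is needed here: execute() falls back to a plain cudaMemcpyAsync.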
      self%is_created = .true.
      self%copy_bytes = base_storage * product(self%dims)
      return
    endif

    self%internal_kernel_type = self%kernel_type
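    ! Generic unpack and backward-end kernels run in their pipelined form, one launch per neighbor (see the loop in execute).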
    if ( self%kernel_type == KERNEL_UNPACK )                              &
      self%internal_kernel_type = KERNEL_UNPACK_PIPELINED
    if ( self%kernel_type == KERNEL_PERMUTE_BACKWARD_END )  &
      self%internal_kernel_type = KERNEL_PERMUTE_BACKWARD_END_PIPELINED

    CUDA_CALL( cudaGetDevice(device_id) )
    call get_device_props(device_id, props)
    if ( allocated( self%neighbor_data ) ) then
      call get_kernel(self%dims, self%internal_kernel_type, effort, base_storage, props,    &
                      self%tile_size, self%block_rows, self%cuda_kernel,                    &
                      force_effort=force_effort, neighbor_data=self%neighbor_data(:, 1))
    else
      call get_kernel(self%dims, self%internal_kernel_type, effort, base_storage, props,    &
                      self%tile_size, self%block_rows, self%cuda_kernel, force_effort=force_effort)
    endif
  end subroutine create

  subroutine execute(self, in, out, stream, neighbor)
  !! Executes kernel on stream
    class(kernel_device),       intent(inout) :: self           !! Device kernel class
    real(real32),    target,    intent(in)    :: in(:)          !! Device pointer
    real(real32),    target,    intent(inout) :: out(:)         !! Device pointer
    type(dtfft_stream_t),       intent(in)    :: stream         !! Stream to execute on
    integer(int32),   optional, intent(in)    :: neighbor       !! Source rank for pipelined unpacking
    integer(int32) :: nargs, neighbor_count, n
    integer(int32) :: args(MAX_KERNEL_ARGS)
    type(dim3) :: blocks, threads

    if ( self%kernel_type == KERNEL_UNPACK_SIMPLE_COPY ) then
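      ! Plain copy: no kernel launch, just an asynchronous device-to-device memcpy.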
      CUDA_CALL( cudaMemcpyAsync(c_loc(out), c_loc(in), self%copy_bytes, cudaMemcpyDeviceToDevice, stream) )
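      ! In debug builds the stream is synchronized so asynchronous errors surface at the launch site.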
#ifdef DTFFT_DEBUG
      CUDA_CALL( cudaStreamSynchronize(stream) )
#endif
      return
    endif

    if( any(self%kernel_type == [KERNEL_PERMUTE_FORWARD, KERNEL_PERMUTE_BACKWARD, KERNEL_PERMUTE_BACKWARD_START]) ) then
      call get_kernel_launch_params(self%kernel_type, self%dims, self%tile_size, self%block_rows, blocks, threads)
      call get_kernel_args(self%kernel_type, self%dims, nargs, args)
      CUDA_CALL( cuLaunchKernel(self%cuda_kernel, c_loc(in), c_loc(out), blocks, threads, stream, nargs, args) )
#ifdef DTFFT_DEBUG
      CUDA_CALL( cudaStreamSynchronize(stream) )
#endif
      return
    endif

    if ( any(self%kernel_type == [KERNEL_UNPACK_PIPELINED, KERNEL_PERMUTE_BACKWARD_END_PIPELINED]) ) then
      call get_kernel_launch_params(self%kernel_type, self%neighbor_data(1:3, neighbor), self%tile_size, self%block_rows, blocks, threads )
      call get_kernel_args(self%kernel_type, self%dims, nargs, args, self%neighbor_data(:, neighbor))
      CUDA_CALL( cuLaunchKernel(self%cuda_kernel, c_loc(in), c_loc(out), blocks, threads, stream, nargs, args) )
#ifdef DTFFT_DEBUG
      CUDA_CALL( cudaStreamSynchronize(stream) )
#endif
      return
    endif

    neighbor_count = size(self%neighbor_data, dim=2)
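    ! Fall-through: KERNEL_UNPACK and KERNEL_PERMUTE_BACKWARD_END run as pipelined launches, one per neighbor.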
    do n = 1, neighbor_count
      call get_kernel_launch_params(self%internal_kernel_type, self%neighbor_data(1:3, n), self%tile_size, self%block_rows, blocks, threads )
      call get_kernel_args(self%internal_kernel_type, self%dims, nargs, args, self%neighbor_data(:, n))
      CUDA_CALL( cuLaunchKernel(self%cuda_kernel, c_loc(in), c_loc(out), blocks, threads, stream, nargs, args) )
#ifdef DTFFT_DEBUG
      CUDA_CALL( cudaStreamSynchronize(stream) )
#endif
    enddo
  end subroutine execute

  subroutine destroy(self)
  !! Destroys kernel
    class(kernel_device), intent(inout) :: self !! Device kernel class

    if ( .not. self%is_created ) return
    if ( self%is_dummy .or. self%kernel_type == KERNEL_UNPACK_SIMPLE_COPY ) return
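    ! Nothing else to free: compiled kernel handles live in the shared nvRTC module cache.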
  end subroutine destroy

  subroutine get_kernel_args(kernel_type, dims, nargs, args, neighbor_data)
  !! Populates kernel arguments based on kernel type
    type(kernel_type_t),      intent(in)    :: kernel_type        !! Type of kernel
    integer(int32),           intent(in)    :: dims(:)            !! Local dimensions to process
    integer(int32),           intent(out)   :: nargs              !! Number of arguments set by this subroutine
    integer(int32),           intent(out)   :: args(MAX_KERNEL_ARGS)  !! Kernel arguments
    integer(int32), optional, intent(in)    :: neighbor_data(:)   !! Neighbor data for pipelined kernels

    nargs = 0
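    ! The first two arguments are always the local dimensions; the rest depend on the kernel type.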
    nargs = nargs + 1;  args(nargs) = dims(1)
    nargs = nargs + 1;  args(nargs) = dims(2)
    if ( kernel_type == KERNEL_UNPACK_PIPELINED ) then
      nargs = nargs + 1;  args(nargs) = neighbor_data(1)
      nargs = nargs + 1;  args(nargs) = neighbor_data(2)
      nargs = nargs + 1;  args(nargs) = neighbor_data(4)
      nargs = nargs + 1;  args(nargs) = neighbor_data(5)
    endif
    if ( size(dims) == 2 .or. kernel_type == KERNEL_UNPACK_PIPELINED ) return

    if ( any(kernel_type == [KERNEL_PERMUTE_FORWARD, KERNEL_PERMUTE_BACKWARD, KERNEL_PERMUTE_BACKWARD_START]) ) then
      nargs = nargs + 1; args(nargs) = dims(3)
      return
    endif

    nargs = nargs + 1; args(nargs) = neighbor_data(1)
    nargs = nargs + 1; args(nargs) = neighbor_data(2)
    nargs = nargs + 1; args(nargs) = neighbor_data(3)
    nargs = nargs + 1; args(nargs) = neighbor_data(4)
    nargs = nargs + 1; args(nargs) = neighbor_data(5)
  end subroutine get_kernel_args

  subroutine get_kernel_launch_params(kernel_type, dims, tile_size, block_rows, blocks, threads)
  !! Computes kernel launch parameters based on kernel type and dimensions
    type(kernel_type_t),      intent(in)    :: kernel_type        !! Type of kernel
    integer(int32),           intent(in)    :: dims(:)            !! Local dimensions to process
    integer(int32),           intent(in)    :: tile_size          !! Size of the tile in shared memory
    integer(int32),           intent(in)    :: block_rows         !! Number of rows in each block
    type(dim3),               intent(out)   :: blocks             !! Number of blocks to launch
    type(dim3),               intent(out)   :: threads            !! Number of threads per block
    integer(int32) :: tile_dim, other_dim

    threads%x = tile_size
    threads%y = block_rows
    threads%z = 1

    if ( any(kernel_type == [KERNEL_PERMUTE_FORWARD, KERNEL_PERMUTE_BACKWARD_END_PIPELINED, KERNEL_UNPACK_PIPELINED]) ) then
      tile_dim = 2
      other_dim = 3
    else
      ! KERNEL_PERMUTE_BACKWARD_START or KERNEL_PERMUTE_BACKWARD
      tile_dim = 3
      other_dim = 2
    endif
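    ! The grid tiles the plane spanned by dims(1) and dims(tile_dim); the remaining dimension maps to blocks%z.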

    blocks%x = (dims(1) + tile_size - 1) / tile_size
    blocks%y = (dims(tile_dim) + tile_size - 1) / tile_size
    if ( size(dims) == 2 ) then
      blocks%z = 1
    else
      blocks%z = dims(other_dim)
    endif
  end subroutine get_kernel_launch_params

  subroutine get_kernel(dims, kernel_type, effort, base_storage, props, tile_size, block_rows, kernel, force_effort, neighbor_data)
  !! Compiles the kernel, caches it, and returns the compiled kernel.
    integer(int32),           intent(in)    :: dims(:)            !! Local dimensions to process
    type(kernel_type_t),      intent(in)    :: kernel_type        !! Type of kernel to build
    type(dtfft_effort_t),     intent(in)    :: effort             !! How thoroughly `dtFFT` searches for the optimal transpose kernel
    integer(int64),           intent(in)    :: base_storage       !! Number of bytes needed to store a single element
    type(device_props),       intent(in)    :: props              !! GPU architecture properties
    integer(int32),           intent(out)   :: tile_size          !! Size of the tile in shared memory
    integer(int32),           intent(out)   :: block_rows         !! Number of rows in each block processed by each thread
    type(CUfunction),         intent(out)   :: kernel             !! Compiled kernel to return
    logical,        optional, intent(in)    :: force_effort       !! Should effort be forced or not
    integer(int32), optional, intent(in)    :: neighbor_data(:)   !! Neighbor data for pipelined kernels
    type(kernel_config)           :: candidates(N_CANDIDATES) !! Candidate kernel configurations
    type(kernel_config)           :: config                   !! Current candidate
    integer(int32)                :: num_candidates           !! Number of candidate configurations generated
    integer(int32)                :: i                        !! Loop index
    real(real32),     allocatable :: scores(:)                !! Scores for each candidate configuration
    integer(int32),   allocatable :: sorted(:)                !! Sorted indices of candidate configurations
    integer(int32)                :: tile_dim                 !! Tile dimension
    integer(int32)                :: other_dim                !! Dimension that is not part of shared memory
    integer(int32)                :: fixed_dims(3)            !! Dimensions fixed to the shared memory
    integer(int32)                :: ndims                    !! Number of dimensions
    integer(int32)                :: test_size                !! Number of test configurations to run
    integer(int32)                :: test_id                  !! Current test configuration ID
    integer(int32)                :: iter                     !! Loop index
    integer(int32)                :: best_kernel_id           !! Best kernel configuration ID
    type(c_ptr)                   :: in                       !! Input buffer
    type(c_ptr)                   :: out                      !! Output buffer
    type(dim3)                    :: blocks                   !! Blocks configuration
    type(dim3)                    :: threads                  !! Threads configuration
    type(cudaEvent)               :: timer_start              !! Timer start event
    type(cudaEvent)               :: timer_stop               !! Timer stop event
    real(real32)                  :: execution_time           !! Execution time
    real(real32)                  :: best_time                !! Best execution time
    type(dtfft_stream_t)          :: stream                   !! CUDA stream for kernel execution
    real(real32)                  :: bandwidth                !! Bandwidth for kernel execution
    integer(int32)                :: n_iters                  !! Number of iterations to perform when testing kernel
    integer(int32)                :: n_warmup_iters           !! Number of warmup iterations to perform before testing kernel
    logical                       :: force_effort_            !! Should effort be forced or not
    character(len=:), allocatable :: global_phase             !! Global phase name for profiling
    character(len=:), allocatable :: local_phase              !! Local phase name for profiling
    integer(int32)                :: nargs                    !! Number of kernel arguments
    integer(int32)                :: args(MAX_KERNEL_ARGS)    !! Kernel arguments


    if ( any(kernel_type == [KERNEL_PERMUTE_FORWARD, KERNEL_PERMUTE_BACKWARD_END_PIPELINED, KERNEL_UNPACK_PIPELINED]) ) then
      tile_dim = 2
      other_dim = 3
    else
      ! KERNEL_PERMUTE_BACKWARD_START or KERNEL_PERMUTE_BACKWARD
      tile_dim = 3
      other_dim = 2
    endif

    ndims = size(dims)
    fixed_dims(:) = 1
    fixed_dims(1:ndims) = dims(1:ndims)
    if ( is_unpack_kernel(kernel_type) ) fixed_dims(1:ndims) = neighbor_data(1:ndims)

    call generate_candidates(fixed_dims, tile_dim, other_dim, base_storage, props, candidates, num_candidates)
    allocate(scores(num_candidates), sorted(num_candidates))
    do i = 1, num_candidates
      scores(i) = evaluate_analytical_performance(fixed_dims, tile_dim, other_dim, kernel_type, candidates(i), props, base_storage, neighbor_data)
    enddo
    call sort_candidates_by_score(scores, num_candidates, sorted)

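    ! Build (or fetch from cache) the nvRTC module that contains the candidate kernels.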
    call create_nvrtc_module(ndims, kernel_type, base_storage, candidates(1:num_candidates), props)

    force_effort_ = .false.; if( present(force_effort) ) force_effort_ = force_effort

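    ! Unless kernel optimization is requested (or forced), skip benchmarking and take the top analytical candidate.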
    if ( (effort == DTFFT_ESTIMATE .and. force_effort_) .or.                                                                                &
          .not. ( (effort == DTFFT_PATIENT .and. get_conf_kernel_optimization_enabled()) .or. get_conf_forced_kernel_optimization()) ) then
      config = candidates(sorted(1))
      tile_size = config%tile_size
      block_rows = config%block_rows
      kernel = get_kernel_instance(ndims, kernel_type, base_storage, tile_size, block_rows)
      deallocate(scores, sorted)
      return
    endif

    CUDA_CALL( cudaMalloc(in, base_storage * product(dims)) )
    CUDA_CALL( cudaMalloc(out, base_storage * product(dims)) )
    CUDA_CALL( cudaEventCreate(timer_start) )
    CUDA_CALL( cudaEventCreate(timer_stop) )
    stream = get_conf_stream()

    global_phase = "Testing nvRTC kernel: '"//get_kernel_string(kernel_type)//"' performance..."
    PHASE_BEGIN(global_phase, COLOR_AUTOTUNE)
    WRITE_INFO(global_phase)

    n_warmup_iters = get_conf_measure_warmup_iters()
    n_iters = get_conf_measure_iters()

    best_time = MAX_REAL32
    test_size = get_conf_configs_to_test()
    if ( test_size > num_candidates ) test_size = num_candidates

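    ! Benchmark the top-ranked candidates on the device and keep the fastest configuration.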
    do test_id = 1, test_size
      config = candidates(sorted(test_id))
      tile_size = config%tile_size
      block_rows = config%block_rows

      call get_kernel_launch_params(kernel_type, fixed_dims, tile_size, block_rows, blocks, threads)
      call get_kernel_args(kernel_type, dims, nargs, args, neighbor_data)

      kernel = get_kernel_instance(ndims, kernel_type, base_storage, tile_size, block_rows)

      local_phase = "Testing block: "//to_str(tile_size)//"x"//to_str(block_rows)
      REGION_BEGIN(local_phase, COLOR_AUTOTUNE2)
      WRITE_INFO("    "//local_phase)

      REGION_BEGIN("Warmup", COLOR_TRANSPOSE)
      do iter = 1, n_warmup_iters
        CUDA_CALL( cuLaunchKernel(kernel, in, out, blocks, threads, stream, nargs, args) )
      enddo
      CUDA_CALL( cudaStreamSynchronize(stream) )
      REGION_END("Warmup")

      REGION_BEGIN("Measure", COLOR_EXECUTE)
      CUDA_CALL( cudaEventRecord(timer_start, stream) )
      do iter = 1, n_iters
        CUDA_CALL( cuLaunchKernel(kernel, in, out, blocks, threads, stream, nargs, args) )
      enddo

      CUDA_CALL( cudaEventRecord(timer_stop, stream) )
      CUDA_CALL( cudaEventSynchronize(timer_stop) )
      REGION_END("Measure")
      CUDA_CALL( cudaEventElapsedTime(execution_time, timer_start, timer_stop) )
      execution_time = execution_time / real(n_iters, real32)
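      ! Effective bandwidth in GB/s: factor 2 for one read plus one write, 1000 converts ms to s, 1024**3 bytes per GB.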
      bandwidth = 2.0 * 1000.0 * real(base_storage * product(fixed_dims), real32) / real(1024 * 1024 * 1024, real32) / execution_time
      WRITE_INFO("        Average execution time = "//to_str(real(execution_time, real64))//" [ms]")
      WRITE_INFO("        Bandwidth = "//to_str(bandwidth)//" [GB/s]")

      if ( execution_time < best_time ) then
        best_time = execution_time
        best_kernel_id = test_id
      endif
      REGION_END(local_phase)
    enddo
    config = candidates(sorted(best_kernel_id))
    PHASE_END(global_phase)
    tile_size = config%tile_size
    block_rows = config%block_rows
    kernel = get_kernel_instance(ndims, kernel_type, base_storage, tile_size, block_rows)
    WRITE_INFO("  Best configuration is: "//to_str(tile_size)//"x"//to_str(block_rows))

    CUDA_CALL( cudaEventDestroy(timer_start) )
    CUDA_CALL( cudaEventDestroy(timer_stop) )
    CUDA_CALL( cudaFree(in) )
    CUDA_CALL( cudaFree(out) )
    deallocate(scores, sorted)
    deallocate(global_phase, local_phase)
  end subroutine get_kernel
end module dtfft_kernel_device