dtfft_transpose_plan_cuda.F90 Source File


This file depends on

sourcefile~~dtfft_transpose_plan_cuda.f90~~EfferentGraph sourcefile~dtfft_transpose_plan_cuda.f90 dtfft_transpose_plan_cuda.F90 sourcefile~dtfft_abstract_backend.f90 dtfft_abstract_backend.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_abstract_transpose_plan.f90 dtfft_abstract_transpose_plan.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_config.f90 dtfft_config.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_interface_cuda.f90 dtfft_interface_cuda.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_nvrtc.f90 dtfft_interface_nvrtc.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_nvrtc.f90 sourcefile~dtfft_interface_nvtx.f90 dtfft_interface_nvtx.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_nvrtc_kernel.f90 dtfft_nvrtc_kernel.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_parameters.f90 dtfft_parameters.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_pencil.f90 dtfft_pencil.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_handle_cuda.f90 dtfft_transpose_handle_cuda.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_transpose_handle_cuda.f90 sourcefile~dtfft_utils.f90 dtfft_utils.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nccl.f90 dtfft_interface_nccl.F90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_interface_nccl.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_nccl.f90 sourcefile~dtfft_interface_nvshmem.f90 dtfft_interface_nvshmem.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_nvshmem.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nvtx.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_nvrtc.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_backend_cufftmp.f90 dtfft_backend_cufftmp.F90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_cufftmp.f90 sourcefile~dtfft_backend_mpi.f90 dtfft_backend_mpi.F90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_mpi.f90 sourcefile~dtfft_backend_nccl.f90 dtfft_backend_nccl.F90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_nccl.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_interface_nvshmem.f90 sourcefile~dtfft_interface_cufft.f90 dtfft_interface_cufft.F90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_interface_cufft.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_interface_nccl.f90 sourcefile~dtfft_interface_nccl.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_nccl.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nvshmem.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_nvshmem.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_cufft.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cufft.f90->sourcefile~dtfft_utils.f90

Files dependent on this one

sourcefile~~dtfft_transpose_plan_cuda.f90~~AfferentGraph sourcefile~dtfft_transpose_plan_cuda.f90 dtfft_transpose_plan_cuda.F90 sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan_cuda.f90 sourcefile~dtfft.f90 dtfft.F90 sourcefile~dtfft.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_api.f90 dtfft_api.F90 sourcefile~dtfft_api.f90->sourcefile~dtfft_plan.f90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
#include "dtfft_config.h"
#include "dtfft.f03"
module dtfft_transpose_plan_cuda
!! This module describes [[transpose_plan_cuda]] class
use iso_fortran_env
use iso_c_binding
use dtfft_interface_cuda_runtime
use dtfft_interface_cuda,                 only: load_cuda
use dtfft_interface_nvrtc,                only: load_nvrtc
use dtfft_config
use dtfft_abstract_transpose_plan,        only: abstract_transpose_plan, create_cart_comm, alloc_mem, free_mem
use dtfft_abstract_backend,               only: backend_helper
use dtfft_nvrtc_kernel,                   only: clean_unused_cache
use dtfft_parameters
use dtfft_pencil,                         only: pencil, get_local_sizes
use dtfft_transpose_handle_cuda,          only: transpose_handle_cuda
use dtfft_utils
#include "dtfft_mpi.h"
#include "dtfft_profile.h"
#include "dtfft_cuda.h"
#include "dtfft_private.h"
implicit none
private
public :: transpose_plan_cuda

  real(real32),    parameter :: MaxR4P  =  huge(1._real32)
    !! Maximum value of real32

  type, extends(abstract_transpose_plan) :: transpose_plan_cuda
  !! CUDA Transpose Plan
  private
    type(dtfft_stream_t)                      :: stream           !! CUDA stream
    type(c_ptr)                               :: aux              !! Auxiliary memory
    real(real32),                pointer      :: paux(:)          !! Pointer to auxiliary memory
    logical                                   :: is_aux_alloc     !! Is auxiliary memory allocated
    type(transpose_handle_cuda), allocatable  :: fplans(:)        !! Forward transposition plans
    type(transpose_handle_cuda), allocatable  :: bplans(:)        !! Backward transposition plans
  contains
    procedure :: create_private => create_cuda      !! Creates CUDA transpose plan
    procedure :: execute_private => execute_cuda    !! Executes single transposition
    procedure :: destroy => destroy_cuda            !! Destroys CUDA transpose plan
  end type transpose_plan_cuda

contains

  integer(int32) function create_cuda(self, dims, transposed_dims, base_comm, comm_dims, effort, base_dtype, base_storage, is_custom_cart_comm, cart_comm, comms, pencils)
  !! Creates CUDA transpose plan
    class(transpose_plan_cuda),     intent(inout) :: self                 !! GPU transpose plan
    integer(int32),                 intent(in)    :: dims(:)              !! Global sizes of the transform requested
    integer(int32),                 intent(in)    :: transposed_dims(:,:) !! Transposed dimensions
    TYPE_MPI_COMM,                  intent(in)    :: base_comm            !! Base communicator
    integer(int32),                 intent(in)    :: comm_dims(:)         !! Number of processors in each dimension
    type(dtfft_effort_t),           intent(in)    :: effort               !! How thoroughly `dtFFT` searches for the optimal plan
    TYPE_MPI_DATATYPE,              intent(in)    :: base_dtype           !! Base MPI_Datatype
    integer(int64),                 intent(in)    :: base_storage         !! Number of bytes needed to store single element
    logical,                        intent(in)    :: is_custom_cart_comm  !! is custom Cartesian communicator provided by user
    TYPE_MPI_COMM,                  intent(out)   :: cart_comm            !! Cartesian communicator
    TYPE_MPI_COMM,                  intent(out)   :: comms(:)             !! Array of 1d communicators
    type(pencil),                   intent(out)   :: pencils(:)           !! Data distributing meta
    integer(int8) :: n_transpose_plans
    integer(int8) :: d, ndims!, b
    integer(int32) :: comm_size, ierr
    ! integer(cuda_count_kind) :: free, total
    integer(int32), allocatable :: best_decomposition(:)
    ! real(real32) :: execution_time
    logical :: pencils_created
    real(real64) :: ts, te

    create_cuda = DTFFT_SUCCESS
    if ( .not.get_mpi_enabled() .and. .not.get_nccl_enabled() .and. .not.get_nvshmem_enabled() .and. effort == DTFFT_PATIENT) then
      create_cuda = DTFFT_ERROR_GPU_BACKENDS_DISABLED
      return
    endif
    CHECK_CALL( load_cuda(), create_cuda )
    CHECK_CALL( load_nvrtc(), create_cuda )

    ndims = size(dims, kind=int8)
    allocate( best_decomposition(ndims) )

    self%stream = get_user_stream()
    self%backend = get_user_gpu_backend()

    best_decomposition(:) = comm_dims(:)
    call MPI_Comm_size(base_comm, comm_size, ierr)
    if ( comm_size == 1 ) self%backend = BACKEND_NOT_SET

    pencils_created = .false.
    if ( ndims == 2 .or. is_custom_cart_comm .or. self%is_z_slab ) then
      pencils_created = .true.
      call create_cart_comm(base_comm, best_decomposition, cart_comm, comms)
      do d = 1, ndims
        call pencils(d)%create(ndims, d, transposed_dims(:,d), comms)
      enddo
    endif

    ts = MPI_Wtime()

    if ( effort == DTFFT_PATIENT .and. comm_size > 1) then
      if ( pencils_created ) then
        call run_autotune_backend(comms, cart_comm, pencils, base_storage, self%stream, self%is_z_slab, best_backend=self%backend)
      else
        call autotune_grid_decomposition(dims, transposed_dims, base_comm, base_storage, self%stream, best_decomposition, best_backend=self%backend)
      endif
    else if ( ndims == 3                                &
      .and. .not.is_custom_cart_comm                    &
      .and. .not.self%is_z_slab                         &
      .and. effort == DTFFT_MEASURE                     &
      .and. comm_size > 1 ) then

      call autotune_grid_decomposition(dims, transposed_dims, base_comm, base_storage,self%stream, best_decomposition, backend=self%backend)
    endif
    te = MPI_Wtime()

    if ( effort%val >= DTFFT_MEASURE%val .and. ndims > 2 .and. comm_size > 1 ) then
      WRITE_INFO(repeat("*", 50))
      if ( self%is_z_slab ) then
        WRITE_INFO("Skipped search of MPI processor grid due to Z-slab optimization enabled")
      else if ( is_custom_cart_comm ) then
        WRITE_INFO("Skipped search of MPI processor grid due to custom grid provided")
      else
        WRITE_INFO("DTFFT_MEASURE: Selected MPI processor grid 1x"//int_to_str(best_decomposition(2))//"x"//int_to_str(best_decomposition(3)))
      endif
    endif
    if ( effort == DTFFT_PATIENT .and. comm_size > 1 ) then
      WRITE_INFO("DTFFT_PATIENT: Selected backend is "//dtfft_get_backend_string(self%backend))
    endif
    if ( effort%val >= DTFFT_MEASURE%val .and. comm_size > 1 ) then
      WRITE_INFO("Time spent on autotune: "//double_to_str(te - ts)//" [s]")
    endif

    if ( .not.pencils_created ) then
      call create_cart_comm(base_comm, best_decomposition, cart_comm, comms)
      do d = 1, ndims
        call pencils(d)%create(ndims, d, transposed_dims(:,d), comms)
      enddo
    endif
    n_transpose_plans = ndims - 1_int8; if( self%is_z_slab ) n_transpose_plans = n_transpose_plans + 1_int8
    allocate( self%fplans(n_transpose_plans), self%bplans(n_transpose_plans) )

    call self%helper%create(cart_comm, comms, is_backend_nccl(self%backend), pencils)

    do d = 1_int8, ndims - 1_int8
      call self%fplans(d)%create(self%helper, pencils(d), pencils(d + 1), base_storage, self%backend)
      call self%bplans(d)%create(self%helper, pencils(d + 1), pencils(d), base_storage, self%backend)
    enddo
    if ( self%is_z_slab ) then
      call self%fplans(3)%create(self%helper, pencils(1), pencils(3), base_storage, self%backend)
      call self%bplans(3)%create(self%helper, pencils(3), pencils(1), base_storage, self%backend)
    endif
    self%is_aux_alloc = alloc_and_set_aux(self%helper, self%backend, cart_comm, self%aux, self%paux, self%fplans, self%bplans)

    call clean_unused_cache()
    deallocate( best_decomposition )
    create_cuda = DTFFT_SUCCESS
  end function create_cuda

  subroutine execute_cuda(self, in, out, transpose_type)
  !! Executes single transposition
    class(transpose_plan_cuda),   intent(inout) :: self           !! Transposition class
    real(real32),                 intent(inout) :: in(:)          !! Incoming buffer
    real(real32),                 intent(inout) :: out(:)         !! Resulting buffer
    type(dtfft_transpose_t),      intent(in)    :: transpose_type !! Type of transpose to execute

    if ( transpose_type%val > 0 ) then
      call self%fplans(transpose_type%val)%execute(in, out, self%stream, self%paux)
    else
      call self%bplans(abs(transpose_type%val))%execute(in, out, self%stream, self%paux)
    endif
  end subroutine execute_cuda

  subroutine destroy_cuda(self)
  !! Destroys transposition plans
    class(transpose_plan_cuda),    intent(inout) :: self         !! Transposition class
    integer(int8) :: i
    integer(int32) :: ierr

    if ( self%is_aux_alloc ) then
      call self%mem_free(self%aux, ierr)
      self%paux => null()
      self%is_aux_alloc = .false.
    endif

    do i = 1, size(self%bplans, kind=int8)
      call self%fplans(i)%destroy()
      call self% bplans(i)%destroy()
    enddo

    call self%helper%destroy()
    deallocate(self%fplans)
    deallocate(self% bplans)
  end subroutine destroy_cuda

  subroutine autotune_grid_decomposition(dims, transposed_dims, base_comm, base_storage, stream, best_decomposition, backend, min_execution_time, best_backend)
  !! Runs through all possible grid decompositions and selects the best one based on the lowest average execution time
    integer(int32),                   intent(in)    :: dims(:)
      !! Global sizes of the transform requested
    integer(int32),                   intent(in)    :: transposed_dims(:,:)
      !! Transposed dimensions
    TYPE_MPI_COMM,                    intent(in)    :: base_comm
      !! 3D comm
    integer(int64),                   intent(in)    :: base_storage
      !! Number of bytes needed to store single element
    type(dtfft_stream_t),             intent(in)    :: stream
      !! Stream to use
    integer(int32),                   intent(out)   :: best_decomposition(:)
      !! Best decomposition found
    type(dtfft_backend_t),  optional, intent(in)    :: backend
      !! GPU Backend to test. Should be passed only when effort is `DTFFT_ESTIMATE` or `DTFFT_MEASURE`
    real(real32),           optional, intent(out)   :: min_execution_time
      !! Elapsed time for best plan selected
    type(dtfft_backend_t),  optional, intent(out)   :: best_backend
      !! Best backend selected
    integer(int8)   :: ndims
    type(dtfft_backend_t) :: best_backend_
    integer(int32)  :: comm_size, square_root, i, current_timer, k, ierr
    real(real32) :: current_time, elapsed_time
    real(real32), allocatable :: timers(:)
    integer(int32),   allocatable :: decomps(:,:)
    type(dtfft_backend_t), allocatable :: backends(:)

    call MPI_Comm_size(base_comm, comm_size, ierr)
    ndims = size(dims, kind=int8)

    square_root = int(sqrt(real(comm_size, real64))) + 1
    allocate(timers(2 * square_root))
    allocate(decomps(2, 2 * square_root))
    allocate(backends(2 * square_root))

    current_timer = 0
    do i = 1, square_root - 1
      if ( mod( comm_size, i ) /= 0 ) cycle

      call autotune_grid(dims, transposed_dims, base_comm, [1, i, comm_size / i], base_storage, .false., stream, backend=backend, best_time=current_time,best_backend=best_backend_)
      if ( current_time > 0.0 ) then
        current_timer = current_timer + 1
        timers(current_timer) = current_time
        decomps(1, current_timer) = i
        decomps(2, current_timer) = comm_size / i
        backends(current_timer) = best_backend_
      endif
      if ( i /= comm_size / i) then
        call autotune_grid(dims, transposed_dims, base_comm, [1, comm_size / i, i], base_storage, .false., stream, backend=backend, best_time=current_time,best_backend=best_backend_)
        if ( current_time > 0.0 ) then
          current_timer = current_timer + 1
          timers(current_timer) = current_time
          decomps(1, current_timer) = comm_size / i
          decomps(2, current_timer) = i
          backends(current_timer) = best_backend_
        endif
      endif
    enddo

    elapsed_time = MaxR4P
    k = 1
    do i = 1, current_timer
      if ( timers(i) < elapsed_time ) then
        elapsed_time = timers(i)
        k = i
      endif
    enddo

    best_decomposition(1) = 1
    best_decomposition(2) = decomps(1, k)
    best_decomposition(3) = decomps(2, k)
    if ( present(best_backend) ) best_backend = backends(k)
    if ( present(min_execution_time) ) min_execution_time = elapsed_time
    deallocate( timers, decomps, backends )
  end subroutine autotune_grid_decomposition

  subroutine autotune_grid(dims, transposed_dims, base_comm, comm_dims, base_storage, is_z_slab, stream, backend, best_time, best_backend)
  !! Creates cartesian grid and runs various backends on it. Can return best backend and execution time
    integer(int32),                   intent(in)    :: dims(:)
      !! Global sizes of the transform requested
    integer(int32),                   intent(in)    :: transposed_dims(:,:)
      !! Transposed dimensions
    TYPE_MPI_COMM,                    intent(in)    :: base_comm
      !! Basic communicator to create 3d grid from
    integer(int32),                   intent(in)    :: comm_dims(:)
      !! Number of processors in each dimension
    integer(int64),                   intent(in)    :: base_storage
      !! Number of bytes needed to store single element
    logical,                          intent(in)    :: is_z_slab
      !! Is Z-slab optimization enabled
    type(dtfft_stream_t),             intent(in)    :: stream
      !! Stream to use
    type(dtfft_backend_t),  optional, intent(in)    :: backend
      !! GPU Backend to test. Should be passed only when effort is `DTFFT_ESTIMATE` or `DTFFT_MEASURE`
    type(dtfft_backend_t),  optional, intent(out)   :: best_backend
      !! Best backend selected for the grid
    real(real32),           optional, intent(out)   :: best_time
      !! Elapsed time for best plan selected
    type(pencil), allocatable :: pencils(:)
    character(len=:),             allocatable   :: phase_name           !! Caliper phase name
    integer(int8) :: d, ndims
    TYPE_MPI_COMM, allocatable :: comms(:)
    TYPE_MPI_COMM :: cart_comm
    integer(int32) :: mpi_ierr

    best_time = -1.0
    ndims = size(dims, kind=int8)

    if ( ndims == 3 ) then
      if ( comm_dims(2) > dims(2) .or. comm_dims(3) > dims(3) ) return
      allocate( phase_name, source = "Testing grid 1x"//int_to_str(comm_dims(2))//"x"//int_to_str(comm_dims(3)) )
    else
      allocate( phase_name, source = "Testing grid 1x"//int_to_str(comm_dims(2)) )
    endif

    WRITE_INFO("")
    WRITE_INFO(phase_name)
    PHASE_BEGIN(phase_name, COLOR_AUTOTUNE)

    allocate( comms(ndims), pencils(ndims) )
    call create_cart_comm(base_comm, comm_dims, cart_comm, comms)
    do d = 1, ndims
      call pencils(d)%create(ndims, d, transposed_dims(:,d), comms)
    enddo

    ! elapsed_time = autotune_backend_id(comms, cart_comm, pencils, base_storage, backend, stream)

    call run_autotune_backend(comms, cart_comm, pencils, base_storage, stream, is_z_slab, backend=backend, best_time=best_time, best_backend=best_backend)

    do d = 1, ndims
      call pencils(d)%destroy()
      call MPI_Comm_free(comms(d), mpi_ierr)
    enddo
    call MPI_Comm_free(cart_comm, mpi_ierr)
    deallocate( comms, pencils )
    PHASE_END(phase_name)
  end subroutine autotune_grid

  subroutine run_autotune_backend(comms, cart_comm, pencils, base_storage, stream, is_z_slab, backend, best_time, best_backend)
  !! Runs autotune for all backends
    TYPE_MPI_COMM,                    intent(in)    :: comms(:)
      !! 1D comms
    TYPE_MPI_COMM,                    intent(in)    :: cart_comm
      !! 3D Cartesian comm
    type(pencil),                     intent(in)    :: pencils(:)
      !! Source meta
    integer(int64),                   intent(in)    :: base_storage
      !! Number of bytes needed to store single element
    type(dtfft_stream_t),             intent(in)    :: stream
      !! Stream to use
    logical,                          intent(in)    :: is_z_slab
      !! Is Z-slab optimization enabled
    type(dtfft_backend_t),  optional, intent(in)    :: backend
      !! GPU Backend to test. Should be passed only when effort is `DTFFT_ESTIMATE` or `DTFFT_MEASURE`
    real(real32),           optional, intent(out)   :: best_time
      !! Elapsed time for best backend
    type(dtfft_backend_t),  optional, intent(out)   :: best_backend
      !! Best backend selected
    type(dtfft_backend_t),  allocatable :: backends_to_run(:)
    type(dtfft_backend_t) :: current_backend_id, best_backend_
    logical :: is_udb !! Used defined backend
    real(real32) :: execution_time, min_execution_time, max_execution_time, avg_execution_time, best_time_, total_time
    integer(int32) :: iter, comm_size, mpi_ierr, b, ierr
    type(transpose_handle_cuda),  allocatable   :: plans(:)
    integer(int8) :: i, n_transpose_plans
    type(c_ptr) :: in, out, aux
    real(real32), pointer :: pin(:), pout(:), paux(:)
    logical :: is_aux_alloc
    ! , need_aux
    integer(int64)         :: alloc_size
    type(cudaEvent) :: timer_start, timer_stop
    character(len=:), allocatable :: testing_phase
    type(backend_helper)                      :: helper
    integer(int32) :: n_warmup_iters, n_iters
    integer(int64) :: min_buffer_size
    ! integer(int64)                                :: scaler         !! Scaling data amount to float size
    ! integer(cuda_count_kind) :: free, total

    if ( present(backend) ) then
      allocate( backends_to_run(1) )
      backends_to_run(1) = backend
      is_udb = .true.
    else
      allocate(backends_to_run(size(VALID_GPU_BACKENDS(2:))))
      do b = 1, size(VALID_GPU_BACKENDS(2:))
        backends_to_run(b) = VALID_GPU_BACKENDS(b + 1)
      enddo
      is_udb = .false.
    endif
    best_backend_ = backends_to_run(1)

    if ( is_z_slab ) then
      n_transpose_plans = 1
    else
      n_transpose_plans = size(pencils, kind=int8) - 1_int8
    endif

    allocate( plans(2 * n_transpose_plans) )

    call get_local_sizes(pencils, alloc_size=alloc_size)
    alloc_size = alloc_size * base_storage
    min_buffer_size = alloc_size / FLOAT_STORAGE_SIZE

    CUDA_CALL( "cudaEventCreate", cudaEventCreate(timer_start) )
    CUDA_CALL( "cudaEventCreate", cudaEventCreate(timer_stop) )

    call MPI_Comm_size(cart_comm, comm_size, mpi_ierr)

    call helper%create(cart_comm, comms, any(is_backend_nccl(backends_to_run)), pencils)

    n_warmup_iters = get_iters_from_env(.true.)
    n_iters = get_iters_from_env(.false.)

    best_time_ = MaxR4P

    do b = 1, size(backends_to_run)
      current_backend_id = backends_to_run(b)

      if ( (is_backend_pipelined(current_backend_id) .and. .not.get_pipelined_enabled()         &
            .or.is_backend_mpi(current_backend_id) .and. .not.get_mpi_enabled()                 &
            .or.is_backend_nvshmem(current_backend_id) .and. .not.get_nvshmem_enabled())        &
            .and. .not.is_udb) cycle

      if ( is_z_slab ) then
        call plans(1)%create(helper, pencils(1), pencils(3), base_storage, current_backend_id)
        call plans(2)%create(helper, pencils(3), pencils(1), base_storage, current_backend_id)
      else
        do i = 1, n_transpose_plans
          call plans(i)%create(helper, pencils(i), pencils(i + 1), base_storage, current_backend_id)
          call plans(i + n_transpose_plans)%create(helper, pencils(i + 1), pencils(i), base_storage, current_backend_id)
        enddo
      endif

      call alloc_mem(helper, current_backend_id, cart_comm, alloc_size, in, ierr); DTFFT_CHECK(ierr)
      call alloc_mem(helper, current_backend_id, cart_comm, alloc_size, out, ierr); DTFFT_CHECK(ierr)

      call c_f_pointer(in, pin, [min_buffer_size])
      call c_f_pointer(out, pout, [min_buffer_size])

      is_aux_alloc = alloc_and_set_aux(helper, current_backend_id, cart_comm, aux, paux, plans)

      testing_phase = "Testing backend "//dtfft_get_backend_string(current_backend_id)
      PHASE_BEGIN(testing_phase, COLOR_AUTOTUNE)
      WRITE_INFO(testing_phase)

      PHASE_BEGIN("Warmup, "//int_to_str(n_warmup_iters)//" iterations", COLOR_TRANSPOSE)
      do iter = 1, n_warmup_iters
        do i = 1, 2_int8 * n_transpose_plans
          call plans(i)%execute(pin, pout, stream, paux)
        enddo
      enddo
      CUDA_CALL( "cudaStreamSynchronize", cudaStreamSynchronize(stream) )
      PHASE_END("Warmup, "//int_to_str(n_warmup_iters)//" iterations")

      call MPI_Barrier(cart_comm, mpi_ierr)

      PHASE_BEGIN("Testing, "//int_to_str(n_iters)//" iterations", COLOR_EXECUTE)
      total_time = 0.0

      CUDA_CALL( "cudaEventRecord", cudaEventRecord(timer_start, stream) )
      do iter = 1, n_iters
        do i = 1, 2_int8 * n_transpose_plans
          call plans(i)%execute(pin, pout, stream, paux)
        enddo
      enddo
      CUDA_CALL( "cudaEventRecord", cudaEventRecord(timer_stop, stream) )
      CUDA_CALL( "cudaEventSynchronize", cudaEventSynchronize(timer_stop) )
      CUDA_CALL( "cudaEventElapsedTime", cudaEventElapsedTime(execution_time, timer_start, timer_stop) )
      execution_time = execution_time / real(n_iters, real32)
      total_time = total_time + execution_time

      PHASE_END("Testing, "//int_to_str(n_iters)//" iterations")

      call MPI_Allreduce(total_time, min_execution_time, 1, MPI_REAL4, MPI_MIN, cart_comm, mpi_ierr)
      call MPI_Allreduce(total_time, max_execution_time, 1, MPI_REAL4, MPI_MAX, cart_comm, mpi_ierr)
      call MPI_Allreduce(total_time, avg_execution_time, 1, MPI_REAL4, MPI_SUM, cart_comm, mpi_ierr)

      avg_execution_time = avg_execution_time / real(comm_size, real32)

      WRITE_INFO("  max: "//double_to_str(real(max_execution_time, real64))//" [ms]")
      WRITE_INFO("  min: "//double_to_str(real(min_execution_time, real64))//" [ms]")
      WRITE_INFO("  avg: "//double_to_str(real(avg_execution_time, real64))//" [ms]")

      if ( avg_execution_time < best_time_ ) then
        best_time_ = avg_execution_time
        best_backend_ = current_backend_id
      endif

      call free_mem(helper, current_backend_id, in, ierr)
      call free_mem(helper, current_backend_id, out, ierr)
      if ( is_aux_alloc ) call free_mem(helper, current_backend_id, aux, ierr)


      do i = 1, 2_int8 * n_transpose_plans
        call plans(i)%destroy()
      enddo

      PHASE_END("Testing backend "//dtfft_get_backend_string(current_backend_id))
    enddo

    deallocate( plans )
    CUDA_CALL( "cudaEventDestroy", cudaEventDestroy(timer_start) )
    CUDA_CALL( "cudaEventDestroy", cudaEventDestroy(timer_stop) )

    call helper%destroy()

    if ( present(best_time)) best_time = best_time_
    if ( present(best_backend) ) best_backend = best_backend_
  end subroutine run_autotune_backend

  function alloc_and_set_aux(helper, backend, cart_comm, aux, paux, plans, bplans) result(is_aux_alloc)
  !! Allocates auxiliary memory according to the backend and sets it to the plans
    type(backend_helper),         intent(inout)             :: helper       !! Backend helper
    type(dtfft_backend_t),        intent(in)                :: backend      !! GPU backend
    TYPE_MPI_COMM,                intent(in)                :: cart_comm    !! Cartesian communicator
    type(c_ptr),                  intent(inout)             :: aux          !! Allocatable auxiliary memory
    real(real32),     pointer,    intent(inout)             :: paux(:)      !! Pointer to auxiliary memory
    type(transpose_handle_cuda),  intent(inout)             :: plans(:)     !! Plans
    type(transpose_handle_cuda),  intent(inout),  optional  :: bplans(:)    !! Backward plans
    logical                                                 :: is_aux_alloc !! Is auxiliary memory allocated
    integer(int64), allocatable :: worksizes(:)
    integer(int64) :: max_work_size_local, max_work_size_global
    integer(int32)  :: mpi_ierr, n_transpose_plans, i, n_fplans, n_bplans
    integer(int32) :: alloc_ierr

    n_fplans = size(plans)
    n_bplans = 0;  if ( present(bplans) ) n_bplans = size(bplans)
    n_transpose_plans = n_fplans + n_bplans

    allocate( worksizes( n_transpose_plans ) )

    do i = 1, n_fplans
      worksizes(i) = plans(i)%get_aux_size()
    enddo
    do i = 1, n_bplans
      worksizes(i + n_fplans) = bplans(i)%get_aux_size()
    enddo

    max_work_size_local = maxval(worksizes)
    call MPI_Allreduce(max_work_size_local, max_work_size_global, 1, MPI_INTEGER8, MPI_MAX, cart_comm, mpi_ierr)

    is_aux_alloc = .false.
    if ( max_work_size_global > 0 ) then
      call alloc_mem(helper, backend, cart_comm, max_work_size_global, aux, alloc_ierr)
      DTFFT_CHECK(alloc_ierr)
      call c_f_pointer(aux, paux, [max_work_size_global / 4_int64])
      is_aux_alloc = .true.
    endif

    deallocate( worksizes )
  end function alloc_and_set_aux
end module dtfft_transpose_plan_cuda