dtfft_transpose_plan.F90 Source File


This file depends on

Direct dependencies of dtfft_transpose_plan.F90:
  dtfft_abstract_backend.F90, dtfft_abstract_transpose_handle.F90,
  dtfft_config.F90, dtfft_errors.F90, dtfft_interface_cuda.F90,
  dtfft_interface_cuda_runtime.F90, dtfft_interface_nccl.F90,
  dtfft_interface_nvrtc.F90, dtfft_interface_nvshmem.F90,
  dtfft_interface_nvtx.F90, dtfft_kernel_device.F90, dtfft_parameters.F90,
  dtfft_pencil.F90, dtfft_transpose_handle_datatype.F90,
  dtfft_transpose_handle_generic.F90, dtfft_utils.F90

Transitive dependencies (reached through the files above):
  dtfft_abstract_kernel.F90, dtfft_backend_cufftmp.F90, dtfft_backend_mpi.F90,
  dtfft_backend_nccl.F90, dtfft_interface_cufft.F90, dtfft_kernel_host.F90,
  dtfft_nvrtc_block_optimizer.F90, dtfft_nvrtc_module.F90,
  dtfft_nvrtc_module_cache.F90

Files dependent on this one

dtfft_plan.F90 depends on dtfft_transpose_plan.F90 directly; dtfft.F90 and dtfft_api.F90 depend on it indirectly through dtfft_plan.F90.

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021 - 2025, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
#include "dtfft_config.h"
#include "dtfft.f03"
module dtfft_transpose_plan
!! This module describes [[transpose_plan]] class
use iso_fortran_env
use iso_c_binding
use dtfft_abstract_backend,               only: backend_helper
use dtfft_abstract_transpose_handle,      only: abstract_transpose_handle, create_args, execute_args
use dtfft_config
use dtfft_errors
#ifdef DTFFT_WITH_CUDA
use dtfft_interface_cuda_runtime
use dtfft_kernel_device,                  only: DEF_TILE_SIZE
use dtfft_interface_cuda,                 only: load_cuda
use dtfft_interface_nvrtc,                only: load_nvrtc
# ifdef NCCL_HAVE_COMMREGISTER
use dtfft_abstract_backend,               only: NCCL_REGISTER_PREALLOC_SIZE
# endif
# ifdef DTFFT_WITH_NVSHMEM
use dtfft_interface_nvshmem
# endif
# ifdef DTFFT_WITH_NCCL
use dtfft_interface_nccl
# endif
#endif
use dtfft_parameters
use dtfft_pencil,                         only: pencil, pencil_init, get_local_sizes
use dtfft_transpose_handle_generic,       only: transpose_handle_generic
use dtfft_transpose_handle_datatype,      only: transpose_handle_datatype
use dtfft_utils
#include "_dtfft_mpi.h"
#include "_dtfft_profile.h"
#include "_dtfft_cuda.h"
#include "_dtfft_private.h"
implicit none
private
public :: transpose_plan

  type :: plan_t
  !! This type is a container for allocatable transpose handles.
  !! The wrapper is required to build arrays of polymorphic
  !! (``class``, ``allocatable``) handles, which Fortran does not
  !! allow directly.
    class(abstract_transpose_handle), allocatable :: p  !! Transpose handle
  end type plan_t

  integer(int8), save :: FORWARD_PLAN_IDS(3)
    !! Default data types for forward transpositions.
    !! Filled once per process inside [[create]] from the
    !! ``DTYPE_X_Y`` / ``DTYPE_Y_Z`` / ``DTYPE_X_Z`` environment lookups.
  integer(int8), save :: BACKWARD_PLAN_IDS(3)
    !! Default data types for backward transpositions.
    !! Filled once per process inside [[create]] from the
    !! ``DTYPE_Y_X`` / ``DTYPE_Z_Y`` / ``DTYPE_Z_X`` environment lookups.
  logical,       save :: ARE_DATATYPES_SET = .false.
    !! Are default data types set (guards the one-time environment read)

  type :: transpose_plan
  !! Transpose Plan class
  !! This class is a container for transposition plans.
  !! It owns one forward and one backward handle per transposition
  !! direction (stored in ``plans``, indexed by signed transpose id),
  !! plus the backend helper and optional auxiliary memory shared by them.
  private
    type(dtfft_backend_t)     :: backend
      !! Backend
    type(backend_helper)      :: helper
      !! Backend helper
    logical                   :: is_z_slab
      !! Z-slab optimization flag (for 3D transforms)
    integer(int64)            :: min_buffer_size
      !! Minimal buffer size for transposition, in real32 elements
    type(dtfft_platform_t)    :: platform
      !! Platform used for transposition
    type(dtfft_stream_t)      :: stream
      !! CUDA stream
    type(c_ptr)               :: aux
      !! Auxiliary memory
    real(real32), pointer     :: paux(:)
      !! Pointer to auxiliary memory
    logical                   :: is_aux_alloc = .false.
      !! Is auxiliary memory allocated
    type(plan_t), allocatable :: plans(:)
    !! Plans for each transposition; allocated with signed bounds
    !! (-n:n) where positive indices are forward and negative are
    !! backward transpositions (index 0 is unused)
  contains
  private
    procedure,  non_overridable,  pass(self), public :: create            !! Creates transpose plan
    procedure,  non_overridable,  pass(self), public :: execute           !! Executes transposition
    procedure,  non_overridable,  pass(self), public :: execute_end       !! Finishes asynchronous transposition
    procedure,  non_overridable,  pass(self), public :: get_async_active  !! Returns .true. if any of the plans is running asynchronously
    procedure,  non_overridable,  pass(self), public :: destroy           !! Destroys transpose plan
    procedure,  non_overridable,  pass(self), public :: get_aux_size      !! Returns auxiliary buffer size
    procedure,  non_overridable,  pass(self), public :: get_backend       !! Returns backend id
    procedure,  non_overridable,  pass(self), public :: get_z_slab        !! Returns .true. if Z-slab optimization is enabled
    procedure,  non_overridable,  pass(self), public :: mem_alloc         !! Allocates memory based on selected backend
    procedure,  non_overridable,  pass(self), public :: mem_free          !! Frees memory allocated with mem_alloc
  end type transpose_plan

contains

  function create(self, platform, dims, base_comm, effort, base_dtype, base_storage, cart_comm, comms, pencils, ipencil) result(error_code)
  !! Creates transposition plan
  !!
  !! Determines the MPI processor grid (from a user-provided pencil, from a
  !! cartesian ``base_comm``, or via ``MPI_Dims_create`` / autotuning),
  !! optionally enables the Z-slab optimization for 3D transforms, selects a
  !! transpose backend, creates pencils and 1d communicators, builds the
  !! forward and backward transpose handles and allocates auxiliary memory.
    class(transpose_plan),          intent(inout) :: self           !! Transposition class
    type(dtfft_platform_t),         intent(in)    :: platform       !! Platform to create plan for
    integer(int32),                 intent(in)    :: dims(:)        !! Global sizes of the transform requested
    TYPE_MPI_COMM,                  intent(in)    :: base_comm      !! Base communicator
    type(dtfft_effort_t),           intent(in)    :: effort         !! ``dtFFT`` planner type of effort
    TYPE_MPI_DATATYPE,              intent(in)    :: base_dtype     !! Base MPI_Datatype
    integer(int64),                 intent(in)    :: base_storage   !! Number of bytes needed to store single element
    TYPE_MPI_COMM,                  intent(out)   :: cart_comm      !! Cartesian communicator
    TYPE_MPI_COMM,                  intent(out)   :: comms(:)       !! Array of 1d communicators
    type(pencil),                   intent(out)   :: pencils(:)     !! Data distributing meta
    type(pencil_init),    optional, intent(in)    :: ipencil        !! Pencil passed by user
    integer(int32)                                :: error_code     !! Error code
    integer(int32),   allocatable   :: transposed_dims(:,:) !! Global counts in transposed coordinates
    logical           :: cond1    !! First condition for Z-slab optimization
    logical           :: cond2    !! Second condition for Z-slab optimization
    integer(int32),   allocatable   :: comm_dims(:)   !! Dims in cartesian communicator
    integer(int8)     :: ndims      !! Number of dimensions
    integer(int32)    :: comm_size  !! Number of MPI processes
    integer(int32)    :: top_type   !! Topology type
    integer(int32)    :: ierr       !! Error code
    logical           :: is_custom_cart_comm  !! Custom cartesian communicator provided by user
    integer(int8)     :: d          !! Loop index over dimensions / plans
    TYPE_MPI_COMM     :: base_comm_ !! Working copy of base communicator (may be replaced for Z-slab)
    integer(int8) :: n_transpose_plans   !! Number of forward transpose plans to create
    integer(int32), allocatable :: best_decomposition(:) !! Grid decomposition selected by autotuning
    logical :: pencils_created       !! .true. once pencils/communicators have been created
    real(real64) :: ts, te           !! Timestamps for reporting autotune / plan-creation time
    integer(int8) :: best_forward_ids(3), best_backward_ids(3)  !! Selected datatype ids per direction
    logical :: invalid_grid_selected !! .true. when no valid grid decomposition could be built


    ! Query world size and whether base_comm already carries an MPI topology.
    call MPI_Comm_size(base_comm, comm_size, ierr)
    call MPI_Topo_test(base_comm, top_type, ierr)
    base_comm_ = base_comm

    ndims = size(dims, kind=int8)
    allocate( comm_dims(ndims) )
    comm_dims(:) = 0

    is_custom_cart_comm = .false.
    self%is_z_slab = .false.

    invalid_grid_selected = .false.
    ! --- Step 1: determine the processor grid (comm_dims) -------------------
    if ( present(ipencil) ) then
      ! Grid is fully defined by the sizes of the user-provided 1d communicators.
      is_custom_cart_comm = .true.
      do d = 1, ndims
        call MPI_Comm_size(ipencil%comms(d), comm_dims(d), ierr)
      enddo
      ! Fastest dimension must not be distributed.
      if ( comm_dims(1) /= 1 ) then
        error_code = DTFFT_ERROR_INVALID_COMM_FAST_DIM
        return
      endif
      if ( ndims == 3 .and. comm_dims(2) == 1 .and. get_conf_z_slab_enabled() ) then
        self%is_z_slab = .true.
        base_comm_ = ipencil%comms(3)
      endif
    else ! ipencil not present
      if ( top_type == MPI_CART ) then
        ! User supplied a cartesian communicator: extract its dims and map them
        ! onto the slowest dimensions of the transform.
        is_custom_cart_comm = .true.
        block
          integer(int32)                 :: grid_ndims           ! Number of dims in user defined cartesian communicator
          integer(int32),  allocatable   :: temp_dims(:)         ! Temporary dims needed by MPI_Cart_get
          integer(int32),  allocatable   :: temp_coords(:)       ! Temporary coordinates needed by MPI_Cart_get
          logical,         allocatable   :: temp_periods(:)      ! Temporary periods needed by MPI_Cart_get

          call MPI_Cartdim_get(base_comm_, grid_ndims, ierr)
          if ( grid_ndims > ndims ) then
            error_code = DTFFT_ERROR_INVALID_COMM_DIMS
            return
          endif
          comm_dims(:) = 1
          allocate(temp_dims(grid_ndims), temp_periods(grid_ndims), temp_coords(grid_ndims))
          call MPI_Cart_get(base_comm_, grid_ndims, temp_dims, temp_periods, temp_coords, ierr)
          if ( grid_ndims == ndims ) then
            ! Full-rank grid: fastest dimension must not be distributed.
            if ( temp_dims(1) /= 1 ) then
              error_code = DTFFT_ERROR_INVALID_COMM_FAST_DIM
              return
            endif
            comm_dims(:) = temp_dims
          elseif ( grid_ndims == ndims - 1 ) then
            comm_dims(2:) = temp_dims
          elseif ( grid_ndims == ndims - 2 ) then
            comm_dims(3) = temp_dims(1)
          endif
          deallocate(temp_dims, temp_periods, temp_coords)

          ! Warn (but do not fail) when a direction is over-decomposed.
          do d = 2, ndims
            if ( comm_dims(d) > dims(d) ) then
              WRITE_WARN("Number of MPI processes in direction "//to_str(d)//" greater then number of physical points: "//to_str(comm_dims(d))//" > "//to_str(dims(d)))
            endif
          enddo
          if ( ndims == 3 .and. comm_dims(2) == 1 .and. get_conf_z_slab_enabled() ) then
            self%is_z_slab = .true.
          endif
        endblock
      else !  top_type /= MPI_CART
        ! No topology information: build a default decomposition.
        comm_dims(:) = 0
        comm_dims(1) = 1
        cond1 = comm_size <= dims(ndims)
        cond2 = comm_size <= dims(1) .and. comm_size <= dims(2)
#ifdef DTFFT_WITH_CUDA
        if ( platform == DTFFT_PLATFORM_CUDA ) then
          ! On GPU, require at least one full tile per process in the split dimension.
          cond1 = DEF_TILE_SIZE <= dims(ndims) / comm_size
          cond2 = DEF_TILE_SIZE <= dims(1) / comm_size .and. DEF_TILE_SIZE <= dims(2) / comm_size
        endif
#endif
        if ( ndims == 3 .and. cond1 ) then
          ! Prefer a 1 x 1 x P slab split, which also enables Z-slab optimization.
          comm_dims(2) = 1
          comm_dims(3) = comm_size
          self%is_z_slab = get_conf_z_slab_enabled()
        else if (ndims == 3 .and. cond2 ) then
          comm_dims(2) = comm_size
          comm_dims(3) = 1
        endif
        ! Fill in any remaining zero entries (no-op for entries fixed above).
        call MPI_Dims_create(comm_size, int(ndims, int32), comm_dims, ierr)
        if(dims(ndims - 1) < comm_dims(ndims - 1) .or. dims(ndims) < comm_dims(ndims) ) then
          WRITE_WARN("Unable to create correct grid decomposition.")
          invalid_grid_selected = .true.
          ! WRITE_WARN("Fallback to Z slab is used")
          ! comm_dims(ndims - 1) = 1
          ! comm_dims(ndims) = comm_size
        endif
      endif
    endif

    ! --- Step 2: build global counts for each transposed layout -------------
    allocate(transposed_dims(ndims, ndims))
    if ( ndims == 2 ) then
      ! Nx x Ny
      transposed_dims(:, 1) = dims(:)
      ! Ny x Nx
      transposed_dims(1, 2) = dims(2)
      transposed_dims(2, 2) = dims(1)
    else
      ! Nx x Ny x Nz
      transposed_dims(:, 1) = dims(:)
      ! Ny x Nx x Nz
      transposed_dims(1, 2) = dims(2)
      transposed_dims(2, 2) = dims(1)
      transposed_dims(3, 2) = dims(3)
      ! Nz x Nx x Ny
      transposed_dims(1, 3) = dims(3)
      transposed_dims(2, 3) = dims(1)
      transposed_dims(3, 3) = dims(2)
    endif


    error_code = DTFFT_SUCCESS
    ! One-time read of default datatype ids from the environment (module-level cache).
    if ( .not. ARE_DATATYPES_SET) then
      FORWARD_PLAN_IDS(1) = get_datatype_from_env("DTYPE_X_Y"); BACKWARD_PLAN_IDS(1) = get_datatype_from_env("DTYPE_Y_X")
      FORWARD_PLAN_IDS(2) = get_datatype_from_env("DTYPE_Y_Z"); BACKWARD_PLAN_IDS(2) = get_datatype_from_env("DTYPE_Z_Y")
      FORWARD_PLAN_IDS(3) = get_datatype_from_env("DTYPE_X_Z"); BACKWARD_PLAN_IDS(3) = get_datatype_from_env("DTYPE_Z_X")
      ARE_DATATYPES_SET = .true.
    endif

    best_forward_ids(:) = FORWARD_PLAN_IDS(:)
    best_backward_ids(:) = BACKWARD_PLAN_IDS(:)

    ! --- Step 3: backend selection / validation ------------------------------
    self%platform = platform
    self%backend = get_conf_backend()

    if ( platform == DTFFT_PLATFORM_HOST ) then
      ! PATIENT autotuning requires at least one host backend family enabled.
      if ( .not. get_conf_datatype_enabled() .and. .not. get_conf_mpi_enabled() .and. effort == DTFFT_PATIENT) then
        error_code = DTFFT_ERROR_BACKENDS_DISABLED
        return
      endif
#ifdef DTFFT_WITH_CUDA
    else
      ! PATIENT autotuning requires at least one CUDA backend family enabled.
      if ( .not.get_conf_mpi_enabled() .and. .not.get_conf_nccl_enabled() .and. .not.get_conf_nvshmem_enabled() .and. effort == DTFFT_PATIENT) then
        error_code = DTFFT_ERROR_BACKENDS_DISABLED
        return
      endif

      CHECK_CALL( load_cuda(), error_code )
      CHECK_CALL( load_nvrtc(), error_code )
      self%stream = get_conf_stream()

! # ifdef DTFFT_WITH_NVSHMEM
!       if ( is_backend_nvshmem(self%backend) .or. (get_conf_nvshmem_enabled() .and. effort == DTFFT_PATIENT) ) then
!         call nvshmem_init(base_comm_)
!       endif
! # endif
#endif
    endif



    ! if ( platform == DTFFT_PLATFORM_HOST .and. (is_backend_nccl(self%backend) .or. is_backend_nvshmem(self%backend)) ) then
    ! ! Do not raise error here, just fallback to MPI_P2P
    !   self%backend = DTFFT_BACKEND_MPI_DATATYPE
    ! endif

    allocate( best_decomposition(ndims) )
    best_decomposition(:) = comm_dims(:)
    ! Re-query size: base_comm_ may have been replaced by a 1d communicator above.
    call MPI_Comm_size(base_comm_, comm_size, ierr)
    if ( comm_size == 1 .and. self%backend /= DTFFT_BACKEND_MPI_DATATYPE ) self%backend = BACKEND_NOT_SET

    ! --- Step 4: create pencils now when the grid is already fixed ----------
    pencils_created = .false.
    if ( ndims == 2 .or. is_custom_cart_comm .or. self%is_z_slab ) then
      pencils_created = .true.
      call create_pencils_and_comm(transposed_dims, base_comm_, comm_dims, cart_comm, comms, pencils, ipencil=ipencil)
    endif

    ts = MPI_Wtime()

    ! --- Step 5: autotuning ---------------------------------------------------
    if ( effort == DTFFT_PATIENT .and. comm_size > 1 .and. .not.invalid_grid_selected) then
      if ( pencils_created ) then
        ! Grid is fixed: only tune the backend and datatype ids.
        call run_autotune_backend(                                                                              &
          platform, comms, cart_comm, effort, base_dtype, pencils, base_storage, self%stream, self%is_z_slab,   &
          best_forward_ids, best_backward_ids, best_backend=self%backend)
      else
        ! Tune both the grid decomposition and the backend.
        call autotune_grid_decomposition(                                                                       &
          platform, dims, transposed_dims, base_comm_, effort, base_dtype, base_storage,                        &
          self%stream, best_forward_ids, best_backward_ids, best_decomposition, best_backend=self%backend)
      endif
    else if ( ndims == 3                                &
      .and. .not.is_custom_cart_comm                    &
      .and. .not.self%is_z_slab                         &
      .and. effort == DTFFT_MEASURE                     &
      .and. comm_size > 1 ) then

      ! MEASURE effort: tune only the grid decomposition, backend stays fixed.
      call autotune_grid_decomposition(                                                                         &
        platform, dims, transposed_dims, base_comm_, effort, base_dtype, base_storage,                          &
        self%stream, best_forward_ids, best_backward_ids, best_decomposition, backend=self%backend)
    endif
    te = MPI_Wtime()

    ! --- Step 6: report autotuning results -----------------------------------
    if ( effort%val >= DTFFT_MEASURE%val .and. ndims > 2 .and. comm_size > 1 ) then
      WRITE_INFO(repeat("*", 50))
      if ( self%is_z_slab ) then
        WRITE_INFO("Skipped search of MPI processor grid due to Z-slab optimization enabled")
      else if ( is_custom_cart_comm ) then
        WRITE_INFO("Skipped search of MPI processor grid due to custom grid provided")
      else if ( invalid_grid_selected ) then
        WRITE_INFO("Skipped search of MPI processor grid due to lack of work per process")
      else
        WRITE_INFO("DTFFT_MEASURE: Selected MPI processor grid 1x"//to_str(best_decomposition(2))//"x"//to_str(best_decomposition(3)))
      endif
    endif
    if ( effort == DTFFT_PATIENT .and. comm_size > 1 ) then
      WRITE_INFO("DTFFT_PATIENT: Selected backend is "//dtfft_get_backend_string(self%backend))
    endif

    ! ndims-1 forward/backward pairs, plus one extra pair for the X<->Z Z-slab path.
    n_transpose_plans = ndims - 1_int8; if( self%is_z_slab ) n_transpose_plans = n_transpose_plans + 1_int8

    if ( effort == DTFFT_PATIENT .and. self%backend == DTFFT_BACKEND_MPI_DATATYPE ) then
      WRITE_INFO("DTFFT_PATIENT: Selected transpose ids:")
      do d = 1, n_transpose_plans
        WRITE_INFO("    "//TRANSPOSE_NAMES( d)//": "//to_str( best_forward_ids(d) ))
        WRITE_INFO("    "//TRANSPOSE_NAMES(-d)//": "//to_str( best_backward_ids(d) ))
      enddo
      WRITE_INFO(repeat("*", 50))
    endif
    if ( effort%val >= DTFFT_MEASURE%val .and. comm_size > 1 ) then
      WRITE_INFO("Time spent on autotune: "//to_str(te - ts)//" [s]")
    endif

    ! --- Step 7: create pencils if autotuning chose the decomposition --------
    if ( .not.pencils_created ) then
      call create_pencils_and_comm(transposed_dims, base_comm_, best_decomposition, cart_comm, comms, pencils)
    endif

    ! --- Step 8: build the final transpose handles ----------------------------
    ts = MPI_Wtime()
    ! Signed bounds: plans(d) is the forward, plans(-d) the backward transposition.
    allocate( self%plans(-1 * n_transpose_plans:n_transpose_plans) )
    call allocate_plans(self%plans, self%backend)
    call self%helper%create(platform, cart_comm, comms, is_backend_nccl(self%backend), pencils)

    block
      type(create_args) :: args

      args%platform = platform
      args%helper = self%helper
      args%effort = effort
      args%backend = self%backend
      args%force_effort = .false.
      args%base_type = base_dtype

      do d = 1_int8, ndims - 1_int8
        args%datatype_id = best_forward_ids(d)
        call self%plans(d)%p%create(pencils(d), pencils(d + 1), base_storage, args)
        args%datatype_id = best_backward_ids(d)
        call self%plans(-d)%p%create(pencils(d + 1), pencils(d), base_storage, args)
      enddo
      if ( self%is_z_slab ) then
        ! Extra direct X<->Z pair used by the Z-slab optimization.
        args%datatype_id = best_forward_ids(3)
        call self%plans(3)%p%create(pencils(1), pencils(3), base_storage, args)
        args%datatype_id = best_backward_ids(3)
        call self%plans(-3)%p%create(pencils(3), pencils(1), base_storage, args)
      endif
    endblock
    te = MPI_Wtime()
    WRITE_INFO("Time spent creating final plans: "//to_str(te - ts)//" [s]")

    ! --- Step 9: buffer sizing and auxiliary memory ---------------------------
    ! min_buffer_size is expressed in real32 elements, hence the scaling below.
    call get_local_sizes(pencils, alloc_size=self%min_buffer_size)
    self%min_buffer_size = self%min_buffer_size * (base_storage / FLOAT_STORAGE_SIZE)
    call alloc_and_set_aux(platform, self%helper, self%backend, cart_comm, self%aux, self%paux, self%plans, self%is_aux_alloc)

    deallocate( best_decomposition, comm_dims, transposed_dims )
    error_code = DTFFT_SUCCESS
  end function create

  logical function get_async_active(self)
  !! Returns .true. if any of the plans is running asynchronously
    class(transpose_plan),   intent(in)    :: self     !! Transposition class
    integer(int32)  :: plan_id   !! Signed plan index

    get_async_active = .false.
    ! Query every allocated handle; unallocated slots (e.g. index 0) are skipped.
    do plan_id = lbound(self%plans, dim=1), ubound(self%plans, dim=1)
      if ( .not. allocated( self%plans(plan_id)%p ) ) cycle
      if ( self%plans(plan_id)%p%get_async_active() ) get_async_active = .true.
    enddo
  end function get_async_active

  subroutine execute(self, in, out, transpose_type, exec_type, error_code)
  !! Executes a single transposition between two buffers, dispatching
  !! to the handle selected by ``transpose_type``.
    class(transpose_plan),        intent(inout) :: self           !! Transposition class
    type(c_ptr),                  intent(in)    :: in             !! Incoming buffer
    type(c_ptr),                  intent(in)    :: out            !! Resulting buffer
    type(dtfft_transpose_t),      intent(in)    :: transpose_type !! Type of transpose to execute
    type(async_exec_t),           intent(in)    :: exec_type      !! Type of execution (sync/async)
    integer(int32),    optional,  intent(out)   :: error_code     !! Error code
    real(real32), pointer :: src(:)        !! Fortran view of the incoming buffer
    real(real32), pointer :: dst(:)        !! Fortran view of the resulting buffer
    type(execute_args)    :: exec_kwargs   !! Additional arguments forwarded to the handle
    integer(int32)        :: local_error   !! Error code reported by the handle

    REGION_BEGIN('Transpose '//TRANSPOSE_NAMES(transpose_type%val), COLOR_TRANSPOSE_PALLETTE(transpose_type%val))
    call c_f_pointer(in, src, [self%min_buffer_size])
    call c_f_pointer(out, dst, [self%min_buffer_size])

    exec_kwargs%exec_type = exec_type
    exec_kwargs%stream = self%stream
    if ( .not. self%is_aux_alloc ) then
      ! No auxiliary buffer exists. Point at the source buffer instead so the
      ! handle never dereferences a null pointer; it will not be used.
      exec_kwargs%p1 => src
    else
      exec_kwargs%p1 => self%paux
    endif
    call self%plans(transpose_type%val)%p%execute(src, dst, exec_kwargs, local_error)
    if ( present( error_code ) ) error_code = local_error
    REGION_END('Transpose '//TRANSPOSE_NAMES(transpose_type%val))
  end subroutine execute

  subroutine execute_end(self, in, out, transpose_type, error_code)
  !! Completes a previously started asynchronous transposition
  !! on the handle selected by ``transpose_type``.
    class(transpose_plan),        intent(inout) :: self           !! Transposition class
    type(c_ptr),                  intent(in)    :: in             !! Incoming buffer
    type(c_ptr),                  intent(in)    :: out            !! Resulting buffer
    type(dtfft_transpose_t),      intent(in)    :: transpose_type !! Type of transpose
    integer(int32),               intent(out)   :: error_code     !! Error code
    real(real32),   pointer :: src(:)       !! Fortran view of the incoming buffer
    real(real32),   pointer :: dst(:)       !! Fortran view of the resulting buffer
    type(execute_args)      :: exec_kwargs  !! Arguments forwarded to the handle

    call c_f_pointer(in, src, [self%min_buffer_size])
    call c_f_pointer(out, dst, [self%min_buffer_size])

    exec_kwargs%p1 => src
    exec_kwargs%p2 => dst
    exec_kwargs%stream = self%stream
    call self%plans(transpose_type%val)%p%execute_end(exec_kwargs, error_code)
  end subroutine execute_end

  subroutine destroy(self)
  !! Releases all resources owned by the transposition plan
    class(transpose_plan),    intent(inout) :: self       !! Transposition class
    integer(int32) :: free_ierr                           !! Error code from memory release (ignored)

    ! Release the auxiliary buffer first, if one was ever allocated.
    if ( self%is_aux_alloc ) then
      call self%mem_free(self%aux, free_ierr)
      self%paux => null()
      self%is_aux_alloc = .false.
    endif

    ! Tear down and deallocate every transpose handle.
    if ( allocated( self%plans ) ) then
      call destroy_plans(self%plans)
      deallocate( self%plans )
    endif

    call self%helper%destroy()
! # ifdef DTFFT_WITH_NVSHMEM
!       if ( is_backend_nvshmem(self%backend) ) then
!         call nvshmem_finalize()
!       endif
! # endif
! #ifdef DTFFT_WITH_CUDA
!     if ( self%platform == DTFFT_PLATFORM_CUDA  ) then
!       call cache%cleanup()
!     endif
! #endif
  end subroutine destroy

  logical function get_z_slab(self)
  !! Reports whether the plan was built with the Z-slab optimization enabled
    class(transpose_plan),   intent(in)    :: self      !! Transposition class

    get_z_slab = self%is_z_slab
  end function get_z_slab

  subroutine allocate_plans(plans, backend)
  !! Allocates array of plans.
  !!
  !! Every slot receives a concrete handle whose dynamic type is selected by
  !! `backend`: datatype-based handles for `DTFFT_BACKEND_MPI_DATATYPE`,
  !! generic handles for everything else.
    type(plan_t),           intent(inout) :: plans(:)   !! Plans to allocate
    type(dtfft_backend_t),  intent(in)    :: backend    !! Backend to use
    integer(int32) :: i                                 !! Loop counter

    ! `backend` is loop-invariant: decide the handle type once instead of
    ! re-evaluating the comparison on every iteration.
    if ( backend == DTFFT_BACKEND_MPI_DATATYPE ) then
      do i = 1, size(plans)
        allocate( transpose_handle_datatype :: plans(i)%p )
      enddo
    else
      do i = 1, size(plans)
        allocate( transpose_handle_generic :: plans(i)%p )
      enddo
    endif
  end subroutine allocate_plans

  subroutine destroy_plans(plans)
  !! Destroys and deallocates every allocated handle in the array
    type(plan_t),           intent(inout) :: plans(:) !! Plans to destroy
    integer(int32) :: idx                             !! Loop counter

    do idx = 1, size(plans)
      ! Slots that were never allocated are simply skipped.
      if ( .not. allocated(plans(idx)%p) ) cycle
      call plans(idx)%p%destroy()
      deallocate( plans(idx)%p )
    enddo
  end subroutine destroy_plans

  subroutine autotune_grid_decomposition(                                                             &
    platform, dims, transposed_dims, base_comm, effort, base_dtype, base_storage, stream,             &
    best_forward_ids, best_backward_ids, best_decomposition, backend, min_execution_time, best_backend)
  !! Runs through all possible grid decompositions and selects the best one based on the lowest average execution time.
  !!
  !! Every divisor `i` of the communicator size up to its square root produces
  !! two candidate decompositions, `1 x i x (comm_size/i)` and the swapped
  !! `1 x (comm_size/i) x i`. Each candidate is timed by `autotune_grid`;
  !! candidates reporting a non-positive time (not applicable for these dims)
  !! are discarded.
    type(dtfft_platform_t),           intent(in)    :: platform
      !! Platform to use
    integer(int32),                   intent(in)    :: dims(:)
      !! Global sizes of the transform requested
    integer(int32),                   intent(in)    :: transposed_dims(:,:)
      !! Transposed dimensions
    TYPE_MPI_COMM,                    intent(in)    :: base_comm
      !! 3D comm
    type(dtfft_effort_t),             intent(in)    :: effort
      !! How thoroughly `dtFFT` searches for the optimal plan
    TYPE_MPI_DATATYPE,                intent(in)    :: base_dtype
      !! Base MPI_Datatype
    integer(int64),                   intent(in)    :: base_storage
      !! Number of bytes needed to store single element
    type(dtfft_stream_t),             intent(in)    :: stream
      !! Stream to use
    integer(int8),                    intent(inout) :: best_forward_ids(:)
      !! Best Datatype ids for forward plan
    integer(int8),                    intent(inout) :: best_backward_ids(:)
      !! Best Datatype ids for backward plan
    integer(int32),                   intent(out)   :: best_decomposition(:)
      !! Best decomposition found
    type(dtfft_backend_t),  optional, intent(in)    :: backend
      !! GPU Backend to test. Should be passed only when effort is `DTFFT_ESTIMATE` or `DTFFT_MEASURE`
    real(real32),           optional, intent(out)   :: min_execution_time
      !! Elapsed time for best plan selected
    type(dtfft_backend_t),  optional, intent(out)   :: best_backend
      !! Best backend selected
    integer(int8)   :: ndims                             !! Number of dimensions requested
    type(dtfft_backend_t) :: best_backend_               !! Best backend reported for the current candidate
    integer(int32)  :: comm_size, square_root, i, current_timer, k, ierr
    real(real32) :: current_time, elapsed_time
    real(real32), allocatable :: timers(:)               !! Timings of successful candidates
    integer(int32),   allocatable :: decomps(:,:)        !! Decomposition (py, pz) per successful candidate
    type(dtfft_backend_t), allocatable :: backends(:)    !! Best backend per successful candidate
    integer(int8),  allocatable :: forw_ids(:,:)        !! Best Forward ids are stored in forw_ids(:, latest_timer_id)
    integer(int8),  allocatable :: back_ids(:,:)        !! Best Backward ids are stored in back_ids(:, latest_timer_id)
    integer(int8) :: forw(3), back(3)                    !! Scratch datatype ids for one candidate

    call MPI_Comm_size(base_comm, comm_size, ierr)
    ndims = size(dims, kind=int8)

    ! Each divisor pair (i, comm_size/i) contributes at most two candidates,
    ! so 2 * square_root slots are sufficient for all recorded results.
    square_root = int(sqrt(real(comm_size, real64))) + 1
    allocate(timers(2 * square_root))
    allocate(decomps(2, 2 * square_root))
    allocate(backends(2 * square_root))
    allocate(forw_ids(3, 2 * square_root))
    allocate(back_ids(3, 2 * square_root))

    current_timer = 0
    do i = 1, square_root - 1
      if ( mod( comm_size, i ) /= 0 ) cycle

      ! Each trial starts from the caller-provided datatype ids;
      ! autotune_grid may refine them in-place.
      forw(:) = best_forward_ids(:)
      back(:) = best_backward_ids(:)

      call autotune_grid(                                                 &
        platform, dims, transposed_dims, base_comm, effort, base_dtype,   &
        [1, i, comm_size / i], base_storage, stream, .false., forw, back, &
        backend=backend, best_time=current_time,best_backend=best_backend_)
      ! A non-positive time means the grid was rejected (e.g. more processes
      ! than points along a dimension) — record successful candidates only.
      if ( current_time > 0.0 ) then
        current_timer = current_timer + 1
        timers(current_timer) = current_time
        decomps(1, current_timer) = i
        decomps(2, current_timer) = comm_size / i
        forw_ids(:, current_timer) = forw(:)
        back_ids(:, current_timer) = back(:)
        backends(current_timer) = best_backend_
      endif
      ! Also test the transposed decomposition when it is distinct.
      if ( i /= comm_size / i) then
        forw(:) = best_forward_ids(:)
        back(:) = best_backward_ids(:)

        call autotune_grid(                                                 &
          platform, dims, transposed_dims, base_comm, effort, base_dtype,   &
          [1, comm_size / i, i], base_storage, stream, .false., forw, back, &
          backend=backend, best_time=current_time,best_backend=best_backend_)
        if ( current_time > 0.0 ) then
          current_timer = current_timer + 1
          timers(current_timer) = current_time
          decomps(1, current_timer) = comm_size / i
          decomps(2, current_timer) = i
          forw_ids(:, current_timer) = forw(:)
          back_ids(:, current_timer) = back(:)
          backends(current_timer) = best_backend_
        endif
      endif
    enddo

    ! Select the fastest recorded candidate.
    ! NOTE(review): if no candidate produced a positive time, current_timer == 0
    ! and k stays 1, so unset entries of decomps/forw_ids/back_ids/backends are
    ! read below — confirm callers guarantee at least one applicable grid.
    elapsed_time = MAX_REAL32
    k = 1
    do i = 1, current_timer
      if ( timers(i) < elapsed_time ) then
        elapsed_time = timers(i)
        k = i
      endif
    enddo

    best_decomposition(1) = 1
    best_decomposition(2) = decomps(1, k)
    best_decomposition(3) = decomps(2, k)

    best_forward_ids(:) = forw_ids(:, k)
    best_backward_ids(:) = back_ids(:, k)
    if ( present(best_backend) ) best_backend = backends(k)
    if ( present(min_execution_time) ) min_execution_time = elapsed_time
    deallocate( timers, decomps, backends, forw_ids, back_ids )
  end subroutine autotune_grid_decomposition

  subroutine autotune_grid(                                                                                       &
    platform, dims, transposed_dims, base_comm, effort, base_dtype, comm_dims, base_storage, stream, is_z_slab,   &
    best_forward_ids, best_backward_ids, backend, best_time, best_backend)
  !! Creates cartesian grid and runs various backends on it. Returns best backend and execution time.
  !!
  !! A negative `best_time` signals that the decomposition is not applicable
  !! (more processes than grid points along a dimension).
    type(dtfft_platform_t),           intent(in)    :: platform
      !! Platform to create plan for
    integer(int32),                   intent(in)    :: dims(:)
      !! Global sizes of the transform requested
    integer(int32),                   intent(in)    :: transposed_dims(:,:)
      !! Transposed dimensions
    TYPE_MPI_COMM,                    intent(in)    :: base_comm
      !! Basic communicator to create 3d grid from
    type(dtfft_effort_t),             intent(in)    :: effort
      !! How thoroughly `dtFFT` searches for the optimal plan
    TYPE_MPI_DATATYPE,                intent(in)    :: base_dtype
      !! Base MPI_Datatype
    integer(int32),                   intent(in)    :: comm_dims(:)
      !! Number of processors in each dimension
    integer(int64),                   intent(in)    :: base_storage
      !! Number of bytes needed to store single element
    type(dtfft_stream_t),             intent(in)    :: stream
      !! Stream to use
    logical,                          intent(in)    :: is_z_slab
      !! Is Z-slab optimization enabled
    integer(int8),                    intent(inout) :: best_forward_ids(:)
      !! Best Datatype ids for forward plan
    integer(int8),                    intent(inout) :: best_backward_ids(:)
      !! Best Datatype ids for backward plan
    type(dtfft_backend_t),  optional, intent(in)    :: backend
      !! GPU Backend to test. Should be passed only when effort is `DTFFT_ESTIMATE` or `DTFFT_MEASURE`
    type(dtfft_backend_t),  optional, intent(out)   :: best_backend
      !! Best backend selected for the grid
    real(real32),           optional, intent(out)   :: best_time
      !! Elapsed time for best plan selected
    type(pencil), allocatable :: pencils(:)
    character(len=:),             allocatable   :: phase_name           !! Caliper phase name
    integer(int8) :: d, ndims
    TYPE_MPI_COMM, allocatable :: comms(:)
    TYPE_MPI_COMM :: cart_comm
    integer(int32) :: mpi_ierr
    real(real32) :: best_time_  !! Local result; `best_time` may be absent

    ! Never assign to the optional dummy unconditionally: referencing an absent
    ! optional argument is illegal. Accumulate into a local and copy out.
    best_time_ = -1.0
    ndims = size(dims, kind=int8)

    if ( ndims == 3 ) then
      ! A decomposition cannot place more processes than grid points
      ! along a dimension; report failure via the negative time.
      if ( comm_dims(2) > dims(2) .or. comm_dims(3) > dims(3) ) then
        if ( present(best_time) ) best_time = best_time_
        return
      endif
      allocate( phase_name, source = "Testing grid 1x"//to_str(comm_dims(2))//"x"//to_str(comm_dims(3)) )
    else
      allocate( phase_name, source = "Testing grid 1x"//to_str(comm_dims(2)) )
    endif

    WRITE_INFO("")
    WRITE_INFO(phase_name)
    REGION_BEGIN(phase_name, COLOR_AUTOTUNE)

    allocate( comms(ndims), pencils(ndims) )
    call create_pencils_and_comm(transposed_dims, base_comm, comm_dims, cart_comm, comms, pencils)

    call run_autotune_backend(                                                                            &
      platform, comms, cart_comm, effort, base_dtype, pencils, base_storage, stream, is_z_slab,           &
      best_forward_ids, best_backward_ids, backend=backend, best_time=best_time_, best_backend=best_backend)

    ! Release the grid resources created for this test.
    do d = 1, ndims
      call pencils(d)%destroy()
      call MPI_Comm_free(comms(d), mpi_ierr)
    enddo
    call MPI_Comm_free(cart_comm, mpi_ierr)
    deallocate( comms, pencils )
    REGION_END(phase_name)
    if ( present(best_time) ) best_time = best_time_
  end subroutine autotune_grid

  subroutine run_autotune_backend(                                                                &
    platform, comms, cart_comm, effort, base_dtype, pencils, base_storage, stream, is_z_slab,     &
    best_forward_ids, best_backward_ids, backend, best_time, best_backend)
  !! Runs autotune for all backends.
  !!
  !! Builds the list of backends to test (a single user-specified one, or all
  !! valid backends for the platform), then for each backend: allocates
  !! buffers, creates forward/backward transpose plans, runs warmup and timed
  !! transpose cycles, and keeps the backend with the lowest average time.
    type(dtfft_platform_t),           intent(in)    :: platform
      !! Platform to create plan for
    TYPE_MPI_COMM,                    intent(in)    :: comms(:)
      !! 1D comms
    TYPE_MPI_COMM,                    intent(in)    :: cart_comm
      !! 3D Cartesian comm
    type(dtfft_effort_t),             intent(in)    :: effort
      !! How thoroughly `dtFFT` searches for the optimal plan
    TYPE_MPI_DATATYPE,                intent(in)    :: base_dtype
      !! Base MPI_Datatype
    type(pencil),                     intent(in)    :: pencils(:)
      !! Source meta
    integer(int64),                   intent(in)    :: base_storage
      !! Number of bytes needed to store single element
    type(dtfft_stream_t),             intent(in)    :: stream
      !! Stream to use
    logical,                          intent(in)    :: is_z_slab
      !! Is Z-slab optimization enabled
    integer(int8),                    intent(inout) :: best_forward_ids(:)
      !! Best Datatype ids for forward plan
    integer(int8),                    intent(inout) :: best_backward_ids(:)
      !! Best Datatype ids for backward plan
    type(dtfft_backend_t),  optional, intent(in)    :: backend
      !! GPU Backend to test. Should be passed only when effort is `DTFFT_ESTIMATE` or `DTFFT_MEASURE`
    real(real32),           optional, intent(out)   :: best_time
      !! Elapsed time for best backend
    type(dtfft_backend_t),  optional, intent(out)   :: best_backend
      !! Best backend selected
    type(dtfft_backend_t),  allocatable :: backends_to_run(:)   !! Candidate backends
    type(dtfft_backend_t) :: current_backend_id, best_backend_
    logical :: is_udb !! User defined backend: skip config-based filtering when .true.
    real(real32) :: execution_time, avg_execution_time, best_time_
    integer(int32) :: iter, comm_size, mpi_ierr, b, ierr
    ! type(transpose_handle_generic),  allocatable   :: plans(:)
    type(plan_t), allocatable :: plans(:)                       !! Forward plans in slots 1..n, backward in n+1..2n
    integer(int8) :: i, n_transpose_plans
    type(c_ptr) :: in, out, aux                                 !! Raw buffers used for test transpositions
    real(real32), pointer :: pin(:), pout(:), paux(:)           !! Fortran views of the raw buffers
    logical :: is_aux_alloc
    real(real64) :: ts, te                                      !! Host wall-clock timestamps
    ! , need_aux
    integer(int64)         :: alloc_size
#ifdef DTFFT_WITH_CUDA
    type(cudaEvent) :: timer_start, timer_stop                  !! Device-side timers
#endif
    character(len=:), allocatable :: testing_phase
    type(backend_helper)                      :: helper
    integer(int32) :: n_warmup_iters, n_iters
    integer(int64) :: min_buffer_size
    type(create_args) :: create_kwargs
    type(execute_args) :: execute_kwargs
    integer(int32) :: first
    logical :: pipe_enabled, mpi_enabled, dtype_enabled         !! Cached configuration switches

    if ( present(backend) ) then
      ! User pinned a single backend; test it unconditionally.
      allocate( backends_to_run(1) )
      backends_to_run(1) = backend
      is_udb = .true.
    else
      ! NOTE(review): on non-host platforms the first entry of VALID_BACKENDS
      ! is skipped — presumably it is host-only; confirm against dtfft_parameters.
      if ( platform == DTFFT_PLATFORM_HOST ) then
        first = 1
      else
        first = 2
      endif
      allocate(backends_to_run(size(VALID_BACKENDS(first:))))
      do b = 1, size(backends_to_run)
        backends_to_run(b) = VALID_BACKENDS(first - 1 + b)
      enddo
      is_udb = .false.
    endif
    best_backend_ = backends_to_run(1)

    ! Z-slab needs a single X <-> Z transposition; otherwise one plan
    ! per neighbouring pencil pair.
    if ( is_z_slab ) then
      n_transpose_plans = 1
    else
      n_transpose_plans = size(pencils, kind=int8) - 1_int8
    endif

    allocate( plans(2 * n_transpose_plans) )

    call MPI_Comm_size(cart_comm, comm_size, mpi_ierr)

    call helper%create(platform, cart_comm, comms, any(is_backend_nccl(backends_to_run)), pencils)

    ! Buffer size in bytes (elements * bytes-per-element), then converted to
    ! a real32 element count for the c_f_pointer views below.
    call get_local_sizes(pencils, alloc_size=alloc_size)
    alloc_size = alloc_size * base_storage
    min_buffer_size = alloc_size / FLOAT_STORAGE_SIZE

    ! Plans created during autotune always use the cheapest creation effort.
    create_kwargs%effort = DTFFT_ESTIMATE
    create_kwargs%force_effort = .true.
    create_kwargs%platform = platform
    create_kwargs%helper = helper
    create_kwargs%base_type = base_dtype

    execute_kwargs%exec_type = EXEC_BLOCKING
    execute_kwargs%stream = stream

#ifdef DTFFT_WITH_CUDA
    if ( platform == DTFFT_PLATFORM_CUDA ) then
      CUDA_CALL( cudaEventCreate(timer_start) )
      CUDA_CALL( cudaEventCreate(timer_stop) )
    endif
#endif

    n_warmup_iters = get_conf_measure_warmup_iters()
    n_iters = get_conf_measure_iters()
    pipe_enabled = get_conf_pipelined_enabled()
    dtype_enabled = get_conf_datatype_enabled()
    mpi_enabled = get_conf_mpi_enabled()

    best_time_ = MAX_REAL32

    do b = 1, size(backends_to_run)
      current_backend_id = backends_to_run(b)

      ! Skip backend classes disabled via configuration — unless the backend
      ! was explicitly requested by the user (is_udb).
      if ( ((is_backend_pipelined(current_backend_id) .and. .not.pipe_enabled)                           &
            .or.(is_backend_mpi(current_backend_id) .and. .not.mpi_enabled)                             &
            .or.(current_backend_id == DTFFT_BACKEND_MPI_DATATYPE .and. .not.dtype_enabled))             &
            .and. .not.is_udb) cycle
#ifdef DTFFT_WITH_CUDA
      if ( platform == DTFFT_PLATFORM_CUDA ) then
        if ( ( is_backend_nvshmem(current_backend_id) .and. .not.get_conf_nvshmem_enabled())            &
              .and. .not.is_udb) cycle
      else
        ! Device-only backends make no sense on the host platform.
        if ( is_backend_nccl(current_backend_id) .or. is_backend_nvshmem(current_backend_id) ) cycle
      endif
#endif
      call allocate_plans(plans, current_backend_id)

      ! nvshmem buffers must be allocated after plan creation (see below);
      ! all other backends can allocate up front.
      if ( .not. is_backend_nvshmem(current_backend_id) ) then
        call alloc_mem(platform, helper, current_backend_id, cart_comm, alloc_size, in, ierr); DTFFT_CHECK(ierr)
        call alloc_mem(platform, helper, current_backend_id, cart_comm, alloc_size, out, ierr); DTFFT_CHECK(ierr)

        call c_f_pointer(in, pin, [min_buffer_size])
        call c_f_pointer(out, pout, [min_buffer_size])
      endif

      testing_phase = "Testing backend "//dtfft_get_backend_string(current_backend_id)
      REGION_BEGIN(testing_phase, COLOR_AUTOTUNE2)
      WRITE_INFO(testing_phase)

      is_aux_alloc = .false.
      ! PATIENT effort with the datatype backend additionally tunes the
      ! per-transposition datatype ids instead of timing fixed plans.
      if ( current_backend_id == DTFFT_BACKEND_MPI_DATATYPE .and. effort == DTFFT_PATIENT) then
        call run_autotune_datatypes(helper, base_dtype, pencils, base_storage, is_z_slab, best_forward_ids, best_backward_ids, pin, pout, avg_execution_time)
      else
        create_kwargs%backend = current_backend_id

        ! Forward plans occupy slots 1..n, backward plans n+1..2n.
        if ( is_z_slab ) then
          create_kwargs%datatype_id = best_forward_ids(3)
          call plans(1)%p%create(pencils(1), pencils(3), base_storage, create_kwargs)
          create_kwargs%datatype_id = best_backward_ids(3)
          call plans(2)%p%create(pencils(3), pencils(1), base_storage, create_kwargs)
        else
          do i = 1, n_transpose_plans
            create_kwargs%datatype_id = best_forward_ids(i)
            call plans(i)%p%create(pencils(i), pencils(i + 1), base_storage, create_kwargs)
            create_kwargs%datatype_id = best_backward_ids(i)
            call plans(i + n_transpose_plans)%p%create(pencils(i + 1), pencils(i), base_storage, create_kwargs)
          enddo
        endif

        if ( is_backend_nvshmem(current_backend_id) ) then
          !! Symmetric heap can be allocated after nvshmem_init, which is done during plan creation
          call alloc_mem(platform, helper, current_backend_id, cart_comm, alloc_size, in, ierr); DTFFT_CHECK(ierr)
          call alloc_mem(platform, helper, current_backend_id, cart_comm, alloc_size, out, ierr); DTFFT_CHECK(ierr)

          call c_f_pointer(in, pin, [min_buffer_size])
          call c_f_pointer(out, pout, [min_buffer_size])
        endif

        ! p1 must always be associated; fall back to the output buffer
        ! when no auxiliary memory is required.
        call alloc_and_set_aux(platform, helper, current_backend_id, cart_comm, aux, paux, plans, is_aux_alloc)
        if( is_aux_alloc ) then
          execute_kwargs%p1 => paux
        else
          execute_kwargs%p1 => pout
        endif

        REGION_BEGIN("Warmup", COLOR_TRANSPOSE)
        do iter = 1, n_warmup_iters
          do i = 1, 2_int8 * n_transpose_plans
            call plans(i)%p%execute(pin, pout, execute_kwargs, ierr)
          enddo
        enddo
#ifdef DTFFT_WITH_CUDA
        if ( platform == DTFFT_PLATFORM_CUDA ) then
          CUDA_CALL( cudaStreamSynchronize(stream) )
        endif
#endif
        REGION_END("Warmup")

        ! Align all ranks before starting the timers.
        call MPI_Barrier(cart_comm, mpi_ierr)

        REGION_BEGIN("Measure", COLOR_EXECUTE)
        ! Host timing via MPI_Wtime; device timing via CUDA events on the stream.
        if ( platform == DTFFT_PLATFORM_HOST ) then
          ts = MPI_Wtime()
#ifdef DTFFT_WITH_CUDA
        else
          CUDA_CALL( cudaEventRecord(timer_start, stream) )
#endif
        endif
        do iter = 1, n_iters
          do i = 1, 2_int8 * n_transpose_plans
            call plans(i)%p%execute(pin, pout, execute_kwargs, ierr)
          enddo
        enddo
        if ( platform == DTFFT_PLATFORM_HOST ) then
          te = MPI_Wtime()
          execution_time = real(te - ts, real32) * 1000._real32
#ifdef DTFFT_WITH_CUDA
        else
          CUDA_CALL( cudaEventRecord(timer_stop, stream) )
          CUDA_CALL( cudaEventSynchronize(timer_stop) )
          CUDA_CALL( cudaEventElapsedTime(execution_time, timer_start, timer_stop) )
#endif
        endif
        REGION_END("Measure")
        avg_execution_time = report_timings(cart_comm, execution_time, n_iters)
      endif
      ! execution_time = execution_time / real(n_iters, real32)


      ! call MPI_Allreduce(execution_time, min_execution_time, 1, MPI_REAL4, MPI_MIN, cart_comm, mpi_ierr)
      ! call MPI_Allreduce(execution_time, max_execution_time, 1, MPI_REAL4, MPI_MAX, cart_comm, mpi_ierr)
      ! call MPI_Allreduce(execution_time, avg_execution_time, 1, MPI_REAL4, MPI_SUM, cart_comm, mpi_ierr)

      ! avg_execution_time = avg_execution_time / real(comm_size, real32)

      ! WRITE_INFO("  max: "//to_str(real(max_execution_time, real64))//" [ms]")
      ! WRITE_INFO("  min: "//to_str(real(min_execution_time, real64))//" [ms]")
      ! WRITE_INFO("  avg: "//to_str(real(avg_execution_time, real64))//" [ms]")

      if ( avg_execution_time < best_time_ ) then
        best_time_ = avg_execution_time
        best_backend_ = current_backend_id
      endif

      ! Release everything owned by this backend trial.
      call free_mem(platform, helper, current_backend_id, in, ierr)
      call free_mem(platform, helper, current_backend_id, out, ierr)
      if ( is_aux_alloc ) then
        call free_mem(platform, helper, current_backend_id, aux, ierr)
        is_aux_alloc = .false.
      endif

      call destroy_plans(plans)
      REGION_END("Testing backend "//dtfft_get_backend_string(current_backend_id))
    enddo

    deallocate( plans )
#ifdef DTFFT_WITH_CUDA
    if ( platform == DTFFT_PLATFORM_CUDA ) then
      CUDA_CALL( cudaEventDestroy(timer_start) )
      CUDA_CALL( cudaEventDestroy(timer_stop) )
    endif
#endif
    call helper%destroy()

    if ( present(best_time)) best_time = best_time_
    if ( present(best_backend) ) best_backend = best_backend_
  end subroutine run_autotune_backend

  subroutine run_autotune_datatypes(                                                              &
    helper, base_dtype, pencils, base_storage,                                                    &
    is_z_slab, best_forward_ids, best_backward_ids, a, b, elapsed_time)
  !! Searches for the fastest MPI datatype ids for every transposition on the grid.
  !! With Z-slab enabled only the combined X <-> Z transposition is tuned;
  !! otherwise each neighbouring pair of pencils is tuned in turn.
    type(backend_helper),             intent(inout) :: helper
      !! Backend helper
    TYPE_MPI_DATATYPE,                intent(in)    :: base_dtype
      !! Base MPI_Datatype
    type(pencil),                     intent(in)    :: pencils(:)
      !! Source meta
    integer(int64),                   intent(in)    :: base_storage
      !! Number of bytes needed to store single element
    logical,                          intent(in)    :: is_z_slab
      !! Is Z-slab optimization enabled
    integer(int8),                    intent(out)   :: best_forward_ids(:)
      !! Best Datatype ids for forward plan
    integer(int8),                    intent(out)   :: best_backward_ids(:)
      !! Best Datatype ids for backward plan
    real(real32),                     intent(inout) :: a(:)
      !! Source buffer
    real(real32),                     intent(inout) :: b(:)
      !! Target buffer
    real(real32),                     intent(out)   :: elapsed_time
      !! Elapsed time for best plans selected in [ms]
    integer(int8) :: axis       !! Transposition counter

    if ( is_z_slab ) then
      ! Z-slab: single combined transposition, ids stored at slot 3.
      elapsed_time = autotune_transpose_id(helper, pencils(1), pencils(3),                        &
        base_dtype, base_storage, 3_int8, a, b, best_forward_ids(3), best_backward_ids(3))
    else
      elapsed_time = 0._real32
      do axis = 1_int8, size(pencils, kind=int8) - 1_int8
        elapsed_time = elapsed_time                                                               &
        + autotune_transpose_id(helper, pencils(axis), pencils(axis + 1),                         &
            base_dtype, base_storage, axis, a, b, best_forward_ids(axis), best_backward_ids(axis))
      enddo
    endif
    WRITE_INFO("  Execution time on a grid using fastest transpositions: "//to_str(real(elapsed_time, real64))//" [ms]")
  end subroutine run_autotune_datatypes

  function autotune_transpose_id(helper, from, to, base_dtype, base_storage, transpose_name_id, a, b, forward_id, backward_id) result(elapsed_time)
  !! Measures both datatype variants (1 and 2) of the forward and backward
  !! `DTFFT_BACKEND_MPI_DATATYPE` transpose plans between two pencil layouts
  !! and records the fastest variant of each direction.
  !!
  !! Returns combined execution time of the two selected plans
    type(backend_helper),         intent(inout) :: helper               !! Backend helper
    type(pencil),                 intent(in)    :: from                 !! Source meta
    type(pencil),                 intent(in)    :: to                   !! Target meta
    TYPE_MPI_DATATYPE,            intent(in)    :: base_dtype           !! Basic MPI Datatype
    integer(int64),               intent(in)    :: base_storage         !! Number of bytes needed to store Basic MPI Datatype
    integer(int8),                intent(in)    :: transpose_name_id    !! ID of transpose name (from -3 to 3, except 0)
    real(real32),                 intent(inout) :: a(:)                 !! Source buffer
    real(real32),                 intent(inout) :: b(:)                 !! Target buffer
    integer(int8),                intent(out)   :: forward_id           !! Best forward plan ID
    integer(int8),                intent(out)   :: backward_id          !! Best backward plan ID
    real(real32)                                :: elapsed_time         !! Elapsed time for best plans selected
    real(real32)                                :: best_forward         !! Fastest forward time found so far
    real(real32)                                :: best_backward        !! Fastest backward time found so far
    real(real32)                                :: measured             !! Time of the current candidate
    integer(int8)                               :: id                   !! Datatype variant counter

    best_forward  = huge(1._real32)
    best_backward = huge(1._real32)

    do id = 1, 2
      ! Forward direction: `from` -> `to`
      measured = get_plan_execution_time(helper, from, to, base_dtype, base_storage, id, transpose_name_id, a, b)
      if ( measured < best_forward ) then
        best_forward = measured
        forward_id = id
      endif

      ! Backward direction: `to` -> `from`, identified by the negated name id
      measured = get_plan_execution_time(helper, to, from, base_dtype, base_storage, id, -1_int8 * transpose_name_id, a, b)
      if ( measured < best_backward ) then
        best_backward = measured
        backward_id = id
      endif
    enddo
    elapsed_time = best_forward + best_backward
  end function autotune_transpose_id

  function get_plan_execution_time(helper, from, to, base_dtype, base_storage, datatype_id, transpose_name_id, a, b) result(elapsed_time)
  !! Creates transpose plan for backend `DTFFT_BACKEND_MPI_DATATYPE` and executes it `DTFFT_MEASURE_WARMUP_ITERS` + `DTFFT_MEASURE_ITERS` times
  !!
  !! Returns elapsed time
    type(backend_helper),         intent(inout) :: helper               !! Backend helper
    type(pencil),                 intent(in)    :: from                 !! Source meta
    type(pencil),                 intent(in)    :: to                   !! Target meta
    TYPE_MPI_DATATYPE,            intent(in)    :: base_dtype           !! Basic MPI Datatype
    integer(int64),               intent(in)    :: base_storage         !! Number of bytes needed to store Basic MPI Datatype
    integer(int8),                intent(in)    :: datatype_id          !! ID of transpose (1 or 2)
    integer(int8),                intent(in)    :: transpose_name_id    !! ID of transpose name (from -3 to 3, except 0)
    real(real32),                 intent(inout) :: a(:)                 !! Source buffer
    real(real32),                 intent(inout) :: b(:)                 !! Target buffer
    real(real32)                                :: elapsed_time         !! Execution time [ms]
    real(real32)                                :: local_time           !! Raw time measured on this rank
    character(len=:),             allocatable   :: phase_name           !! Caliper phase name
    type(transpose_handle_datatype)             :: plan                 !! Transpose plan
    real(real64)                                :: t_begin, t_end       !! Wall-clock timestamps
    integer(int32)                              :: iter                 !! Counter
    integer(int32)                              :: ierr                 !! Error code
    type(create_args)                           :: create_kwargs        !! Plan creation arguments
    type(execute_args)                          :: exec_kwargs          !! Plan execution arguments
    integer(int32)                              :: n_iters              !! Number of timed iterations

    phase_name = "  Testing plan "//TRANSPOSE_NAMES(transpose_name_id)//", datatype_id = "//to_str(datatype_id)
    REGION_BEGIN(phase_name, 0)
    WRITE_INFO(phase_name)

    create_kwargs%helper = helper
    create_kwargs%base_type = base_dtype
    create_kwargs%datatype_id = datatype_id
    call plan%create(from, to, base_storage, create_kwargs)

    exec_kwargs%exec_type = EXEC_BLOCKING

    ! Untimed warmup iterations
    do iter = 1, get_conf_measure_warmup_iters()
      call plan%execute(a, b, exec_kwargs, ierr)
    enddo

    ! Align all ranks before starting the timer
    call MPI_Barrier(helper%comms(1), ierr)
    n_iters = get_conf_measure_iters()
    t_begin = MPI_Wtime()
    do iter = 1, n_iters
      call plan%execute(a, b, exec_kwargs, ierr)
    enddo
    t_end = MPI_Wtime()

    local_time = real(t_end - t_begin, real32) * 1000._real32
    elapsed_time = report_timings(helper%comms(1), local_time, n_iters, 4)

    call plan%destroy()
    REGION_END(phase_name)
    deallocate(phase_name)
  end function get_plan_execution_time

  function report_timings(comm, elapsed_time, n_iters, space_count) result(max_time)
  !! Reduces per-rank timings over `comm`, prints the min/max/avg
  !! per-iteration times and returns the maximum across ranks in [ms]
    TYPE_MPI_COMM,            intent(in)  :: comm             !! Communicator the measurement ran on
    real(real32),             intent(in)  :: elapsed_time     !! Total elapsed time on this rank [ms]
    integer(int32),           intent(in)  :: n_iters          !! Number of iterations measured
    integer(int32), optional, intent(in)  :: space_count      !! Indentation of report lines (default 2)
    real(real32)                          :: max_time         !! Slowest per-iteration time across ranks [ms]
    real(real32)   :: per_iter_time    !! Per-iteration time on this rank
    real(real32)   :: min_time         !! Fastest per-iteration time across ranks
    real(real32)   :: avg_time         !! Mean per-iteration time across ranks
    integer(int32) :: ierr             !! MPI error code
    integer(int32) :: comm_size        !! Number of ranks in `comm`
    integer(int32) :: indent           !! Effective indentation width

    indent = 2
    if ( present(space_count) ) indent = space_count
    per_iter_time = elapsed_time / real(n_iters, real32)

    call MPI_Allreduce(per_iter_time, min_time, 1, MPI_REAL4, MPI_MIN, comm, ierr)
    call MPI_Allreduce(per_iter_time, max_time, 1, MPI_REAL4, MPI_MAX, comm, ierr)
    call MPI_Allreduce(per_iter_time, avg_time, 1, MPI_REAL4, MPI_SUM, comm, ierr)
    call MPI_Comm_size(comm, comm_size, ierr)
    avg_time = avg_time / real(comm_size, real32)

    WRITE_INFO(repeat(" ", indent)//"max: "//to_str(real(max_time, real64))//" [ms]")
    WRITE_INFO(repeat(" ", indent)//"min: "//to_str(real(min_time, real64))//" [ms]")
    WRITE_INFO(repeat(" ", indent)//"avg: "//to_str(real(avg_time, real64))//" [ms]")
  end function report_timings

  function get_aux_size(self) result(aux_size)
  !! Returns maximum auxiliary memory size needed by transpose plan
    class(transpose_plan), intent(in)    :: self      !! Transposition class
    integer(int64)                       :: aux_size  !! Maximum auxiliary size over all handles

    ! Delegate to the array-level helper over all owned handles.
    aux_size = get_aux_size_generic(self%plans)
  end function get_aux_size

  function get_aux_size_generic(plans) result(aux_size)
  !! Returns maximum auxiliary memory size needed by plans.
  !!
  !! Unallocated handles contribute nothing; an empty `plans` array yields 0.
    type(plan_t),   intent(in)  :: plans(:)       !! Transpose plans
    integer(int64)              :: aux_size       !! Maximum auxiliary memory size needed
    integer(int32)  :: i                          !! Loop counter

    ! Running maximum avoids the temporary work array and — unlike
    ! maxval on a zero-size array, which returns -huge(int64) —
    ! correctly yields 0 for an empty input.
    aux_size = 0_int64
    do i = 1, size(plans)
      if ( allocated( plans(i)%p ) ) then
        aux_size = max(aux_size, plans(i)%p%get_aux_size())
      endif
    enddo
  end function get_aux_size_generic

  subroutine alloc_and_set_aux(platform, helper, backend, cart_comm, aux, paux, plans, is_aux_alloc)
  !! Allocates auxiliary memory according to the backend and sets it to the plans
  !!
  !! The allocation size is the global maximum of the per-process requirements,
  !! so every rank allocates the same amount. When no plan needs auxiliary
  !! memory, nothing is allocated and ``is_aux_alloc`` is ``.false.``.
    type(dtfft_platform_t),       intent(in)                :: platform     !! Execution platform (host or CUDA)
    type(backend_helper),         intent(inout)             :: helper       !! Backend helper
    type(dtfft_backend_t),        intent(in)                :: backend      !! GPU backend
    TYPE_MPI_COMM,                intent(in)                :: cart_comm    !! Cartesian communicator
    type(c_ptr),                  intent(inout)             :: aux          !! Allocatable auxiliary memory
    real(real32),     pointer,    intent(inout)             :: paux(:)      !! Pointer to auxiliary memory
    type(plan_t),                 intent(in)                :: plans(:)     !! Transpose plans whose aux requirements are queried
    logical,                      intent(out)               :: is_aux_alloc !! Is auxiliary memory allocated
    integer(int64) :: max_work_size_local, max_work_size_global
    integer(int32)  :: mpi_ierr
    integer(int32) :: alloc_ierr

    max_work_size_local = get_aux_size_generic(plans)
    ! Agree on a single global size so the collective backends see
    ! identically-sized buffers on every rank.
    call MPI_Allreduce(max_work_size_local, max_work_size_global, 1, MPI_INTEGER8, MPI_MAX, cart_comm, mpi_ierr)

    is_aux_alloc = .false.
    paux => null()
    if ( max_work_size_global > 0 ) then
      call alloc_mem(platform, helper, backend, cart_comm, max_work_size_global, aux, alloc_ierr)
      DTFFT_CHECK(alloc_ierr)
      ! View the raw bytes as a real32 array (4 bytes per element)
      call c_f_pointer(aux, paux, [max_work_size_global / 4_int64])
      is_aux_alloc = .true.
    endif
  end subroutine alloc_and_set_aux

  subroutine create_pencils_and_comm(transposed_dims, old_comm, comm_dims, comm, local_comms, pencils, ipencil)
  !! Creates cartesian communicator together with the per-dimension pencils
  !! describing the data distribution.
    integer(int32),       intent(in)            :: transposed_dims(:,:) !! Global counts in transposed coordinates
    TYPE_MPI_COMM,        intent(in)            :: old_comm             !! Communicator to create cartesian from
    integer(int32),       intent(in)            :: comm_dims(:)         !! Dims in cartesian communicator
    TYPE_MPI_COMM,        intent(out)           :: comm                 !! Cartesian communicator
    TYPE_MPI_COMM,        intent(out)           :: local_comms(:)       !! 1d communicators in cartesian communicator
    type(pencil),         intent(out)           :: pencils(:)           !! Data distributing meta
    type(pencil_init),    intent(in), optional  :: ipencil              !! Pencil passed by user
    !! Axis order used only for the middle (Y) pencil of a 3d plan
    integer(int8), parameter :: Y_PENCIL_ORDER(3) = [1_int8, 3_int8, 2_int8]
    integer(int8)  :: ndims         !! Number of dimensions
    integer(int8)  :: dim           !! Dimension counter
    logical        :: swap_order    !! Whether this pencil uses the swapped axis order

    ndims = size(comm_dims, kind=int8)

    call create_cart_comm(old_comm, comm_dims, comm, local_comms, ipencil=ipencil)

    if ( present(ipencil) ) then
      block
        integer(int32), allocatable :: lstarts(:), lcounts(:)

        ! Seed the first pencil with the user-provided layout
        lstarts = ipencil%starts
        lcounts = ipencil%counts
        do dim = 1, ndims
          swap_order = ( ndims == 3 .and. dim == 2 )
          if ( swap_order ) then
            call pencils(dim)%create(ndims, dim, transposed_dims(:,dim), local_comms, lstarts=lstarts, lcounts=lcounts, order=Y_PENCIL_ORDER)
          else
            call pencils(dim)%create(ndims, dim, transposed_dims(:,dim), local_comms, lstarts=lstarts, lcounts=lcounts)
          endif
          ! Chain the decomposition: the next pencil starts from this layout
          lcounts(:) = pencils(dim)%counts(:)
          lstarts(:) = pencils(dim)%starts(:)
        enddo
      endblock
    else
      do dim = 1, ndims
        swap_order = ( ndims == 3 .and. dim == 2 )
        if ( swap_order ) then
          call pencils(dim)%create(ndims, dim, transposed_dims(:,dim), local_comms, order=Y_PENCIL_ORDER)
        else
          call pencils(dim)%create(ndims, dim, transposed_dims(:,dim), local_comms)
        endif
      enddo
    endif
  end subroutine create_pencils_and_comm

  subroutine create_cart_comm(old_comm, comm_dims, comm, local_comms, ipencil)
  !! Creates cartesian communicator plus one 1d sub-communicator per dimension.
  !!
  !! When ``ipencil`` is present, the user-provided communicators are simply
  !! duplicated instead of building a new cartesian topology.
    TYPE_MPI_COMM,        intent(in)            :: old_comm             !! Communicator to create cartesian from
    integer(int32),       intent(in)            :: comm_dims(:)         !! Dims in cartesian communicator
    TYPE_MPI_COMM,        intent(out)           :: comm                 !! Cartesian communicator
    TYPE_MPI_COMM,        intent(out)           :: local_comms(:)       !! 1d communicators in cartesian communicator
    type(pencil_init),    intent(in), optional  :: ipencil              !! Pencil passed by user
    logical, allocatable :: periods(:)      !! Grid is not periodic
    logical, allocatable :: remain_dims(:)  !! Needed by MPI_Cart_sub
    integer(int8)        :: axis            !! Dimension counter
    integer(int32)       :: ierr            !! Error code
    integer(int8)        :: ndims           !! Number of dimensions
    TYPE_MPI_COMM        :: cart_comm       !! Temporary cartesian communicator
    TYPE_MPI_COMM        :: sub_comm        !! Temporary 1d sub-communicator

    ndims = size(comm_dims, kind=int8)

    ! User supplied a decomposition: duplicate its communicators verbatim
    if ( present( ipencil ) ) then
      call MPI_Comm_dup(old_comm, comm, ierr)
      do axis = 1, ndims
        call MPI_Comm_dup(ipencil%comms(axis), local_comms(axis), ierr)
      enddo
      return
    endif

    allocate( periods(ndims), source = .false. )
    call MPI_Cart_create(old_comm, int(ndims, int32), comm_dims, periods, .true., cart_comm, ierr)
    call create_subcomm_include_all(cart_comm, comm)
    if ( GET_MPI_VALUE(comm) == GET_MPI_VALUE(MPI_COMM_NULL) ) INTERNAL_ERROR("comm == MPI_COMM_NULL")

    allocate( remain_dims(ndims), source = .false. )
    do axis = 1, ndims
      remain_dims(axis) = .true.
      call MPI_Cart_sub(cart_comm, remain_dims, sub_comm, ierr)
      call create_subcomm_include_all(sub_comm, local_comms(axis))
      if ( GET_MPI_VALUE(local_comms(axis)) == GET_MPI_VALUE(MPI_COMM_NULL) ) INTERNAL_ERROR("local_comms(dim) == MPI_COMM_NULL: dim = "//to_str(axis))
      ! The derived communicator stays valid after its parent is freed,
      ! so the temporary can be released right away.
      call MPI_Comm_free(sub_comm, ierr)
      remain_dims(axis) = .false.
    enddo
    call MPI_Comm_free(cart_comm, ierr)
    deallocate(remain_dims, periods)
  end subroutine create_cart_comm

  function get_backend(self) result(backend_id)
  !! Returns plan GPU backend
    class(transpose_plan), intent(in) :: self        !! Transposition class
    type(dtfft_backend_t)             :: backend_id  !! Backend configured for this plan

    backend_id = self%backend
  end function get_backend

  subroutine mem_alloc(self, comm, alloc_bytes, ptr, error_code)
  !! Allocates memory based on selected backend
  !!
  !! Type-bound wrapper that forwards to the module-level `alloc_mem`
  !! using this plan's platform, helper and backend.
    class(transpose_plan),  intent(inout) :: self           !! Transposition class
    TYPE_MPI_COMM,          intent(in)    :: comm           !! MPI communicator
    integer(int64),         intent(in)    :: alloc_bytes    !! Number of bytes to allocate
    type(c_ptr),            intent(out)   :: ptr            !! Pointer to the allocated memory
    integer(int32),         intent(out)   :: error_code     !! Error code

    call alloc_mem(self%platform, self%helper, self%backend, comm, alloc_bytes, ptr, error_code)
  end subroutine mem_alloc

  subroutine mem_free(self, ptr, error_code)
  !! Frees memory allocated with mem_alloc
  !!
  !! Type-bound wrapper that forwards to the module-level `free_mem`
  !! using this plan's platform, helper and backend.
    class(transpose_plan),  intent(inout) :: self           !! Transposition class
    type(c_ptr),            intent(in)    :: ptr            !! Pointer to the memory to free
    integer(int32),         intent(out)   :: error_code     !! Error code

    call free_mem(self%platform, self%helper, self%backend, ptr, error_code)
  end subroutine mem_free

  subroutine alloc_mem(platform, helper, backend, comm, alloc_bytes, ptr, error_code)
  !! Allocates memory based on ``backend``
  !!
  !! Host platform allocates regular host memory; the CUDA platform
  !! dispatches to NCCL / NVSHMEM / plain cudaMalloc depending on ``backend``.
  !! Any failure is reported as DTFFT_ERROR_ALLOC_FAILED.
    type(dtfft_platform_t), intent(in)    :: platform       !! Execution platform (host or CUDA)
    type(backend_helper),   intent(inout) :: helper         !! Backend helper
    type(dtfft_backend_t),  intent(in)    :: backend        !! GPU backend
    TYPE_MPI_COMM,          intent(in)    :: comm           !! MPI communicator
    integer(int64),         intent(in)    :: alloc_bytes    !! Number of bytes to allocate
    type(c_ptr),            intent(out)   :: ptr            !! Pointer to the allocated memory
    integer(int32),         intent(out)   :: error_code     !! Error code
#ifdef DTFFT_WITH_CUDA
    integer(int64)  :: free_mem_avail, total_mem_avail
#endif

    error_code = DTFFT_SUCCESS
    if ( platform == DTFFT_PLATFORM_HOST ) then
      ptr = mem_alloc_host(alloc_bytes)
#ifdef DTFFT_WITH_CUDA
    else
      CUDA_CALL( cudaMemGetInfo(free_mem_avail, total_mem_avail) )
# ifdef DTFFT_DEBUG
      block
        integer(int64) :: min_mem, max_mem, min_free_mem, max_free_mem
        integer(int32) :: mpi_err

        call MPI_Allreduce(alloc_bytes, max_mem, 1, MPI_INTEGER8, MPI_MAX, comm, mpi_err)
        call MPI_Allreduce(alloc_bytes, min_mem, 1, MPI_INTEGER8, MPI_MIN, comm, mpi_err)
        call MPI_Allreduce(free_mem_avail, max_free_mem, 1, MPI_INTEGER8, MPI_MAX, comm, mpi_err)
        call MPI_Allreduce(free_mem_avail, min_free_mem, 1, MPI_INTEGER8, MPI_MIN, comm, mpi_err)
        WRITE_DEBUG("Trying to allocate "//to_str(min_mem)//"/"//to_str(max_mem)//" (min/max) bytes for backend: '"//dtfft_get_backend_string(backend)//"'")
        WRITE_DEBUG("Free memory available: "//to_str(min_free_mem)//"/"//to_str(max_free_mem)//" (min/max) bytes")
      endblock
# endif
      ! Fail fast instead of letting the driver allocation fail later
      if ( alloc_bytes > free_mem_avail ) then
        error_code = DTFFT_ERROR_ALLOC_FAILED
        return
      endif
      if ( is_backend_nccl(backend) ) then
# ifdef DTFFT_WITH_NCCL
#   ifdef NCCL_HAVE_MEMALLOC
        error_code = ncclMemAlloc(ptr, alloc_bytes)
#   else
        error_code = cudaMalloc(ptr, alloc_bytes)
#   endif
#   ifdef NCCL_HAVE_COMMREGISTER
        if ( error_code == cudaSuccess .and. helper%should_register ) then
          block
            type(c_ptr), allocatable :: temp(:,:)
            type(c_ptr) :: handle

            ! Grow the (ptr, handle) registry when it is full
            if ( size(helper%nccl_register, dim=2) == helper%nccl_register_size ) then
              allocate( temp(2, helper%nccl_register_size + NCCL_REGISTER_PREALLOC_SIZE) )
              ! Null-fill so registry scans never compare against
              ! uninitialized entries in the grown tail.
              temp(:, :) = c_null_ptr
              ! Bug fix: copy BOTH rows. Previously only row 2 (handles) was
              ! copied, dropping row 1 (registered pointers) on every grow and
              ! breaking pointer lookup during deregistration in `free_mem`.
              temp(:, 1:helper%nccl_register_size) = helper%nccl_register(:, 1:helper%nccl_register_size)
              deallocate( helper%nccl_register )
              call move_alloc(temp, helper%nccl_register)
            endif
            helper%nccl_register_size = helper%nccl_register_size + 1

            NCCL_CALL( ncclCommRegister(helper%nccl_comm, ptr, alloc_bytes, handle) )
            helper%nccl_register(1, helper%nccl_register_size) = ptr
            helper%nccl_register(2, helper%nccl_register_size) = handle
            WRITE_DEBUG("Registered pointer "//to_str(transfer(ptr, int64)))
          endblock
        endif
#   endif
# else
        INTERNAL_ERROR("not DTFFT_WITH_NCCL")
# endif
    else if ( is_backend_nvshmem(backend) ) then
# ifdef DTFFT_WITH_NVSHMEM
      block
        integer(int64)  :: max_alloc_bytes
        integer(int32)  :: mpi_err
        ! nvshmem_malloc is symmetric: every PE must request the same size
        call MPI_Allreduce(alloc_bytes, max_alloc_bytes, 1, MPI_INTEGER8, MPI_MAX, comm, mpi_err)
        ptr = nvshmem_malloc(max_alloc_bytes)
        if ( is_null_ptr(ptr) ) error_code = DTFFT_ERROR_ALLOC_FAILED
      endblock
# else
      INTERNAL_ERROR("not DTFFT_WITH_NVSHMEM")
# endif
    else
      error_code = cudaMalloc(ptr, alloc_bytes)
    endif
#endif
    endif
    ! Normalize any backend-specific failure code to the dtFFT error
    if ( error_code /= DTFFT_SUCCESS ) error_code = DTFFT_ERROR_ALLOC_FAILED
  end subroutine alloc_mem

  subroutine free_mem(platform, helper, backend, ptr, error_code)
  !! Frees memory based on ``backend``
  !!
  !! Host platform frees host memory; the CUDA platform dispatches to
  !! NCCL / NVSHMEM / plain cudaFree depending on ``backend``. For NCCL with
  !! registration support, the pointer is first deregistered from the
  !! communicator and removed from the registry kept in ``helper``.
    type(dtfft_platform_t),         intent(in)    :: platform       !! Execution platform (host or CUDA)
    type(backend_helper),           intent(inout) :: helper         !! Backend helper
    type(dtfft_backend_t),          intent(in)    :: backend        !! GPU backend
    type(c_ptr),                    intent(in)    :: ptr            !! Pointer to the memory to free
    integer(int32),                 intent(out)   :: error_code     !! Error code

    error_code = DTFFT_SUCCESS
    if ( platform == DTFFT_PLATFORM_HOST ) then
      call mem_free_host(ptr)
#ifdef DTFFT_WITH_CUDA
    else
      if ( is_backend_nccl(backend) ) then
# ifdef NCCL_HAVE_COMMREGISTER
      if ( helper%should_register ) then
        block
          integer(int32) :: i

          ! Scan the registry for this pointer and deregister its handle.
          ! NOTE(review): the loop does not `exit` after a match, and freed
          ! slots are nulled in place while `nccl_register_size` is
          ! decremented, which can leave holes that a later registration
          ! (which appends at index nccl_register_size + 1) may overwrite —
          ! verify this free/register interleaving is intended.
          do i = 1, size(helper%nccl_register, dim=2)
            if ( .not. is_same_ptr(ptr, helper%nccl_register(1, i)) ) cycle
            NCCL_CALL( ncclCommDeregister(helper%nccl_comm, helper%nccl_register(2, i)) )
            helper%nccl_register(1, i) = c_null_ptr
            helper%nccl_register(2, i) = c_null_ptr
            helper%nccl_register_size = helper%nccl_register_size - 1
            WRITE_DEBUG("Pointer "//to_str(transfer(ptr, int64))//" has been removed from registry")
          enddo
        endblock
      endif
# endif
# ifdef DTFFT_WITH_NCCL
#   ifdef NCCL_HAVE_MEMALLOC
      error_code = ncclMemFree(ptr)
#   else
      error_code = cudaFree(ptr)
#   endif
# else
      INTERNAL_ERROR("not DTFFT_WITH_NCCL")
# endif
    else if ( is_backend_nvshmem(backend) ) then
# ifdef DTFFT_WITH_NVSHMEM
      ! nvshmem_free reports no status; errors surface inside NVSHMEM itself
      call nvshmem_free(ptr)
# else
      INTERNAL_ERROR("not DTFFT_WITH_NVSHMEM")
# endif
    else
      error_code = cudaFree(ptr)
    endif
#endif
  endif
    ! Normalize any backend-specific failure code to the dtFFT error
    if ( error_code /= DTFFT_SUCCESS ) error_code = DTFFT_ERROR_FREE_FAILED
  end subroutine free_mem
end module dtfft_transpose_plan