dtfft_transpose_plan_host.F90 Source File


This file depends on

sourcefile~~dtfft_transpose_plan_host.f90~~EfferentGraph sourcefile~dtfft_transpose_plan_host.f90 dtfft_transpose_plan_host.F90 sourcefile~dtfft_abstract_transpose_plan.f90 dtfft_abstract_transpose_plan.F90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_interface_nvtx.f90 dtfft_interface_nvtx.F90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_parameters.f90 dtfft_parameters.F90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_pencil.f90 dtfft_pencil.F90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_handle_host.f90 dtfft_transpose_handle_host.F90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_transpose_handle_host.f90 sourcefile~dtfft_utils.f90 dtfft_utils.F90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_backend.f90 dtfft_abstract_backend.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_config.f90 dtfft_config.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_nccl.f90 dtfft_interface_nccl.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_nccl.f90 sourcefile~dtfft_interface_nvshmem.f90 dtfft_interface_nvshmem.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_nvshmem.f90 sourcefile~dtfft_nvrtc_kernel.f90 dtfft_nvrtc_kernel.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_interface_nvtx.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_transpose_handle_host.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_transpose_handle_host.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_transpose_handle_host.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_interface_nccl.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nccl.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_nccl.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nvshmem.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_nvshmem.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_cuda.f90 dtfft_interface_cuda.F90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_interface_nvrtc.f90 dtfft_interface_nvrtc.F90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_nvrtc.f90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_interface_cuda_runtime.f90

Files dependent on this one

sourcefile~~dtfft_transpose_plan_host.f90~~AfferentGraph sourcefile~dtfft_transpose_plan_host.f90 dtfft_transpose_plan_host.F90 sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan_host.f90 sourcefile~dtfft.f90 dtfft.F90 sourcefile~dtfft.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_api.f90 dtfft_api.F90 sourcefile~dtfft_api.f90->sourcefile~dtfft_plan.f90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
#include "dtfft_config.h"
module dtfft_transpose_plan_host
!! This module describes [[transpose_plan_host]] class
use iso_fortran_env,                only: int8, int32, int64, real32, real64, output_unit
use dtfft_abstract_transpose_plan,  only: abstract_transpose_plan, create_cart_comm
use dtfft_pencil,                   only: pencil, get_local_sizes
use dtfft_parameters
use dtfft_transpose_handle_host,    only: transpose_handle_host
use dtfft_utils
#include "dtfft_mpi.h"
#include "dtfft_profile.h"
#include "dtfft_private.h"
implicit none
private
public :: transpose_plan_host

  integer(int8), save :: FORWARD_PLAN_IDS(3)
  integer(int8), save :: BACKWARD_PLAN_IDS(3)


  type, extends(abstract_transpose_plan) :: transpose_plan_host
  !! Host transpose plan
  private
    type(transpose_handle_host), allocatable :: fplans(:)   !! Forward plans
    type(transpose_handle_host), allocatable :: bplans(:)   !! Backward plans
  contains
  ! private
    procedure :: create_private
    procedure :: execute_private
    procedure :: destroy
    procedure, nopass,      private :: get_plan_execution_time
    procedure, pass(self),  private :: autotune_transpose_id
    procedure, pass(self),  private :: autotune_mpi_datatypes
    procedure, pass(self),  private :: autotune_grid_decomposition
    procedure, pass(self),  private :: autotune_grid
  end type transpose_plan_host

contains

  function create_private(self, dims, transposed_dims, base_comm, comm_dims, effort, base_dtype, base_storage, is_custom_cart_comm, cart_comm, comms, pencils) result(error_code)
  !! Creates transposition plans
    class(transpose_plan_host),     intent(inout) :: self                 !! Transposition class
    integer(int32),                 intent(in)    :: dims(:)              !! Global sizes of the transform requested
    integer(int32),                 intent(in)    :: transposed_dims(:,:) !! Transposed sizes of the transform requested
    TYPE_MPI_COMM,                  intent(in)    :: base_comm            !! Base communicator
    integer(int32),                 intent(in)    :: comm_dims(:)         !! Number of MPI Processes in all directions
    type(dtfft_effort_t),           intent(in)    :: effort               !! How thoroughly `dtFFT` searches for the optimal plan
    TYPE_MPI_DATATYPE,              intent(in)    :: base_dtype           !! Base MPI_Datatype
    integer(int64),                 intent(in)    :: base_storage         !! Number of bytes needed to store single element
    logical,                        intent(in)    :: is_custom_cart_comm  !! Is custom Cartesian communicator provided by user
    TYPE_MPI_COMM,                  intent(out)   :: cart_comm            !! Cartesian communicator
    TYPE_MPI_COMM,                  intent(out)   :: comms(:)             !! Array of 1d communicators
    type(pencil),                   intent(out)   :: pencils(:)           !! Pencils
    integer(int32)                                :: error_code           !! Error code
    integer(int32),                 allocatable   :: best_comm_dims(:)
    integer(int8) :: d, ndims, n_transpose_plans
    integer(int8), allocatable :: best_forward_ids(:), best_backward_ids(:)
    integer(int32) :: comm_size, ierr

    FORWARD_PLAN_IDS(1) = get_datatype_from_env("DTYPE_X_Y"); BACKWARD_PLAN_IDS(1) = get_datatype_from_env("DTYPE_Y_X")
    FORWARD_PLAN_IDS(2) = get_datatype_from_env("DTYPE_Y_Z"); BACKWARD_PLAN_IDS(2) = get_datatype_from_env("DTYPE_Z_Y")
    FORWARD_PLAN_IDS(3) = get_datatype_from_env("DTYPE_X_Z"); BACKWARD_PLAN_IDS(3) = get_datatype_from_env("DTYPE_Z_X")

    ndims = size(dims, kind=int8)

    allocate( best_comm_dims, source=comm_dims )

    n_transpose_plans = ndims - 1_int8; if( self%is_z_slab ) n_transpose_plans = n_transpose_plans + 1_int8
    allocate( best_forward_ids(n_transpose_plans), best_backward_ids(n_transpose_plans) )
    ! Setting default values
    ! Values are defined during compilation
    do d = 1, n_transpose_plans
      best_forward_ids(d) = FORWARD_PLAN_IDS(d)
      best_backward_ids(d) = BACKWARD_PLAN_IDS(d)
    enddo

    call MPI_Comm_size(base_comm, comm_size, ierr)
    ! With custom cart comm we can only search for best Datatypes
    ! only if effort == DTFFT_PATIENT
    if ( (  is_custom_cart_comm                                 &
            .or. comm_size == 1                                 &
            .or. ndims == 2_int8                                &
            .or. self%is_z_slab                                 &
          ) .and. effort == DTFFT_PATIENT ) then
      block
        integer(int32) :: dummy
        real(real64) :: dummy_timer(1)
        integer(int32) :: dummy_decomp(size(comm_dims), 1)
        integer(int8)  :: forw_ids(n_transpose_plans, 1), back_ids(n_transpose_plans, 1)

        dummy = 1
        forw_ids(:, 1) = best_forward_ids(:)
        back_ids(:, 1) = best_backward_ids(:)
        call self%autotune_grid(                                &
          base_comm, comm_dims,                                 &
          dims, transposed_dims,                                &
          effort, base_dtype, base_storage,                &
          dummy, dummy_timer, dummy_decomp, forw_ids, back_ids  &
        )
        best_forward_ids(:) = forw_ids(:, 1)
        best_backward_ids(:) = back_ids(:, 1)
      endblock
    else if ( ndims == 3                                        &
              .and. .not.is_custom_cart_comm                    &
              .and. .not.self%is_z_slab                         &
              .and. effort%val >= DTFFT_MEASURE%val        &
              .and. comm_size > 1 ) then
      call self%autotune_grid_decomposition(                    &
        dims, transposed_dims, base_comm,                       &
        effort, n_transpose_plans,                         &
        base_dtype, base_storage,                               &
        best_comm_dims, best_forward_ids, best_backward_ids     &
      )
    endif

    if ( effort == DTFFT_PATIENT ) then
      WRITE_INFO(repeat("*", 50))
      WRITE_INFO("DTFFT_PATIENT: Selected transpose ids:")
      do d = 1, n_transpose_plans
        WRITE_INFO("    "//TRANSPOSE_NAMES( d)//": "//int_to_str( best_forward_ids(d) ))
        WRITE_INFO("    "//TRANSPOSE_NAMES(-d)//": "//int_to_str( best_backward_ids(d) ))
      enddo
      WRITE_INFO(repeat("*", 50))
    endif

    call create_cart_comm(base_comm, best_comm_dims, cart_comm, comms)
    do d = 1, ndims
      call pencils(d)%create(ndims, d, transposed_dims(:,d), comms)
    enddo

    allocate( self%fplans(n_transpose_plans), self%bplans(n_transpose_plans) )

    do d = 1_int8, ndims - 1_int8
      call self%fplans(d)%create(comms(d + 1), pencils(d), pencils(d + 1), base_dtype, base_storage, best_forward_ids(d))
      call self%bplans(d)%create(comms(d + 1), pencils(d + 1), pencils(d), base_dtype, base_storage, best_backward_ids(d))
    enddo
    if ( self%is_z_slab ) then
      call self%fplans(3)%create(cart_comm, pencils(1), pencils(3), base_dtype, base_storage, best_forward_ids(3))
      call self%bplans(3)%create(cart_comm, pencils(3), pencils(1), base_dtype, base_storage, best_backward_ids(3))
    endif

#ifdef DTFFT_WITH_CUDA
    self%backend = DTFFT_BACKEND_MPI_DATATYPE
#endif

    deallocate(best_comm_dims)
    deallocate(best_forward_ids, best_backward_ids)
    error_code = DTFFT_SUCCESS
  end function create_private

  subroutine execute_private(self, in, out, transpose_type)
  !! Executes single transposition
    class(transpose_plan_host),   intent(inout) :: self         !! Transposition class
    real(real32),                 intent(inout) :: in(:)          !! Incoming buffer
    real(real32),                 intent(inout) :: out(:)         !! Resulting buffer
    type(dtfft_transpose_t),      intent(in)    :: transpose_type !! Type of transpose to execute

    if ( transpose_type%val > 0 ) then
      call self%fplans(transpose_type%val)%execute(in, out)
    else
      call self%bplans(abs(transpose_type%val))%execute(in, out)
    endif
  end subroutine execute_private

  subroutine destroy(self)
  !! Destroys transposition plans
    class(transpose_plan_host),     intent(inout) :: self         !! Transposition class
    integer(int8) :: i

    do i = 1, size(self%bplans, kind=int8)
      call self%fplans(i)%destroy()
      call self% bplans(i)%destroy()
    enddo
    deallocate(self%fplans)
    deallocate(self% bplans)
  end subroutine destroy

  subroutine autotune_grid_decomposition(self, dims, transposed_dims, base_comm, effort, n_transpose_plans, base_dtype, base_storage, best_comm_dims, best_forward_ids, best_backward_ids)
  !! Runs through all possible grid decompositions and selects the best one based on the lowest average execution time
    class(transpose_plan_host),   intent(in)    :: self                 !! Abstract plan
    integer(int32),               intent(in)    :: dims(:)              !! Global dims
    integer(int32),               intent(in)    :: transposed_dims(:,:) !! Transposed dims
    TYPE_MPI_COMM,                intent(in)    :: base_comm            !! Base communicator
    type(dtfft_effort_t),         intent(in)    :: effort               !! How thoroughly `dtFFT` searches for the optimal plan
    integer(int8),                intent(in)    :: n_transpose_plans    !! Number of transpose plans to test
    TYPE_MPI_DATATYPE,            intent(in)    :: base_dtype           !! Base MPI_Datatype
    integer(int64),               intent(in)    :: base_storage         !! Number of bytes needed to store single element
    integer(int32),               intent(out)   :: best_comm_dims(:)    !! Best communicator dimensions
    integer(int8),                intent(inout) :: best_forward_ids(:)  !! Best Datatype ids for forward plan
    integer(int8),                intent(inout) :: best_backward_ids(:) !! Best Datatype ids for backward plan
    integer(int8)   :: ndims
    integer(int32)  :: comm_size, square_root, i, current_timer, k, ierr
    real(real64)    :: min_time
    real(real64),     allocatable :: timers(:)
    integer(int32),   allocatable :: decomps(:,:)
    integer(int8),    allocatable :: forw_ids(:,:), back_ids(:,:)
    real(real64),    parameter :: MaxR8P  =  huge(1._real64)

    call MPI_Comm_size(base_comm, comm_size, ierr)
    ndims = size(dims, kind=int8)

    square_root = int(sqrt(real(comm_size, real64))) + 1
    allocate(timers(2 * square_root))
    allocate(decomps(2, 2 * square_root))
    allocate(forw_ids(n_transpose_plans, 2 * square_root))
    allocate(back_ids(n_transpose_plans, 2 * square_root))

    do i = 1, 2 * square_root
      do k = 1, n_transpose_plans
        forw_ids(k, i) = FORWARD_PLAN_IDS(k)
        back_ids(k, i) = BACKWARD_PLAN_IDS(k)
      enddo
    enddo

    current_timer = 1
    do i = 1, square_root - 1
      if ( mod( comm_size, i ) /= 0 ) cycle

      call self%autotune_grid(base_comm, [1, i, comm_size / i], dims, transposed_dims, effort, base_dtype, base_storage, current_timer, timers, decomps, forw_ids, back_ids)
      if ( i /= comm_size / i) then
        call self%autotune_grid(base_comm, [1, comm_size / i, i], dims, transposed_dims, effort, base_dtype, base_storage, current_timer, timers, decomps, forw_ids, back_ids)
      endif
    enddo

    min_time = MaxR8P
    k = 1
    do i = 1, current_timer - 1
      if ( timers(i) < min_time ) then
        min_time = timers(i)
        k = i
      endif
    enddo

    best_comm_dims(1) = 1
    best_comm_dims(2) = decomps(1, k)
    best_comm_dims(3) = decomps(2, k)
    WRITE_INFO(repeat("*", 50))
    WRITE_INFO("DTFFT_MEASURE: Selected MPI grid 1x"//int_to_str(best_comm_dims(2))//"x"//int_to_str(best_comm_dims(3)))
    if ( effort == DTFFT_PATIENT ) then
      best_forward_ids(:) = forw_ids(:, k)
      best_backward_ids(:) = back_ids(:, k)
    else
      WRITE_INFO(repeat("*", 50))
    endif

    deallocate(timers, decomps, forw_ids, back_ids)
  end subroutine autotune_grid_decomposition

  subroutine autotune_grid(self, base_comm, comm_dims, dims, transposed_dims, effort, base_dtype, base_storage, latest_timer_id, timers, decomps, forw_ids, back_ids)
  !! Creates cartesian communicator and executes various datatypes on it
    class(transpose_plan_host),   intent(in)    :: self                 !! Abstract plan
    TYPE_MPI_COMM,                intent(in)    :: base_comm            !! Base communicator
    integer(int32),               intent(in)    :: comm_dims(:)         !! Number of MPI Processes in Y and Z directions
    integer(int32),               intent(in)    :: dims(:)              !! Global dims
    integer(int32),               intent(in)    :: transposed_dims(:,:) !! Transposed dims
    type(dtfft_effort_t),         intent(in)    :: effort               !! How thoroughly `dtFFT` searches for the optimal plan
    TYPE_MPI_DATATYPE,            intent(in)    :: base_dtype           !! Basic MPI Datatype
    integer(int64),               intent(in)    :: base_storage         !! Number of bytes needed to store Basic MPI Datatype
    integer(int32),               intent(inout) :: latest_timer_id      !! Current timer id
    real(real64),                 intent(inout) :: timers(:)            !! Time of current function execution is stored in timers(latest_timer_id)
    integer(int32),               intent(inout) :: decomps(:,:)         !! Current decomposition is stored in decomps(:, latest_timer_id)
    integer(int8),                intent(inout) :: forw_ids(:,:)        !! Best Forward ids are stored in forw_ids(:, latest_timer_id)
    integer(int8),                intent(inout) :: back_ids(:,:)        !! Best Backward ids are stored in back_ids(:, latest_timer_id)
    character(len=:),             allocatable   :: phase_name           !! Caliper phase name
    integer(int32)          :: ierr
    integer(int8)           :: d
    type(pencil), allocatable :: pencils(:)
    TYPE_MPI_COMM           :: comm
    real(real64)            :: tf, tb
    TYPE_MPI_COMM,  allocatable           :: comms(:)
    real(real32), allocatable :: a(:), b(:)
    integer(int64)         :: alloc_size
    integer(int8) :: ndims

    ndims = size(comm_dims, kind=int8)

    if ( ndims == 3 ) then
      if ( comm_dims(2) > dims(2) .or. comm_dims(3) > dims(3) ) return
      allocate( phase_name, source = "Testing grid 1x"//int_to_str(comm_dims(2))//"x"//int_to_str(comm_dims(3)) )
    else
      allocate( phase_name, source = "Testing grid 1x"//int_to_str(comm_dims(2)) )
    endif

    WRITE_INFO(repeat("=", 50))
    WRITE_INFO(phase_name)
    PHASE_BEGIN(phase_name, 0)

    ! comm_dims(1) = 1
    ! comm_dims(2) = ny
    ! comm_dims(3) = nz

    allocate( pencils(ndims) )
    allocate( comms(ndims) )

    call create_cart_comm(base_comm, comm_dims, comm, comms)
    do d = 1, ndims
      call pencils(d)%create(ndims, d, transposed_dims(:,d), comms)
    enddo

    call get_local_sizes(pencils, alloc_size=alloc_size)
    alloc_size = alloc_size * base_storage / FLOAT_STORAGE_SIZE

    allocate(a(alloc_size))
    allocate(b(alloc_size))

    if ( effort == DTFFT_PATIENT ) then
      call self%autotune_mpi_datatypes(pencils, comm, comms, base_dtype, base_storage, a, b, forw_ids(:, latest_timer_id), back_ids(:, latest_timer_id), timers(latest_timer_id))
      WRITE_INFO("Execution time on a grid using fastest transpositions: "//double_to_str(timers(latest_timer_id)))
    else
      timers(latest_timer_id) = 0.0_real64
      do d = 1_int8, ndims - 1_int8
        tf = self%get_plan_execution_time(comms(d + 1), comm, pencils(d), pencils(d + 1), base_dtype, base_storage, FORWARD_PLAN_IDS(d), d, a, b)
        tb = self%get_plan_execution_time(comms(d + 1), comm, pencils(d + 1), pencils(d), base_dtype, base_storage, BACKWARD_PLAN_IDS(d), d, a, b)
        timers(latest_timer_id) = timers(latest_timer_id) + tf + tb
      enddo
      WRITE_INFO("Average execution time on a grid: "//double_to_str(timers(latest_timer_id)))
    endif
    do d = 1_int8, ndims - 1_int8
      decomps(d, latest_timer_id) = comm_dims(d + 1)
      ! decomps(1, latest_timer_id) = comm_dims(2)
      ! decomps(2, latest_timer_id) = comm_dims(3)
    enddo

    WRITE_INFO(repeat("=", 50))
    latest_timer_id = latest_timer_id + 1

    deallocate(a, b)
    do d = 1, ndims
      call pencils(d)%destroy()
      call MPI_Comm_free(comms(d), ierr)
    enddo
    call MPI_Comm_free(comm, ierr)
    deallocate( pencils, comms )
    PHASE_END(phase_name)
  end subroutine autotune_grid

  subroutine autotune_mpi_datatypes(self, pencils, cart_comm, comms, base_dtype, base_storage, a, b, forward_ids, backward_ids, elapsed_time)
  !! 
    class(transpose_plan_host), intent(in)    :: self               !! Host plan
    type(pencil),               intent(in)    :: pencils(:)         !! Array of pencils
    TYPE_MPI_COMM,              intent(in)    :: cart_comm          !! 3D Cartesian comm
    TYPE_MPI_COMM,              intent(in)    :: comms(:)           !! Array of 1d communicators
    TYPE_MPI_DATATYPE,          intent(in)    :: base_dtype         !! Basic MPI Datatype
    integer(int64),             intent(in)    :: base_storage       !! Number of bytes needed to store Basic MPI Datatype
    real(real32),               intent(inout) :: a(:)               !! Work buffer
    real(real32),               intent(inout) :: b(:)               !! Work buffer
    integer(int8),              intent(inout) :: forward_ids(:)     !! Forward plan IDs
    integer(int8),              intent(inout) :: backward_ids(:)    !! Backward plan IDs
    real(real64),               intent(out)   :: elapsed_time       !! Elapsed time
    integer(int8) :: dim        !! Counter
    integer(int8) :: ndims      !! Number of dimensions

    ndims = size(pencils, kind=int8)
    elapsed_time = 0._real64
    if( self%is_z_slab ) then
      elapsed_time = self%autotune_transpose_id(cart_comm, cart_comm, pencils(1), pencils(3), base_dtype, base_storage, 3_int8, a, b, forward_ids(3), backward_ids(3))
    else
      do dim = 1_int8, size(pencils, kind=int8) - 1_int8
        elapsed_time = elapsed_time + &
          self%autotune_transpose_id(comms(dim + 1), cart_comm, pencils(dim), pencils(dim + 1), base_dtype, base_storage, dim, a, b, forward_ids(dim), backward_ids(dim))
      enddo
    endif
  end subroutine autotune_mpi_datatypes

  function autotune_transpose_id(self, comm, cart_comm, from, to, base_dtype, base_storage, transpose_name_id, a, b, forward_id, backward_id) result(elapsed_time)
  !! Creates forward and backward transpose plans bases on source and target data distributing,
  !! executes them `DTFFT_MEASURE_ITERS` times ( 4 * `DTFFT_MEASURE_ITERS` iterations total )
  !!
  !! Returns elapsed time for best plans selected
    class(transpose_plan_host),   intent(in)    :: self                 !! Abstract plan
    TYPE_MPI_COMM,                intent(in)    :: comm                 !! 1D comm in case of pencils, 3D comm in case of z_slabs
    TYPE_MPI_COMM,                intent(in)    :: cart_comm            !! 3D Cartesian comm
    type(pencil),                 intent(in)    :: from                 !! Source meta
    type(pencil),                 intent(in)    :: to                   !! Target meta
    TYPE_MPI_DATATYPE,            intent(in)    :: base_dtype           !! Basic MPI Datatype
    integer(int64),               intent(in)    :: base_storage         !! Number of bytes needed to store Basic MPI Datatype
    integer(int8),                intent(in)    :: transpose_name_id    !! ID of transpose name (from -3 to 3, except 0)
    real(real32),                 intent(inout) :: a(:)                 !! Source buffer
    real(real32),                 intent(inout) :: b(:)                 !! Target buffer
    integer(int8),                intent(out)   :: forward_id           !! Best forward plan ID
    integer(int8),                intent(out)   :: backward_id          !! Best backward plan ID
    real(real64)                                :: elapsed_time         !! Elapsed time for best plans selected
    real(real64)                                :: forward_time         !! Forward plan execution time
    real(real64)                                :: backward_time        !! Backward plan execution time
    real(real64)                                :: time                 !! Timer
    integer(int8)                               :: datatype_id         !! Counter

    forward_time = huge(1._real64)
    backward_time = huge(1._real64)

    do datatype_id = 1, 2
      time = self%get_plan_execution_time(comm, cart_comm, from, to, base_dtype, base_storage, datatype_id, transpose_name_id, a, b)
      if ( time < forward_time ) then
        forward_time = time
        forward_id = datatype_id
      endif

      time = self%get_plan_execution_time(comm, cart_comm, to, from, base_dtype, base_storage, datatype_id, -1_int8 * transpose_name_id, a, b)
      if ( time < backward_time ) then
        backward_time = time
        backward_id = datatype_id
      endif
    enddo
    elapsed_time = forward_time + backward_time
  end function autotune_transpose_id

  function get_plan_execution_time(comm, cart_comm, from, to, base_dtype, base_storage, datatype_id, transpose_name_id, a, b) result(elapsed_time)
  !! Creates transpose plan and executes it `DTFFT_MEASURE_WARMUP_ITERS` + `DTFFT_MEASURE_ITERS` times
  !!
  !! Returns elapsed time
    TYPE_MPI_COMM,                intent(in)    :: comm                 !! 1D comm in case of pencils, 3D comm in case of z_slabs
    TYPE_MPI_COMM,                intent(in)    :: cart_comm            !! 3D Cartesian comm
    type(pencil),                 intent(in)    :: from                 !! Source meta
    type(pencil),                 intent(in)    :: to                   !! Target meta
    TYPE_MPI_DATATYPE,            intent(in)    :: base_dtype           !! Basic MPI Datatype
    integer(int64),               intent(in)    :: base_storage         !! Number of bytes needed to store Basic MPI Datatype
    integer(int8),                intent(in)    :: datatype_id          !! ID of transpose (1 or 2)
    integer(int8),                intent(in)    :: transpose_name_id    !! ID of transpose name (from -3 to 3, except 0)
    real(real32),                 intent(inout) :: a(:)                 !! Source buffer
    real(real32),                 intent(inout) :: b(:)                 !! Target buffer
    real(real64)                                :: elapsed_time         !! Execution time
    character(len=:),             allocatable   :: phase_name           !! Caliper phase name
    type(transpose_handle_host)                 :: plan                 !! Transpose plan
    real(real64)                                :: ts, te               !! Timers
    integer(int32)                              :: iter                 !! Counter
    integer(int32)                              :: ierr                 !! Error code
    integer(int32)                              :: comm_size            !! Size of ``cart_comm``
    integer(int32)                              :: n_warmup_iters
    integer(int32)                              :: n_iters

    allocate( phase_name, source="    Testing plan "//TRANSPOSE_NAMES(transpose_name_id)//", datatype_id = "//int_to_str(datatype_id) )
    PHASE_BEGIN(phase_name, 0)
    WRITE_INFO(phase_name)
    call plan%create(comm, from, to, base_dtype, base_storage, datatype_id)

    n_warmup_iters = get_iters_from_env(.true.)
    n_iters = get_iters_from_env(.false.)

    do iter = 1, n_warmup_iters
      call plan%execute(a, b)
    enddo

    ts = MPI_Wtime()
    do iter = 1, n_iters
      call plan%execute(a, b)
    enddo
    te = MPI_Wtime()
    call MPI_Allreduce(te - ts, elapsed_time, 1, MPI_REAL8, MPI_SUM, cart_comm, ierr)
    call MPI_Comm_size(cart_comm, comm_size, ierr)
    elapsed_time = real(elapsed_time, real64) / real(comm_size, real64)

    call plan%destroy()
    PHASE_END(phase_name)
    deallocate(phase_name)
    WRITE_INFO("        Average execution time: "//double_to_str(elapsed_time))
  end function get_plan_execution_time
end module dtfft_transpose_plan_host