dtfft_plan.F90 Source File


This file depends on

sourcefile~~dtfft_plan.f90~~EfferentGraph sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft_abstract_executor.f90 dtfft_abstract_executor.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_abstract_transpose_plan.f90 dtfft_abstract_transpose_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_config.f90 dtfft_config.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_executor_cufft_m.f90 dtfft_executor_cufft_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_cufft_m.f90 sourcefile~dtfft_executor_fftw_m.f90 dtfft_executor_fftw_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_fftw_m.f90 sourcefile~dtfft_executor_mkl_m.f90 dtfft_executor_mkl_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_mkl_m.f90 sourcefile~dtfft_executor_vkfft_m.f90 dtfft_executor_vkfft_m.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_executor_vkfft_m.f90 sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_nvshmem.f90 dtfft_interface_nvshmem.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_interface_nvshmem.f90 sourcefile~dtfft_interface_nvtx.f90 dtfft_interface_nvtx.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_nvrtc_kernel.f90 dtfft_nvrtc_kernel.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_parameters.f90 dtfft_parameters.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_pencil.f90 dtfft_pencil.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_cuda.f90 dtfft_transpose_plan_cuda.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan_cuda.f90 sourcefile~dtfft_transpose_plan_host.f90 dtfft_transpose_plan_host.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan_host.f90 sourcefile~dtfft_utils.f90 dtfft_utils.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_executor.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_abstract_executor.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_executor.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_executor.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_nvshmem.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_backend.f90 dtfft_abstract_backend.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_interface_nccl.f90 dtfft_interface_nccl.F90 sourcefile~dtfft_abstract_transpose_plan.f90->sourcefile~dtfft_interface_nccl.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_cufft.f90 dtfft_interface_cufft.F90 sourcefile~dtfft_executor_cufft_m.f90->sourcefile~dtfft_interface_cufft.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_fftw_m.f90 dtfft_interface_fftw_m.F90 sourcefile~dtfft_executor_fftw_m.f90->sourcefile~dtfft_interface_fftw_m.f90 sourcefile~dtfft_executor_mkl_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_mkl_m.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_executor_mkl_m.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_mkl_m.f90 dtfft_interface_mkl_m.F90 sourcefile~dtfft_executor_mkl_m.f90->sourcefile~dtfft_interface_mkl_m.f90 sourcefile~dtfft_interface_mkl_native_m.f90 dtfft_interface_mkl_native_m.F90 sourcefile~dtfft_executor_mkl_m.f90->sourcefile~dtfft_interface_mkl_native_m.f90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_abstract_executor.f90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_vkfft_m.f90 dtfft_interface_vkfft_m.F90 sourcefile~dtfft_executor_vkfft_m.f90->sourcefile~dtfft_interface_vkfft_m.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nvshmem.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_nvshmem.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nvtx.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_cuda.f90 dtfft_interface_cuda.F90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_interface_nvrtc.f90 dtfft_interface_nvrtc.F90 sourcefile~dtfft_nvrtc_kernel.f90->sourcefile~dtfft_interface_nvrtc.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_pencil.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_cuda.f90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_interface_nvrtc.f90 sourcefile~dtfft_transpose_handle_cuda.f90 dtfft_transpose_handle_cuda.F90 sourcefile~dtfft_transpose_plan_cuda.f90->sourcefile~dtfft_transpose_handle_cuda.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_abstract_transpose_plan.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_transpose_handle_host.f90 dtfft_transpose_handle_host.F90 sourcefile~dtfft_transpose_plan_host.f90->sourcefile~dtfft_transpose_handle_host.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_backend.f90->sourcefile~dtfft_interface_nccl.f90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cuda.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_cufft.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cufft.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_mkl_m.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nccl.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_nccl.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_nvrtc.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_interface_vkfft_m.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_vkfft_m.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_nvrtc_kernel.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_cufftmp.f90 dtfft_backend_cufftmp.F90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_cufftmp.f90 sourcefile~dtfft_backend_mpi.f90 dtfft_backend_mpi.F90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_mpi.f90 sourcefile~dtfft_backend_nccl.f90 dtfft_backend_nccl.F90 sourcefile~dtfft_transpose_handle_cuda.f90->sourcefile~dtfft_backend_nccl.f90 sourcefile~dtfft_transpose_handle_host.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_transpose_handle_host.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_transpose_handle_host.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_interface_nvshmem.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_pencil.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_cufftmp.f90->sourcefile~dtfft_interface_cufft.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_backend_mpi.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_abstract_backend.f90 sourcefile~dtfft_backend_nccl.f90->sourcefile~dtfft_interface_nccl.f90

Files dependent on this one

sourcefile~~dtfft_plan.f90~~AfferentGraph sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft.f90 dtfft.F90 sourcefile~dtfft.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_api.f90 dtfft_api.F90 sourcefile~dtfft_api.f90->sourcefile~dtfft_plan.f90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
#include "dtfft_config.h"
#include "dtfft.f03"
!------------------------------------------------------------------------------------------------
module dtfft_plan
!! This module describes [[dtfft_plan_t]], [[dtfft_plan_c2c_t]], [[dtfft_plan_r2c_t]] and [[dtfft_plan_r2r_t]] types
use iso_c_binding,                    only: c_loc, c_f_pointer, c_ptr, c_bool, c_null_ptr
use iso_fortran_env,                  only: int8, int32, int64, real32, real64, output_unit, error_unit
use dtfft_abstract_executor,          only: abstract_executor, FFT_1D, FFT_2D, FFT_C2C, FFT_R2C, FFT_R2R
use dtfft_abstract_transpose_plan,    only: abstract_transpose_plan
#ifdef DTFFT_WITH_FFTW
use dtfft_executor_fftw_m,            only: fftw_executor
#endif
#ifdef DTFFT_WITH_MKL
use dtfft_executor_mkl_m,             only: mkl_executor
#endif
#ifdef DTFFT_WITH_CUFFT
use dtfft_executor_cufft_m,           only: cufft_executor
#endif
#ifdef DTFFT_WITH_VKFFT
use dtfft_executor_vkfft_m,           only: vkfft_executor
#endif
use dtfft_config
use dtfft_pencil,                     only: pencil, get_local_sizes_private => get_local_sizes, dtfft_pencil_t
use dtfft_parameters
use dtfft_transpose_plan_host,        only: transpose_plan_host
use dtfft_utils
#ifdef DTFFT_WITH_CUDA
use dtfft_interface_cuda_runtime
use dtfft_nvrtc_kernel,               only: clean_unused_cache
use dtfft_transpose_plan_cuda,        only: transpose_plan_cuda
#endif
#ifdef DTFFT_WITH_NVSHMEM
use dtfft_interface_nvshmem,          only: is_nvshmem_ptr
#endif
#include "dtfft_cuda.h"
#include "dtfft_mpi.h"
#include "dtfft_private.h"
#include "dtfft_profile.h"
implicit none
private
public :: dtfft_plan_t
public :: dtfft_plan_c2c_t
public :: dtfft_plan_r2c_t
public :: dtfft_plan_r2r_t


#define CHECK_INPUT_PARAMETER(param, check_func, code)            \
  if ( .not.check_func(param)) then;                              \
    __FUNC__ = code;                                              \
    return;                                                       \
  endif

#define CHECK_ERROR_AND_RETURN                      \
  if ( ierr /= DTFFT_SUCCESS ) then;                \
    if ( present( error_code ) ) error_code = ierr; \
    WRITE_ERROR( dtfft_get_error_string(ierr) );    \
    return;                                         \
  endif

#define CHECK_ERROR_AND_RETURN_NO_MSG               \
  if ( ierr /= DTFFT_SUCCESS ) then;                \
    if ( present( error_code ) ) error_code = ierr; \
    return;                                         \
  endif

#define CHECK_OPTIONAL_CALL( func )                 \
  ierr = func;                                      \
  CHECK_ERROR_AND_RETURN

  type :: fft_executor
  !! FFT handle
    class(abstract_executor), allocatable :: fft
    !! Executor
  end type fft_executor

  type, abstract :: dtfft_plan_t
  !! Abstract class for all ``dtFFT`` plans
  private
    integer(int8)                                 :: ndims
      !! Number of global dimensions
    integer(int32),                   allocatable :: dims(:)
      !! Global dimensions
    type(dtfft_precision_t)                       :: precision
      !! Precision of transform
    logical                                       :: is_created = .false.
      !! Plan creation flag
    logical                                       :: is_transpose_plan = .false.
      !! Plan is transpose only
    logical                                       :: is_aux_alloc = .false.
      !! Auxiliary buffer is allocated internally
    logical                                       :: is_z_slab = .false.
      !! Using Z-slab optimization
      !!
      !! Only 3D plan.
      !!
      !! When .true., then data is distributed only Z direction and it creates a possibility for optimization:
      !!
      !! - 2 dimensional FFT plan is created for both X and Y dimensions
      !! - Single call to MPI_Alltoall is required to transpose data from X-align to Z align
      !!
      !! For CUDA build this optimization means single CUDA kernel that tranposes data directly from X to Z
    type(dtfft_effort_t)                          :: effort
      !! User defined type of effort
    integer(int64)                                :: storage_size
      !! Single element size in bytes
    type(dtfft_executor_t)                        :: executor
      !! FFT executor type
    TYPE_MPI_COMM                                 :: comm
      !! Grid communicator
    TYPE_MPI_COMM,                    allocatable :: comms(:)
      !! Local 1d communicators
    class(abstract_transpose_plan),   allocatable :: plan
      !! Transpose plan handle
    type(pencil),                     allocatable :: pencils(:)
      !! Information about data aligment and datatypes
    type(dtfft_platform_t)                        :: platform
      !! Execution platform
#ifdef DTFFT_WITH_CUDA
    type(dtfft_stream_t)                          :: stream
      !! CUDA Stream associated with current plan
#endif
    type(c_ptr)                                   :: aux_ptr
      !! Auxiliary pointer
    type(fft_executor),               allocatable :: fft(:)
      !! Internal fft runners
    integer(int32),                   allocatable :: fft_mapping(:)
      !! Memory and plan creation optimization.
      !! In case same FFTs needs to be run in different dimensions
      !! only single FFT plan needs to be created
  contains
  private
    procedure,  pass(self), non_overridable, public :: transpose          !! Performs single transposition
    procedure,  pass(self), non_overridable, public :: transpose_ptr      !! Performs single transposition using type(c_ptr) pointers instead of buffers
    procedure,  pass(self), non_overridable, public :: execute            !! Executes plan
    procedure,  pass(self), non_overridable, public :: execute_ptr        !! Executes plan using type(c_ptr) pointers instead of buffers
    procedure,  pass(self), non_overridable, public :: destroy            !! Destroys plan
    procedure,  pass(self), non_overridable, public :: get_local_sizes    !! Returns local starts and counts in `real` and `fourier` spaces
    procedure,  pass(self), non_overridable, public :: get_alloc_size     !! Wrapper around ``get_local_sizes`` to obtain number of elements only
    procedure,  pass(self), non_overridable, public :: get_z_slab_enabled !! Returns logical value is Z-slab optimization is enabled
    procedure,  pass(self), non_overridable, public :: get_pencil         !! Returns pencil decomposition
    procedure,  pass(self), non_overridable, public :: get_element_size   !! Returns number of bytes required to store single element.
    procedure,  pass(self), non_overridable, public :: get_alloc_bytes    !! Returns minimum number of bytes required to execute plan
    procedure,  pass(self), non_overridable, public :: report             !! Prints plan details
    procedure,  pass(self), non_overridable, public :: mem_alloc_ptr      !! Allocates memory for type(c_ptr)
    generic,                                 public :: mem_alloc =>       &
                                                       mem_alloc_r32_1d,  &
                                                       mem_alloc_r64_1d,  &
                                                       mem_alloc_r32_2d,  &
                                                       mem_alloc_r64_2d,  &
                                                       mem_alloc_r32_3d,  &
                                                       mem_alloc_r64_3d,  &
                                                       mem_alloc_c32_1d,  &
                                                       mem_alloc_c64_1d,  &
                                                       mem_alloc_c32_2d,  &
                                                       mem_alloc_c64_2d,  &
                                                       mem_alloc_c32_3d,  &
                                                       mem_alloc_c64_3d
      !! Allocates memory specific for this plan
    procedure,  pass(self), non_overridable, public :: mem_free_ptr       !! Frees previously allocated memory for type(c_ptr)
    generic,                                 public :: mem_free =>        &
                                                       mem_free_r32_1d,   &
                                                       mem_free_r32_2d,   &
                                                       mem_free_r32_3d,   &
                                                       mem_free_r64_1d,   &
                                                       mem_free_r64_2d,   &
                                                       mem_free_r64_3d,   &
                                                       mem_free_c32_1d,   &
                                                       mem_free_c32_2d,   &
                                                       mem_free_c32_3d,   &
                                                       mem_free_c64_1d,   &
                                                       mem_free_c64_2d,   &
                                                       mem_free_c64_3d
    !! Frees previously allocated memory specific for this plan
#ifdef DTFFT_WITH_CUDA
    procedure,  pass(self), non_overridable, public :: get_platform       !! Returns plan execution platform
    procedure,  pass(self), non_overridable, public :: get_backend        !! Returns selected GPU backend during autotuning
    generic,                                 public :: get_stream       & !! Returns CUDA stream associated with plan
                                                    => get_stream_ptr,  &
                                                       get_stream_int64
    procedure,  pass(self), non_overridable         :: get_stream_ptr     !! Returns CUDA stream associated with plan
    procedure,  pass(self), non_overridable         :: get_stream_int64   !! Returns CUDA stream associated with plan
#endif
    procedure,  pass(self), non_overridable         :: execute_private    !! Executes plan
    procedure,  pass(self), non_overridable         :: check_create_args  !! Check arguments provided to `create` subroutines
    procedure,  pass(self), non_overridable         :: create_private     !! Creates core
    procedure,  pass(self), non_overridable         :: alloc_fft_plans    !! Allocates `fft_executor` classes
    procedure,  pass(self), non_overridable         :: check_aux          !! Checks if aux buffer was passed
                                                                          !! and if not will allocate one internally
    procedure,  pass(self), non_overridable         :: mem_alloc_r32_1d   !! Allocates memory for 1d real32 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_r64_1d   !! Allocates memory for 1d real64 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_r32_2d   !! Allocates memory for 2d real32 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_r64_2d   !! Allocates memory for 2d real64 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_r32_3d   !! Allocates memory for 2d real32 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_r64_3d   !! Allocates memory for 2d real64 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_c32_1d   !! Allocates memory for 1d complex32 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_c64_1d   !! Allocates memory for 1d complex64 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_c32_2d   !! Allocates memory for 2d complex32 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_c64_2d   !! Allocates memory for 2d complex64 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_c32_3d   !! Allocates memory for 3d complex32 pointer
    procedure,  pass(self), non_overridable         :: mem_alloc_c64_3d   !! Allocates memory for 3d complex64 pointer
    procedure,  pass(self), non_overridable         :: mem_free_r32_1d    !! Frees real32 1d pointer
    procedure,  pass(self), non_overridable         :: mem_free_r64_1d    !! Frees real64 1d pointer
    procedure,  pass(self), non_overridable         :: mem_free_r32_2d    !! Frees real32 2d pointer
    procedure,  pass(self), non_overridable         :: mem_free_r64_2d    !! Frees real64 2d pointer
    procedure,  pass(self), non_overridable         :: mem_free_r32_3d    !! Frees real32 3d pointer
    procedure,  pass(self), non_overridable         :: mem_free_r64_3d    !! Frees real64 3d pointer
    procedure,  pass(self), non_overridable         :: mem_free_c32_1d    !! Frees complex32 1d pointer
    procedure,  pass(self), non_overridable         :: mem_free_c64_1d    !! Frees complex64 1d pointer
    procedure,  pass(self), non_overridable         :: mem_free_c32_2d    !! Frees complex32 2d pointer
    procedure,  pass(self), non_overridable         :: mem_free_c64_2d    !! Frees complex64 2d pointer
    procedure,  pass(self), non_overridable         :: mem_free_c32_3d    !! Frees complex32 3d pointer
    procedure,  pass(self), non_overridable         :: mem_free_c64_3d    !! Frees complex64 3d pointer
  end type dtfft_plan_t

  type, abstract, extends(dtfft_plan_t) :: dtfft_core_c2c
  !! Abstract C2C Plan
  private
  contains
  private
    procedure, pass(self), non_overridable          :: create_c2c_internal  !! Creates plan for both C2C and R2C
  end type dtfft_core_c2c

  type, extends(dtfft_core_c2c) :: dtfft_plan_c2c_t
  !! C2C Plan
  private
  contains
  private
    procedure, pass(self), non_overridable,  public :: create => create_c2c   !! Creates C2C plan
  end type dtfft_plan_c2c_t

  type, extends(dtfft_core_c2c) :: dtfft_plan_r2c_t
  !! R2C Plan
  private
    type(pencil)  :: real_pencil
      !! "Real" pencil decomposition info
  contains
  private
    procedure, pass(self), non_overridable,  public :: create => create_r2c   !! Creates R2C plan
  end type dtfft_plan_r2c_t

  type, extends(dtfft_plan_t) :: dtfft_plan_r2r_t
  !! R2R Plan
  private
  contains
  private
    procedure, pass(self),  public  :: create => create_r2r                   !! Creates R2R plan
  end type dtfft_plan_r2r_t

contains

  subroutine transpose(self, in, out, transpose_type, error_code)
  !! Performs single transposition
  !!
  !! @note
  !! Buffers `in` and `out` cannot be the same
  !! @endnote
    class(dtfft_plan_t),        intent(inout) :: self
      !! Abstract plan
    type(*),  target,           intent(inout) :: in(..)
      !! Incoming buffer of any rank and kind. Note that this buffer
      !! will be modified in GPU build
    type(*),  target,           intent(inout) :: out(..)
      !! Resulting buffer of any rank and kind
    type(dtfft_transpose_t),    intent(in)    :: transpose_type
      !! Type of transposition.
    integer(int32),   optional, intent(out)   :: error_code
      !! Optional error code returned to user
    call self%transpose_ptr(c_loc(in), c_loc(out), transpose_type, error_code)
  end subroutine transpose

  subroutine transpose_ptr(self, in, out, transpose_type, error_code)
  !! Performs single transposition using type(c_ptr) pointers instead of buffers
  !!
  !! @note
  !! Buffers `in` and `out` cannot be the same
  !! @endnote
    class(dtfft_plan_t),        intent(inout) :: self
      !! Abstract plan
    type(c_ptr),                intent(in)    :: in
      !! Incoming pointer. Note that values of this pointer
      !! will be modified in GPU build
    type(c_ptr),                intent(in)    :: out
      !! Resulting pointer
    type(dtfft_transpose_t),    intent(in)    :: transpose_type
      !! Type of transposition.
    integer(int32),   optional, intent(out)   :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr    !! Error code

    ierr = DTFFT_SUCCESS
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN
    if ( .not.is_valid_transpose_type(transpose_type)                                           &
         .or. ( self%ndims == 2 .and. abs(transpose_type%val) > 1 )                             &
         .or. (abs(transpose_type%val) == DTFFT_TRANSPOSE_X_TO_Z%val .and..not.self%is_z_slab)) &
      ierr = DTFFT_ERROR_INVALID_TRANSPOSE_TYPE
    CHECK_ERROR_AND_RETURN
    if ( is_same_ptr(in, out) )                                                                 &
      ierr = DTFFT_ERROR_INPLACE_TRANSPOSE
    CHECK_ERROR_AND_RETURN
    select type( self )
    class is ( dtfft_plan_r2c_t )
      ierr = DTFFT_ERROR_R2C_TRANSPOSE_CALLED
      CHECK_ERROR_AND_RETURN
    endselect
#ifdef DTFFT_WITH_CUDA
    if ( self%platform == DTFFT_PLATFORM_CUDA ) then
      ierr = check_device_pointers(in, out, self%plan%get_backend(), c_null_ptr)
      CHECK_ERROR_AND_RETURN
    endif
#endif

    REGION_BEGIN("dtfft_transpose", COLOR_TRANSPOSE)
    call self%plan%execute(in, out, transpose_type)
    REGION_END("dtfft_transpose")
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end subroutine transpose_ptr

  subroutine execute(self, in, out, execute_type, aux, error_code)
  !! Executes plan
    class(dtfft_plan_t),        intent(inout) :: self
      !! Abstract plan
    type(*),  target,           intent(inout) :: in(..)
      !! Incoming buffer of any rank and kind
    type(*),  target,           intent(inout) :: out(..)
      !! Resulting buffer of any rank and kind
    type(dtfft_execute_t),      intent(in)    :: execute_type
      !! Type of execution.
    type(*),  target, optional, intent(inout) :: aux(..)
      !! Optional auxiliary buffer.
      !! Size of buffer must be greater than value
      !! returned by `alloc_size` parameter of [[dtfft_plan_t(type):get_local_sizes]] subroutine
    integer(int32),   optional, intent(out)   :: error_code
      !! Optional error code returned to user
    type(c_ptr)     :: aux_ptr

    aux_ptr = c_null_ptr; if( present(aux) ) aux_ptr = c_loc(aux)
    call self%execute_ptr(c_loc(in), c_loc(out), execute_type, aux_ptr, error_code)
  end subroutine execute

  subroutine execute_ptr(self, in, out, execute_type, aux, error_code)
  !! Executes plan using type(c_ptr) pointers instead of buffers
    class(dtfft_plan_t),        intent(inout) :: self
      !! Abstract plan
    type(c_ptr),                intent(in)    :: in
      !! Incoming pointer. Note that values of this pointer
      !! will be modified in GPU build
    type(c_ptr),                intent(in)    :: out
      !! Resulting pointer
    type(dtfft_execute_t),      intent(in)    :: execute_type
      !! Type of execution.
    type(c_ptr),                intent(in)    :: aux
      !! Optional auxiliary buffer.
      !! Size of buffer must be greater than value
      !! returned by `alloc_size` parameter of [[dtfft_plan_t(type):get_local_sizes]] subroutine
    integer(int32),   optional, intent(out)   :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr     !! Error code
    logical         :: inplace  !! Inplace execution flag

    inplace = is_same_ptr(in, out)
    ierr = DTFFT_SUCCESS
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
      CHECK_ERROR_AND_RETURN
    if ( .not.is_valid_execute_type(execute_type) ) ierr = DTFFT_ERROR_INVALID_TRANSPOSE_TYPE
      CHECK_ERROR_AND_RETURN
    if ( self%is_transpose_plan .and. self%ndims == 2 .and. inplace ) ierr = DTFFT_ERROR_INPLACE_TRANSPOSE
      CHECK_ERROR_AND_RETURN
    if ( is_same_ptr(in, aux) .or. is_same_ptr(out, aux) ) ierr = DTFFT_ERROR_INVALID_AUX
      CHECK_ERROR_AND_RETURN
#ifdef DTFFT_WITH_CUDA
    if ( self%platform == DTFFT_PLATFORM_CUDA ) then
      ierr = check_device_pointers(in, out, self%plan%get_backend(), aux)
        CHECK_ERROR_AND_RETURN
    endif
#endif

    REGION_BEGIN("dtfft_execute", COLOR_EXECUTE)
    call self%check_aux(aux)
    if ( .not.is_null_ptr(aux) ) then
      call self%execute_private( in, out, execute_type, aux, inplace )
    else
      call self%execute_private( in, out, execute_type, self%aux_ptr, inplace )
    endif
    REGION_END("dtfft_execute")
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end subroutine execute_ptr

  subroutine execute_private(self, in, out, execute_type, aux, inplace)
  !! Executes plan with specified auxiliary buffer
    class(dtfft_plan_t),        intent(inout) :: self
      !! Abstract plan
    type(c_ptr),                intent(in)    :: in
      !! Source pointer
    type(c_ptr),                intent(in)    :: out
      !! Target pointer
    type(dtfft_execute_t),      intent(in)    :: execute_type
      !! Type of execution.
    type(c_ptr),                intent(in)    :: aux
      !! Auxiliary pointer.
    logical,                    intent(in)    :: inplace
      !! Inplace execution flag

    if ( self%is_transpose_plan ) then
      select case ( self%ndims )
      case (2)
        select case( execute_type%val )
        case ( DTFFT_EXECUTE_FORWARD%val )
          call self%plan%execute(in, out, DTFFT_TRANSPOSE_X_TO_Y)
        case ( DTFFT_EXECUTE_BACKWARD%val )
          call self%plan%execute(in, out, DTFFT_TRANSPOSE_Y_TO_X)
        endselect
      case (3)
        select case( execute_type%val )
        case ( DTFFT_EXECUTE_FORWARD%val )
          if ( inplace .or. .not. self%is_z_slab ) then
            call self%plan%execute(in, aux, DTFFT_TRANSPOSE_X_TO_Y)
            call self%plan%execute(aux, out, DTFFT_TRANSPOSE_Y_TO_Z)
            return
          endif
          call self%plan%execute(in, out, DTFFT_TRANSPOSE_X_TO_Z)
        case ( DTFFT_EXECUTE_BACKWARD%val )
          if ( inplace .or. .not. self%is_z_slab ) then
            call self%plan%execute(in, aux, DTFFT_TRANSPOSE_Z_TO_Y)
            call self%plan%execute(aux, out, DTFFT_TRANSPOSE_Y_TO_X)
            return
          endif
          call self%plan%execute(in, out, DTFFT_TRANSPOSE_Z_TO_X)
        endselect
      endselect
      return
    endif ! self%is_transpose_plan

    select case ( execute_type%val )
    case ( DTFFT_EXECUTE_FORWARD%val )
      ! 1d direct FFT X direction || 2d X-Y FFT
      call self%fft(1)%fft%execute(in, aux, FFT_FORWARD)
      if ( self%is_z_slab ) then
        ! Transpose X -> Z
        call self%plan%execute(aux, out, DTFFT_TRANSPOSE_X_TO_Z)
        ! 1d direct FFT Z direction
        call self%fft(3)%fft%execute(out, out, FFT_FORWARD)
        return
      endif
      ! Transpose X -> Y
      call self%plan%execute(aux, out, DTFFT_TRANSPOSE_X_TO_Y)
      ! 1d FFT Y direction
      call self%fft(self%fft_mapping(2))%fft%execute(out, out, FFT_FORWARD)
      if ( self%ndims == 2 ) then
        return
      endif
      ! Transpose Y -> Z
      call self%plan%execute(out, aux, DTFFT_TRANSPOSE_Y_TO_Z)
      ! 1d direct FFT Z direction
      call self%fft(self%fft_mapping(3))%fft%execute(aux, out, FFT_FORWARD)
    case ( DTFFT_EXECUTE_BACKWARD%val )
      if ( self%is_z_slab ) then
        ! 1d inverse FFT Z direction
        call self%fft(3)%fft%execute(in, in, FFT_BACKWARD)
        ! Transpose Z -> X
        call self%plan%execute(in, aux, DTFFT_TRANSPOSE_Z_TO_X)
        ! 2d inverse FFT X-Y direction
        call self%fft(1)%fft%execute(aux, out, FFT_BACKWARD)
        return
      endif
      if ( self%ndims == 3 ) then
        ! 1d inverse FFT Z direction
        call self%fft(self%fft_mapping(3))%fft%execute(in, aux, FFT_BACKWARD)
        ! Transpose Z -> Y
        call self%plan%execute(aux, in, DTFFT_TRANSPOSE_Z_TO_Y)
      endif
      ! 1d inverse FFT Y direction
      call self%fft(self%fft_mapping(2))%fft%execute(in, in, FFT_BACKWARD)
      ! Transpose Y -> X
      call self%plan%execute(in, aux, DTFFT_TRANSPOSE_Y_TO_X)
      ! 1d inverse FFT X direction
      call self%fft(1)%fft%execute(aux, out, FFT_BACKWARD)
    endselect
  end subroutine execute_private

  subroutine destroy(self, error_code)
  !! Destroys plan, frees all memory
    class(dtfft_plan_t),        intent(inout) :: self
      !! Abstract plan
    integer(int32), optional,   intent(out)   :: error_code
      !! Optional Error Code returned to user
    integer(int32)  :: d        !! Counter
    integer(int32)  :: ierr     !! Error code

    ierr = DTFFT_SUCCESS
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN

    REGION_BEGIN("dtfft_destroy", COLOR_DESTROY)

    if ( allocated(self%dims) ) deallocate(self%dims)

    select type ( self )
    class is ( dtfft_plan_r2c_t )
      call self%real_pencil%destroy()
    endselect

    if ( allocated(self%pencils) ) then
      do d = 1, self%ndims
        call self%pencils(d)%destroy()
      enddo
      deallocate(self%pencils)
    endif

    if ( self%is_aux_alloc ) then
      call self%mem_free_ptr(self%aux_ptr)
      self%is_aux_alloc = .false.
    endif

    if ( allocated(self%fft) ) then
      do d = 1, self%ndims
        call self%fft(d)%fft%destroy()
        if ( allocated(self%fft(d)%fft) ) deallocate(self%fft(d)%fft)
      enddo
      deallocate(self%fft)
    endif

    self%is_created = .false.
    self%is_aux_alloc = .false.

#ifdef DTFFT_WITH_CUDA
    call destroy_stream()
#endif

    block
      logical     :: is_finalized

      call MPI_Finalized(is_finalized, ierr)

      if ( is_finalized ) ierr = DTFFT_ERROR_MPI_FINALIZED
      CHECK_ERROR_AND_RETURN
    end block

    if ( allocated( self%plan ) ) then
      call self%plan%destroy()
      deallocate( self%plan )
    endif
#ifdef DTFFT_WITH_CUDA
    call clean_unused_cache()
#endif

    if ( allocated(self%comms) ) then
      do d = 1, self%ndims
        call MPI_Comm_free(self%comms(d), ierr)
      enddo
      deallocate(self%comms)
    endif
    call MPI_Comm_free(self%comm, ierr)

    self%ndims = -1
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
    REGION_END("dtfft_destroy")
  end subroutine destroy

  logical function get_z_slab_enabled(self, error_code)
  !! Returns logical value is Z-slab optimization enabled internally
    class(dtfft_plan_t),        intent(in)    :: self
      !! Abstract plan
    integer(int32), optional,   intent(out)   :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr     !! Error code

    ierr = DTFFT_SUCCESS
    get_z_slab_enabled = .false.
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    if ( present( error_code ) ) error_code = ierr
    if ( ierr /= DTFFT_SUCCESS ) return

    get_z_slab_enabled = self%is_z_slab
  end function get_z_slab_enabled

  type(dtfft_pencil_t) function get_pencil(self, dim, error_code)
  !! Returns pencil decomposition
    class(dtfft_plan_t),        intent(in)    :: self
      !! Abstract plan
    integer(int32),             intent(in)    :: dim
      !! Required dimension:
      !!
      !!  - 0 for XYZ layout (real space, R2C only)
      !!  - 1 for XYZ layout
      !!  - 2 for YXZ layout
      !!  - 3 for ZXY layout
      !!
      !! [//]: # (ListBreak)
    integer(int32), optional,   intent(out)   :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr     !! Error code
    integer(int32)  :: min_dim

    ierr = DTFFT_SUCCESS
    ! get_pencil = dtfft_pencil_t(-1,-1,-1,-1)
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN
    select type (self)
    class is ( dtfft_plan_r2c_t )
      min_dim = 0
    class default
      min_dim = 1
    endselect

    if ( dim < min_dim .or. dim > self%ndims ) ierr = DTFFT_ERROR_INVALID_DIM
    CHECK_ERROR_AND_RETURN

    if ( dim == 0 ) then
      select type (self)
      class is ( dtfft_plan_r2c_t )
        get_pencil = self%real_pencil%make_public()
      endselect
    else
      get_pencil = self%pencils(dim)%make_public()
    endif
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end function get_pencil

  integer(int64) function get_element_size(self, error_code)
  !! Returns number of bytes required to store single element.
    class(dtfft_plan_t),        intent(in)    :: self
      !! Abstract plan
    integer(int32), optional,   intent(out)   :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr     !! Error code

    ierr = DTFFT_SUCCESS
    get_element_size = 0
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN
    select type (self)
    class is ( dtfft_plan_r2c_t )
      get_element_size = self%storage_size / 2
    class default
      get_element_size = self%storage_size
    endselect
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end function get_element_size

  integer(int64) function get_alloc_bytes(self, error_code)
  !! Returns minimum number of bytes required to execute plan
    class(dtfft_plan_t),        intent(in)    :: self
      !! Abstract plan
    integer(int32), optional,   intent(out)   :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr         !! Error code
    integer(int64)  :: alloc_size   !! Number of elements required
    integer(int64)  :: element_size !! Size of each element

    ierr = DTFFT_SUCCESS
    get_alloc_bytes = 0
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN

    alloc_size = self%get_alloc_size()
    element_size = self%get_element_size()
    get_alloc_bytes = alloc_size * element_size
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end function get_alloc_bytes

  subroutine report(self, error_code)
  !! Prints plan-related information to stdout
    class(dtfft_plan_t),        intent(in)  :: self
      !! Abstract plan
    integer(int32), optional,   intent(out) :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr                   !! Error code
    integer(int32)  :: comm_dims(self%ndims)  !! Communicator dimensions
    integer(int32)  :: coords(self%ndims)     !! Coordinates of this process in the grid
    logical         :: periods(self%ndims)    !! Periodic dimensions

    ierr = DTFFT_SUCCESS
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN
    WRITE_REPORT("**Plan report**")
      WRITE_REPORT("  dtFFT Version        :  "//int_to_str(dtfft_get_version()))
      WRITE_REPORT("  Number of dimensions :  "//int_to_str(self%ndims))

    call MPI_Cart_get(self%comm, int(self%ndims, int32), comm_dims, periods, coords, ierr)
    if ( self%ndims == 2 ) then
      WRITE_REPORT("  Global dimensions    :  "//int_to_str(self%dims(1))//"x"//int_to_str(self%dims(2)))
      WRITE_REPORT("  Grid decomposition   :  "//int_to_str(comm_dims(1))//"x"//int_to_str(comm_dims(2)))
    else
      WRITE_REPORT("  Global dimensions    :  "//int_to_str(self%dims(1))//"x"//int_to_str(self%dims(2))//"x"//int_to_str(self%dims(3)))
      WRITE_REPORT("  Grid decomposition   :  "//int_to_str(comm_dims(1))//"x"//int_to_str(comm_dims(2))//"x"//int_to_str(comm_dims(3)))
    endif
#ifdef DTFFT_WITH_CUDA
    if ( self%platform == DTFFT_PLATFORM_HOST ) then
      WRITE_REPORT("  Execution platform   :  HOST")
    else
      WRITE_REPORT("  Execution platform   :  CUDA")
    endif
#endif
    select type( self )
    class is ( dtfft_plan_c2c_t )
      WRITE_REPORT("  Plan type            :  Complex-to-Complex")
    class is ( dtfft_plan_r2r_t )
      WRITE_REPORT("  Plan type            :  Real-to-Real")
    class is ( dtfft_plan_r2c_t )
      WRITE_REPORT("  Plan type            :  Real-to-Complex")
    endselect
    if ( self%precision == DTFFT_SINGLE ) then
      WRITE_REPORT("  Plan precision       :  Single")
    else
      WRITE_REPORT("  Plan precision       :  Double")
    endif
    if ( self%is_transpose_plan ) then
      WRITE_REPORT("  FFT Executor type    :  None")
    else
      select case ( self%executor%val )
      case ( DTFFT_EXECUTOR_FFTW3%val )
        WRITE_REPORT("  FFT Executor type    :  FFTW3")
      case ( DTFFT_EXECUTOR_MKL%val )
        WRITE_REPORT("  FFT Executor type    :  MKL")
      case ( DTFFT_EXECUTOR_CUFFT%val )
        WRITE_REPORT("  FFT Executor type    :  CUFFT")
      case ( DTFFT_EXECUTOR_VKFFT%val )
        WRITE_REPORT("  FFT Executor type    :  VkFFT")
      endselect
    endif
    if ( self%ndims == 3 ) then
      if ( self%is_z_slab ) then
        WRITE_REPORT("  Z-slab enabled       :  True")
      else
        WRITE_REPORT("  Z-slab enabled       :  False")
      endif
    endif
#ifdef DTFFT_WITH_CUDA
    if ( self%platform == DTFFT_PLATFORM_CUDA ) then
      WRITE_REPORT("  GPU Backend          :  "//dtfft_get_backend_string(self%plan%get_backend()))
    endif
#endif
    WRITE_REPORT("**End of report**")
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end subroutine report

#ifdef DTFFT_WITH_CUDA
  type(dtfft_platform_t) function get_platform(self, error_code)
    !! Returns execution platform of the plan (HOST or CUDA)
    class(dtfft_plan_t),        intent(in)  :: self
      !! Abstract plan
    integer(int32), optional,   intent(out) :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr     !! Error code

    ierr = DTFFT_SUCCESS
    get_platform = PLATFORM_NOT_SET
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    if ( present( error_code ) ) error_code = ierr
    if ( ierr /= DTFFT_SUCCESS ) return

    get_platform = self%platform
  end function get_platform

  type(dtfft_backend_t) function get_backend(self, error_code)
  !! Returns selected GPU backend during autotuning
    class(dtfft_plan_t),        intent(in)  :: self
      !! Abstract plan
    integer(int32), optional,   intent(out) :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr     !! Error code

    ierr = DTFFT_SUCCESS
    get_backend = BACKEND_NOT_SET
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    if ( present( error_code ) ) error_code = ierr
    if ( ierr /= DTFFT_SUCCESS ) return

    get_backend = self%plan%get_backend()
  end function get_backend

  subroutine get_stream_ptr(self, stream, error_code)
  !! Returns CUDA stream associated with plan
    class(dtfft_plan_t),        intent(in)  :: self
      !! Abstract plan
    type(dtfft_stream_t),       intent(out) :: stream
      !! dtFFT Stream
    integer(int32), optional,   intent(out) :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr     !! Error code

    ierr = DTFFT_SUCCESS
    stream = NULL_STREAM
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN
    if ( self%platform == DTFFT_PLATFORM_HOST ) ierr = DTFFT_ERROR_INVALID_USAGE
    CHECK_ERROR_AND_RETURN

    stream = self%stream
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end subroutine get_stream_ptr

  subroutine get_stream_int64(self, stream, error_code)
  !! Returns CUDA stream associated with plan
    class(dtfft_plan_t),        intent(in)  :: self
      !! Abstract plan
    integer(int64),             intent(out) :: stream
      !! CUDA-Fortran Stream
    integer(int32), optional,   intent(out) :: error_code
      !! Optional error code returned to user
    integer(int32)        :: ierr     !! Error code
    type(dtfft_stream_t)  :: stream_  !! dtFFT Stream

    call self%get_stream(stream_, error_code=ierr)
    if ( ierr == DTFFT_SUCCESS ) stream = dtfft_get_cuda_stream(stream_)
    if ( present( error_code ) ) error_code = ierr
  end subroutine get_stream_int64

  integer(int32) function check_device_pointers(in, out, backend, aux) result(error_code)
  !! Checks if device pointers are provided by user
    type(c_ptr),            intent(in)  :: in
      !! First pointer
    type(c_ptr),            intent(in)  :: out
      !! Second pointer
    type(dtfft_backend_t),  intent(in)  :: backend
      !! Backend. Required to check for `nvshmem` pointer
    type(c_ptr),            intent(in)  :: aux
      !! Optional auxiliary pointer.
    logical(c_bool) :: is_devptr    !! Are pointers device pointers?

    error_code = DTFFT_SUCCESS

    if ( is_backend_nvshmem(backend) ) then
#ifdef DTFFT_WITH_NVSHMEM
      is_devptr = is_nvshmem_ptr(in) .and. is_nvshmem_ptr(out)
      if ( .not. is_null_ptr(aux) ) is_devptr = is_devptr .and. is_nvshmem_ptr(aux)
      if ( .not. is_devptr ) error_code = DTFFT_ERROR_NOT_NVSHMEM_PTR
#endif
    else
      is_devptr = is_device_ptr(in) .and. is_device_ptr(out)
      if ( .not. is_null_ptr(aux) ) is_devptr = is_devptr .and. is_device_ptr(aux)
      if ( .not. is_devptr ) error_code = DTFFT_ERROR_NOT_DEVICE_PTR
    endif
  end function check_device_pointers
#endif

  subroutine get_local_sizes(self, in_starts, in_counts, out_starts, out_counts, alloc_size, error_code)
  !! Obtain local starts and counts in ``real`` and ``fourier`` spaces
    class(dtfft_plan_t),        intent(in)  :: self
      !! Abstract plan
    integer(int32), optional,   intent(out) :: in_starts(:)
      !! Starts of local portion of data in ``real`` space (0-based)
    integer(int32), optional,   intent(out) :: in_counts(:)
      !! Number of elements of local portion of data in 'real' space
    integer(int32), optional,   intent(out) :: out_starts(:)
      !! Starts of local portion of data in ``fourier`` space (0-based)
    integer(int32), optional,   intent(out) :: out_counts(:)
      !! Number of elements of local portion of data in ``fourier`` space
    integer(int64), optional,   intent(out) :: alloc_size
      !! Minimal number of elements required to execute plan
    integer(int32), optional,   intent(out) :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr                 !! Error code

    ierr = DTFFT_SUCCESS
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN
    if ( .not. present(in_starts)       &
    .and..not. present(in_counts)       &
    .and..not. present(out_starts)      &
    .and..not. present(out_counts)      &
    .and..not. present(alloc_size)) ierr = DTFFT_ERROR_INVALID_USAGE
    CHECK_ERROR_AND_RETURN

    select type ( self )
    class is (dtfft_plan_r2c_t)
      if (present( in_starts ) )    in_starts(1:self%ndims)   = self%real_pencil%starts(1:self%ndims)
      if (present( in_counts ) )    in_counts(1:self%ndims)   = self%real_pencil%counts(1:self%ndims)
      call get_local_sizes_private(self%pencils, out_starts=out_starts, out_counts=out_counts, alloc_size=alloc_size)
      if ( present( alloc_size ) ) alloc_size = max(int(product(self%real_pencil%counts), int64), 2 * alloc_size)
    class default
      call get_local_sizes_private(self%pencils, in_starts, in_counts, out_starts, out_counts, alloc_size)
    endselect
#ifdef DTFFT_WITH_CUDA
    if ( is_backend_nvshmem( self%get_backend() ) .and. present(alloc_size) ) then
      call MPI_Allreduce(MPI_IN_PLACE, alloc_size, 1, MPI_INTEGER8, MPI_MAX, self%comm, ierr)
    endif
#endif
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end subroutine get_local_sizes

  function get_alloc_size(self, error_code) result(alloc_size)
  !! Wrapper around ``get_local_sizes`` to obtain number of elements only
    class(dtfft_plan_t),        intent(in)  :: self
      !! Abstract plan
    integer(int32), optional,   intent(out) :: error_code
      !! Optional error code returned to user
    integer(int64)                          :: alloc_size
      !! Minimal number of elements required to execute plan

    call self%get_local_sizes(alloc_size=alloc_size, error_code=error_code)
  end function get_alloc_size

  integer(int32) function create_private(self, dims, sngl_type, sngl_storage_size, dbl_type, dbl_storage_size, comm, precision, effort, executor, kinds)
#define __FUNC__ create_private
  !! Creates core
    class(dtfft_plan_t),              intent(inout) :: self
      !! Abstract plan
    integer(int32),                   intent(in)    :: dims(:)
      !! Global dimensions of transform
    TYPE_MPI_DATATYPE,                intent(in)    :: sngl_type
      !! MPI_Datatype for single precision plan
    integer(int64),                   intent(in)    :: sngl_storage_size
      !! Number of bytes needed to store single element (single precision)
    TYPE_MPI_DATATYPE,                intent(in)    :: dbl_type
      !! MPI_Datatype for double precision plan
    integer(int64),                   intent(in)    :: dbl_storage_size
      !! Number of bytes needed to store single element (double precision)
    TYPE_MPI_COMM,          optional, intent(in)    :: comm
      !! User-defined communicator
    type(dtfft_precision_t),optional, intent(in)    :: precision
      !! Precision of transform: `DTFFT_SINGLE` or `DTFFT_DOUBLE`
    type(dtfft_effort_t),   optional, intent(in)    :: effort
      !! How thoroughly `dtFFT` searches for the optimal plan
    type(dtfft_executor_t), optional, intent(in)    :: executor
      !! Type of External FFT Executor
    type(dtfft_r2r_kind_t), optional, intent(in)    :: kinds(:)
      !! Kinds of R2R transform
    TYPE_MPI_DATATYPE       :: base_dtype           !! MPI_Datatype for current precision
    integer(int64)          :: base_storage         !! Number of bytes needed to store single element
    TYPE_MPI_COMM           :: comm_                !! MPI Communicator

    create_private = DTFFT_SUCCESS
    CHECK_INTERNAL_CALL( self%check_create_args(dims, comm, precision, effort, executor, kinds) )

    select case ( self%precision%val )
    case ( DTFFT_SINGLE%val )
      base_storage = sngl_storage_size
      base_dtype = sngl_type
    case ( DTFFT_DOUBLE%val )
      base_storage = dbl_storage_size
      base_dtype = dbl_type
    case default
      INTERNAL_ERROR("unknown precision")
    endselect
    self%storage_size = base_storage

    allocate(self%pencils(self%ndims))
    allocate(self%comms(self%ndims))

    comm_ = MPI_COMM_WORLD; if ( present( comm ) ) comm_ = comm
#ifdef DTFFT_WITH_CUDA
    if ( self%platform == DTFFT_PLATFORM_CUDA ) then
      block
        TYPE_MPI_COMM   :: local_comm
        integer(int32)  :: n_devices, current_device, local_rank, local_size, ierr
        integer(int32), allocatable :: local_devices(:)

        call MPI_Comm_split_Type(comm_, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr)
        call MPI_Comm_size(local_comm, local_size, ierr)
        call MPI_Comm_rank(local_comm, local_rank, ierr)

        allocate( local_devices(local_size) )

        CUDA_CALL( "cudaGetDeviceCount", cudaGetDeviceCount(n_devices) )
        CUDA_CALL( "cudaGetDevice", cudaGetDevice(current_device) )

        call MPI_Allgather(current_device, 1, MPI_INTEGER4, local_devices, 1, MPI_INTEGER4, local_comm, ierr)
        call MPI_Comm_free(local_comm, ierr)
        if ( count_unique(local_devices) /= local_size ) then
          create_private = DTFFT_ERROR_GPU_NOT_SET
          return
        endif

        deallocate( local_devices )
      endblock
    endif

    if ( get_user_gpu_backend() == DTFFT_BACKEND_MPI_DATATYPE .or. self%platform == DTFFT_PLATFORM_HOST ) then
      allocate( transpose_plan_host :: self%plan )
    else
      allocate( transpose_plan_cuda :: self%plan )
      self%stream = get_user_stream()
    endif
#else
    allocate( transpose_plan_host :: self%plan )
#endif

    CHECK_INTERNAL_CALL( self%plan%create(dims, comm_, self%effort, base_dtype, base_storage, self%comm, self%comms, self%pencils) )
    self%is_z_slab = self%plan%is_z_slab
    call self%alloc_fft_plans(kinds)

    self%is_aux_alloc = .false.
#undef __FUNC__
  end function create_private

  integer(int32) function check_create_args(self, dims, comm, precision, effort, executor, kinds)
#define __FUNC__ check_create_args
  !! Check arguments provided by user and sets private variables
    class(dtfft_plan_t),                intent(inout) :: self
      !! Abstract plan
    integer(int32),                     intent(in)    :: dims(:)
      !! Global dimensions of transform
    TYPE_MPI_COMM,          optional,   intent(in)    :: comm
      !! Optional MPI Communicator
    type(dtfft_precision_t),optional,   intent(in)    :: precision
      !! Precision of transform: `DTFFT_SINGLE` or `DTFFT_DOUBLE`
    type(dtfft_effort_t),   optional,   intent(in)    :: effort
      !! How thoroughly `dtFFT` searches for the optimal plan
    type(dtfft_executor_t), optional,   intent(in)    :: executor
      !! Type of External FFT Executor
    type(dtfft_r2r_kind_t), optional,   intent(in)    :: kinds(:)
      !! Kinds of R2R transform
    integer(int32)          :: ierr             !! Error code
    integer(int32)          :: top_type         !! MPI Comm topology type
    integer(int32)          :: dim              !! Counter

    CHECK_INTERNAL_CALL( init_internal() )

    self%platform = get_user_platform()

    self%ndims = size(dims, kind=int8)
    CHECK_INPUT_PARAMETER(self%ndims, is_valid_dimension, DTFFT_ERROR_INVALID_N_DIMENSIONS)
    if ( any([(dims(dim) <= 0, dim=1,self%ndims)]) ) then
      check_create_args = DTFFT_ERROR_INVALID_DIMENSION_SIZE
      return
    endif
    allocate( self%dims, source=dims )

    if ( present(comm) ) then
      call MPI_Topo_test(comm, top_type, ierr)
      CHECK_INPUT_PARAMETER(top_type, is_valid_comm_type, DTFFT_ERROR_INVALID_COMM_TYPE)
    endif

    self%precision = DTFFT_DOUBLE
    if ( present(precision) ) then
      CHECK_INPUT_PARAMETER(precision, is_valid_precision, DTFFT_ERROR_INVALID_PRECISION)
      self%precision = precision
    endif

    self%effort = DTFFT_ESTIMATE
    if ( present(effort) ) then
      CHECK_INPUT_PARAMETER(effort, is_valid_effort, DTFFT_ERROR_INVALID_EFFORT)
      self%effort = effort
    endif

    self%is_transpose_plan = .false.
    self%executor = DTFFT_EXECUTOR_NONE
    if ( present(executor) ) then
      CHECK_INPUT_PARAMETER(executor, is_valid_executor, DTFFT_ERROR_INVALID_EXECUTOR)
#ifdef DTFFT_WITH_CUDA
      if ( self%platform == DTFFT_PLATFORM_HOST ) then
        CHECK_INPUT_PARAMETER(executor, is_host_executor, DTFFT_ERROR_INVALID_PLATFORM_EXECUTOR_TYPE)
      else if ( self%platform == DTFFT_PLATFORM_CUDA ) then
        CHECK_INPUT_PARAMETER(executor, is_cuda_executor, DTFFT_ERROR_INVALID_PLATFORM_EXECUTOR_TYPE)
      endif
#endif
      self%executor = executor
    endif
    if ( self%executor == DTFFT_EXECUTOR_NONE ) self%is_transpose_plan = .true.

    if ( present(kinds) .and. .not. self%is_transpose_plan ) then
      do dim = 1, self%ndims
        CHECK_INPUT_PARAMETER(kinds(dim), is_valid_r2r_kind, DTFFT_ERROR_INVALID_R2R_KINDS)
      enddo
    endif
#undef __FUNC__
  end function check_create_args

  subroutine alloc_fft_plans(self, kinds)
  !! Allocates [[abstract_executor]] with required FFT class
  !! and populates [[dtfft_plan_t(type):fft_mapping]] with similar FFT ids
    class(dtfft_plan_t),              intent(inout) :: self
      !! Abstract plan
    type(dtfft_r2r_kind_t), optional, intent(in)    :: kinds(:)
      !! Kinds of R2R transform
    integer(int8)                                   :: dim, dim2
      !! Counters
    type(dtfft_r2r_kind_t),           allocatable   :: kinds_(:)
      !! Dummy kinds

    if ( self%is_transpose_plan ) return

    allocate(self%fft(self%ndims))
    allocate(self%fft_mapping(self%ndims))

    do dim = 1, self%ndims
      self%fft_mapping(dim) = dim

      select case(self%executor%val)
#ifdef DTFFT_WITH_FFTW
      case (DTFFT_EXECUTOR_FFTW3%val)
        if ( dim == 1 ) then
          WRITE_DEBUG("Using FFTW3 executor")
        endif
        allocate(fftw_executor :: self%fft(dim)%fft)
#endif
#ifdef DTFFT_WITH_MKL
      case (DTFFT_EXECUTOR_MKL%val)
        if ( dim == 1 ) then
          WRITE_DEBUG("Using MKL executor")
        endif
        allocate(mkl_executor :: self%fft(dim)%fft)
#endif
#ifdef DTFFT_WITH_CUFFT
      case (DTFFT_EXECUTOR_CUFFT%val)
        if ( dim == 1 ) then
          WRITE_DEBUG("Using CUFFT executor")
        endif
        allocate(cufft_executor :: self%fft(dim)%fft)
#endif
#ifdef DTFFT_WITH_VKFFT
      case (DTFFT_EXECUTOR_VKFFT%val)
        if ( dim == 1 ) then
           WRITE_DEBUG("Using VkFFT executor")
        endif
        allocate(vkfft_executor :: self%fft(dim)%fft)
#endif
      case default
        INTERNAL_ERROR("Executor type unrecognized")
      endselect
    enddo
    if( self%is_z_slab ) return

    allocate(kinds_(self%ndims))
    kinds_(:) = dtfft_r2r_kind_t(-1); if ( present(kinds) ) kinds_(:) = kinds(:)
    ! Searching for similar FFT transforms in order to reduce time of plan creation
    ! and reduce memory usage
    ! Most profitable in GPU build
    do dim = 1_int8, self%ndims
      do dim2 = 1_int8, dim - 1_int8
        if ( dim == dim2 ) cycle
        select type ( self )
        class is ( dtfft_plan_r2c_t )
          if ( dim == 1 ) cycle
        endselect
        if ( self%pencils(dim)%counts(1) == self%pencils(dim2)%counts(1)                    &
             .and. product(self%pencils(dim)%counts) == product(self%pencils(dim2)%counts)  &
             .and. kinds_(dim) == kinds_(dim2) ) then
          self%fft_mapping(dim) = dim2
        endif
      enddo
    enddo
    deallocate(kinds_)
  end subroutine alloc_fft_plans

  subroutine check_aux(self, aux)
  !! Checks if aux buffer was passed by user and if not will allocate one internally
    class(dtfft_plan_t),            intent(inout) :: self
      !! Abstract plan
    type(c_ptr),                    intent(in)    :: aux
      !! Optional auxiliary buffer.
    integer(int64)                                :: alloc_size
      !! Number of elements to be allocated
    character(len=100)                            :: debug_msg
      !! Logging allocation size
    integer(int32) :: ierr

    if ( self%is_aux_alloc .or. .not.is_null_ptr(aux) ) return

    alloc_size = self%get_alloc_size() * self%get_element_size()
    write(debug_msg, '(a, i0, a)') "Allocating auxiliary buffer of ",alloc_size," bytes"
    WRITE_DEBUG(debug_msg)
    ! self%aux_ptr = self%mem_alloc(alloc_size, error_code=ierr); DTFFT_CHECK(ierr)
    call self%mem_alloc_ptr(alloc_size, self%aux_ptr, ierr);  DTFFT_CHECK(ierr)
    self%is_aux_alloc = .true.
  end subroutine check_aux

  subroutine create_r2r(self, dims, kinds, comm, precision, effort, executor, error_code)
  !! R2R Plan Constructor
    class(dtfft_plan_r2r_t),          intent(inout) :: self
      !! R2R Plan
    integer(int32),                   intent(in)    :: dims(:)
      !! Global dimensions of transform
    type(dtfft_r2r_kind_t), optional, intent(in)    :: kinds(:)
      !! Kinds of R2R transform
    TYPE_MPI_COMM,          optional, intent(in)    :: comm
      !! Communicator
    type(dtfft_precision_t),optional, intent(in)    :: precision
      !! Presicion of Transform
    type(dtfft_effort_t),   optional, intent(in)    :: effort
      !! How thoroughly `dtFFT` searches for the optimal plan
    type(dtfft_executor_t), optional, intent(in)    :: executor
      !! Type of External FFT Executor
    integer(int32),         optional, intent(out)   :: error_code
      !! Optional Error Code returned to user
    integer(int32)          :: ierr               !! Error code
    integer(int8)           :: fft_rank           !! Rank of FFT transform
    integer(int32)          :: dim                !! Counter
    type(dtfft_r2r_kind_t)  :: r2r_kinds(2)       !! Transposed Kinds of R2R transform

    ierr = DTFFT_SUCCESS
    if ( self%is_created ) ierr = DTFFT_ERROR_PLAN_IS_CREATED
    CHECK_ERROR_AND_RETURN

    REGION_BEGIN("dtfft_create_r2r", COLOR_CREATE)
    CHECK_OPTIONAL_CALL( self%create_private(dims, MPI_REAL, FLOAT_STORAGE_SIZE, MPI_REAL8, DOUBLE_STORAGE_SIZE, comm, precision, effort, executor, kinds) )

    if ( .not. self%is_transpose_plan ) then
      if ( .not. present( kinds ) ) ierr = DTFFT_ERROR_MISSING_R2R_KINDS
      CHECK_ERROR_AND_RETURN

      do dim = 1, self%ndims
        r2r_kinds(1) = kinds(dim)
        fft_rank = FFT_1D
        if ( self%is_z_slab .and. dim == 1 ) then
          r2r_kinds(1) = kinds(2)
          r2r_kinds(2) = kinds(1)
          fft_rank = FFT_2D
        endif
        if ( self%is_z_slab .and. dim == 2 ) cycle
        CHECK_OPTIONAL_CALL( self%fft(self%fft_mapping(dim))%fft%create(fft_rank, FFT_R2R, self%precision, real_pencil=self%pencils(dim), r2r_kinds=r2r_kinds) )
      enddo
    endif
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
    self%is_created = .true.
    REGION_END("dtfft_create_r2r")
  end subroutine create_r2r

  subroutine create_c2c(self, dims, comm, precision, effort, executor, error_code)
  !! C2C Plan Constructor
    class(dtfft_plan_c2c_t),            intent(inout) :: self
      !! C2C Plan
    integer(int32),                   intent(in)    :: dims(:)
      !! Global dimensions of transform
    TYPE_MPI_COMM,          optional, intent(in)    :: comm
      !! Communicator
    type(dtfft_precision_t),optional, intent(in)    :: precision
      !! Presicion of Transform
    type(dtfft_effort_t),   optional, intent(in)    :: effort
      !! How thoroughly `dtFFT` searches for the optimal plan
    type(dtfft_executor_t), optional, intent(in)    :: executor
      !! Type of External FFT Executor
    integer(int32),         optional, intent(out)   :: error_code
      !! Optional Error Code returned to user
    integer(int32)                    :: ierr               !! Error code

    ierr = DTFFT_SUCCESS
    if ( self%is_created ) ierr = DTFFT_ERROR_PLAN_IS_CREATED
    CHECK_ERROR_AND_RETURN

    REGION_BEGIN("create_c2c", COLOR_CREATE)
    CHECK_OPTIONAL_CALL( self%create_c2c_internal(dims, comm, precision, effort, executor) )

    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
    self%is_created = .true.
    REGION_END("create_c2c")
  end subroutine create_c2c

  integer(int32) function create_c2c_internal(self, dims, comm, precision, effort, executor)
  !! Creates plan for both C2C and R2C
#define __FUNC__ create_c2c_internal
    class(dtfft_core_c2c),            intent(inout) :: self
      !! C2C Plan
    integer(int32),                   intent(in)    :: dims(:)
      !! Global dimensions of transform
    TYPE_MPI_COMM,          optional, intent(in)    :: comm
      !! Communicator
    type(dtfft_precision_t),optional, intent(in)    :: precision
      !! Presicion of Transform
    type(dtfft_effort_t),   optional, intent(in)    :: effort
      !! How thoroughly `dtFFT` searches for the optimal plan
    type(dtfft_executor_t), optional, intent(in)    :: executor
      !! Type of External FFT Executor
    integer(int8)           :: dim                  !! Counter
    integer(int8)           :: fft_start            !! 1 for c2c, 2 for r2c
    integer(int8)           :: fft_rank             !! Rank of FFT transform

    CHECK_INTERNAL_CALL( self%create_private(dims, MPI_COMPLEX, COMPLEX_STORAGE_SIZE, MPI_DOUBLE_COMPLEX, DOUBLE_COMPLEX_STORAGE_SIZE, comm, precision, effort, executor) )

    if ( self%is_transpose_plan ) return
    select type ( self )
    class is (dtfft_plan_r2c_t)
      fft_start = 2
    class default
      fft_start = 1
    endselect
    do dim = fft_start, self%ndims
      fft_rank = FFT_1D;  if( self%is_z_slab .and. dim == 1) fft_rank = FFT_2D
      if ( self%is_z_slab .and. dim == 2_int8 ) cycle
      CHECK_INTERNAL_CALL( self%fft(self%fft_mapping(dim))%fft%create(fft_rank, FFT_C2C, self%precision, complex_pencil=self%pencils(dim)) )
    enddo
#undef __FUNC__
  end function create_c2c_internal

  subroutine create_r2c(self, dims, executor, comm, precision, effort, error_code)
  !! R2C Generic Plan Constructor
    class(dtfft_plan_r2c_t),          intent(inout) :: self
      !! C2C Plan
    integer(int32),                   intent(in)    :: dims(:)
      !! Global dimensions of transform
    type(dtfft_executor_t),           intent(in)    :: executor
    !! Type of External FFT Executor
    TYPE_MPI_COMM,          optional, intent(in)    :: comm
      !! Communicator
    type(dtfft_precision_t),optional, intent(in)    :: precision
      !! Presicion of Transform
    type(dtfft_effort_t),   optional, intent(in)    :: effort
      !! How thoroughly `dtFFT` searches for the optimal plan
    integer(int32),         optional, intent(out)   :: error_code
      !! Optional Error Code returned to user
    integer(int32)                    :: ierr               !! Error code
    integer(int32),   allocatable     :: fixed_dims(:)      !! Fixed dimensions for R2C
    integer(int8)                     :: fft_rank           !! Rank of FFT transform

    ierr = DTFFT_SUCCESS
    if ( self%is_created ) ierr = DTFFT_ERROR_PLAN_IS_CREATED
    CHECK_ERROR_AND_RETURN

    REGION_BEGIN("create_r2c", COLOR_CREATE)
    allocate(fixed_dims, source=dims)
    fixed_dims(1) = int(dims(1) / 2, int32) + 1
    CHECK_OPTIONAL_CALL( self%create_c2c_internal(fixed_dims, comm, precision, effort, executor) )
    deallocate( fixed_dims )
    if ( self%is_transpose_plan ) ierr = DTFFT_ERROR_R2C_TRANSPOSE_PLAN
    CHECK_ERROR_AND_RETURN

    call self%real_pencil%create(self%ndims, 1_int8, dims, self%comms)
    fft_rank = FFT_1D;  if ( self%is_z_slab ) fft_rank = FFT_2D
    CHECK_OPTIONAL_CALL( self%fft(1)%fft%create(fft_rank, FFT_R2C, self%precision, real_pencil=self%real_pencil, complex_pencil=self%pencils(1)) )

    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
    self%is_created = .true.
    REGION_END("create_r2c")
  end subroutine create_r2c

  subroutine mem_alloc_ptr(self, alloc_bytes, ptr, error_code)
  !! Allocates memory specific for this plan
    class(dtfft_plan_t),        intent(inout) :: self
      !! Abstract plan
    integer(int64),             intent(in)    :: alloc_bytes
      !! Number of bytes to allocate
    type(c_ptr),                intent(out)   :: ptr
      !! Allocated pointer
    integer(int32), optional,   intent(out)   :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr     !! Error code

    ierr = DTFFT_SUCCESS
    ptr = c_null_ptr
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN
    if ( alloc_bytes < FLOAT_STORAGE_SIZE ) ierr = DTFFT_ERROR_INVALID_ALLOC_BYTES
    CHECK_ERROR_AND_RETURN

    if ( self%platform == DTFFT_PLATFORM_HOST ) then
      if( self%is_transpose_plan ) then
        call mem_alloc_host(alloc_bytes, ptr)
      else
        call self%fft(1)%fft%mem_alloc(alloc_bytes, ptr)
      endif
      if ( is_null_ptr(ptr) ) ierr = DTFFT_ERROR_ALLOC_FAILED
#ifdef DTFFT_WITH_CUDA
    else
      call self%plan%mem_alloc(self%comm, alloc_bytes, ptr, ierr)
#endif
    endif
    CHECK_ERROR_AND_RETURN
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end subroutine mem_alloc_ptr

  subroutine  mem_free_ptr(self, ptr, error_code)
  !! Frees previously allocated memory specific for this plan
    class(dtfft_plan_t),        intent(inout) :: self
      !! Abstract plan
    type(c_ptr),                intent(in)    :: ptr
      !! Pointer allocated with [[dtfft_plan_t(type):mem_alloc]]
    integer(int32), optional,   intent(out)   :: error_code
      !! Optional error code returned to user
    integer(int32)  :: ierr     !! Error code

    ierr = DTFFT_SUCCESS
    if ( .not. self%is_created ) ierr = DTFFT_ERROR_PLAN_NOT_CREATED
    CHECK_ERROR_AND_RETURN

    if ( self%platform == DTFFT_PLATFORM_HOST ) then
      if( self%is_transpose_plan ) then
        call mem_free_host(ptr)
      else
        call self%fft(1)%fft%mem_free(ptr)
      endif
#ifdef DTFFT_WITH_CUDA
    else
      call self%plan%mem_free(ptr, ierr)
#endif
    endif
    CHECK_ERROR_AND_RETURN
    if ( present( error_code ) ) error_code = DTFFT_SUCCESS
  end subroutine mem_free_ptr

#define STORAGE_BYTES FLOAT_STORAGE_SIZE
#define BUFFER_TYPE real(real32)
#define ALLOC_1D mem_alloc_r32_1d
#define ALLOC_2D mem_alloc_r32_2d
#define ALLOC_3D mem_alloc_r32_3d
#define FREE_1D mem_free_r32_1d
#define FREE_2D mem_free_r32_2d
#define FREE_3D mem_free_r32_3d
#include "_dtfft_mem_alloc_free.inc"

#define STORAGE_BYTES DOUBLE_STORAGE_SIZE
#define BUFFER_TYPE real(real64)
#define ALLOC_1D mem_alloc_r64_1d
#define ALLOC_2D mem_alloc_r64_2d
#define ALLOC_3D mem_alloc_r64_3d
#define FREE_1D mem_free_r64_1d
#define FREE_2D mem_free_r64_2d
#define FREE_3D mem_free_r64_3d
#include "_dtfft_mem_alloc_free.inc"

#define STORAGE_BYTES COMPLEX_STORAGE_SIZE
#define BUFFER_TYPE complex(real32)
#define ALLOC_1D mem_alloc_c32_1d
#define ALLOC_2D mem_alloc_c32_2d
#define ALLOC_3D mem_alloc_c32_3d
#define FREE_1D mem_free_c32_1d
#define FREE_2D mem_free_c32_2d
#define FREE_3D mem_free_c32_3d
#include "_dtfft_mem_alloc_free.inc"

#define STORAGE_BYTES DOUBLE_COMPLEX_STORAGE_SIZE
#define BUFFER_TYPE complex(real64)
#define ALLOC_1D mem_alloc_c64_1d
#define ALLOC_2D mem_alloc_c64_2d
#define ALLOC_3D mem_alloc_c64_3d
#define FREE_1D mem_free_c64_1d
#define FREE_2D mem_free_c64_2d
#define FREE_3D mem_free_c64_3d
#include "_dtfft_mem_alloc_free.inc"
end module dtfft_plan