dtfft_kernel_host.F90 Source File


This file depends on

sourcefile~~dtfft_kernel_host.f90~~EfferentGraph sourcefile~dtfft_kernel_host.f90 dtfft_kernel_host.F90 sourcefile~dtfft_abstract_kernel.f90 dtfft_abstract_kernel.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_config.f90 dtfft_config.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_interface_nvtx.f90 dtfft_interface_nvtx.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_parameters.f90 dtfft_parameters.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_utils.f90 dtfft_utils.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_kernel.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_abstract_kernel.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_abstract_kernel.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_kernel.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_compressor.f90 dtfft_abstract_compressor.F90 sourcefile~dtfft_abstract_kernel.f90->sourcefile~dtfft_abstract_compressor.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_abstract_compressor.f90 sourcefile~dtfft_errors.f90 dtfft_errors.F90 sourcefile~dtfft_config.f90->sourcefile~dtfft_errors.f90 sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_config.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_nvtx.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_errors.f90 sourcefile~dtfft_abstract_compressor.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_abstract_compressor.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_compressor.f90->sourcefile~dtfft_utils.f90 
sourcefile~dtfft_abstract_compressor.f90->sourcefile~dtfft_errors.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_utils.f90

Files dependent on this one

sourcefile~~dtfft_kernel_host.f90~~AfferentGraph sourcefile~dtfft_kernel_host.f90 dtfft_kernel_host.F90 sourcefile~dtfft_reshape_handle_generic.f90 dtfft_reshape_handle_generic.F90 sourcefile~dtfft_reshape_handle_generic.f90->sourcefile~dtfft_kernel_host.f90 sourcefile~test_compression.f90 test_compression.F90 sourcefile~test_compression.f90->sourcefile~dtfft_kernel_host.f90 sourcefile~test_device_kernels.f90 test_device_kernels.F90 sourcefile~test_device_kernels.f90->sourcefile~dtfft_kernel_host.f90 sourcefile~test_host_kernels.f90 test_host_kernels.F90 sourcefile~test_host_kernels.f90->sourcefile~dtfft_kernel_host.f90 sourcefile~dtfft_reshape_plan_base.f90 dtfft_reshape_plan_base.F90 sourcefile~dtfft_reshape_plan_base.f90->sourcefile~dtfft_reshape_handle_generic.f90 sourcefile~dtfft_reshape_plan.f90 dtfft_reshape_plan.F90 sourcefile~dtfft_reshape_plan.f90->sourcefile~dtfft_reshape_plan_base.f90 sourcefile~dtfft_transpose_plan.f90 dtfft_transpose_plan.F90 sourcefile~dtfft_transpose_plan.f90->sourcefile~dtfft_reshape_plan_base.f90 sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_reshape_plan.f90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan.f90 sourcefile~dtfft.f90 dtfft.F90 sourcefile~dtfft.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_api.f90 dtfft_api.F90 sourcefile~dtfft_api.f90->sourcefile~dtfft_plan.f90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021 - 2025, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
#include "dtfft_config.h"
module dtfft_kernel_host
!! This module defines `kernel_host` type and its type bound procedures.
!! The host kernel is an implementation of the `abstract_kernel` type
!! that runs on the host CPU.
use iso_c_binding
use iso_fortran_env
use dtfft_abstract_kernel
use dtfft_config
use dtfft_parameters
use dtfft_utils
#include "_dtfft_mpi.h"
#include "_dtfft_private.h"
#include "_dtfft_profile.h"
implicit none
private
public :: kernel_host

! Exporting internal kernels for testing purposes
public :: unpack_f32
public :: permute_forward_write_f32, permute_backward_write_f32
public :: permute_backward_start_write_f32
public :: permute_forward_write_f32_block_4, permute_backward_write_f32_block_4
public :: permute_backward_start_write_f32_block_4
public :: permute_forward_write_f32_block_8, permute_backward_write_f32_block_8
public :: permute_backward_start_write_f32_block_8
public :: permute_forward_write_f32_block_16, permute_backward_write_f32_block_16
public :: permute_backward_start_write_f32_block_16
public :: permute_forward_write_f32_block_32, permute_backward_write_f32_block_32
public :: permute_backward_start_write_f32_block_32
public :: permute_forward_write_f32_block_64, permute_backward_write_f32_block_64
public :: permute_backward_start_write_f32_block_64
public :: permute_forward_read_f32, permute_backward_read_f32
public :: permute_backward_start_read_f32
public :: permute_forward_read_f32_block_4, permute_backward_read_f32_block_4
public :: permute_backward_start_read_f32_block_4
public :: permute_forward_read_f32_block_8, permute_backward_read_f32_block_8
public :: permute_backward_start_read_f32_block_8
public :: permute_forward_read_f32_block_16, permute_backward_read_f32_block_16
public :: permute_backward_start_read_f32_block_16
public :: permute_forward_read_f32_block_32, permute_backward_read_f32_block_32
public :: permute_backward_start_read_f32_block_32
public :: permute_forward_read_f32_block_64, permute_backward_read_f32_block_64
public :: permute_backward_start_read_f32_block_64
public :: permute_backward_end_write_f32, permute_backward_end_read_f32
public :: permute_backward_end_write_f32_block_4
public :: permute_backward_end_write_f32_block_8
public :: permute_backward_end_write_f32_block_16
public :: permute_backward_end_write_f32_block_32
public :: permute_backward_end_write_f32_block_64
public :: permute_backward_end_read_f32_block_4
public :: permute_backward_end_read_f32_block_8
public :: permute_backward_end_read_f32_block_16
public :: permute_backward_end_read_f32_block_32
public :: permute_backward_end_read_f32_block_64
public :: unpack_f32_block_4
public :: unpack_f32_block_8
public :: unpack_f32_block_16
public :: unpack_f32_block_32
public :: unpack_f32_block_64

public :: pack_forward_write_f32, pack_backward_write_f32
public :: pack_forward_write_f32_block_4, pack_backward_write_f32_block_4
public :: pack_forward_write_f32_block_8, pack_backward_write_f32_block_8
public :: pack_forward_write_f32_block_16, pack_backward_write_f32_block_16
public :: pack_forward_write_f32_block_32, pack_backward_write_f32_block_32
public :: pack_forward_write_f32_block_64, pack_backward_write_f32_block_64
public :: pack_forward_read_f32, pack_backward_read_f32
public :: pack_forward_read_f32_block_4, pack_backward_read_f32_block_4
public :: pack_forward_read_f32_block_8, pack_backward_read_f32_block_8
public :: pack_forward_read_f32_block_16, pack_backward_read_f32_block_16
public :: pack_forward_read_f32_block_32, pack_backward_read_f32_block_32
public :: pack_forward_read_f32_block_64, pack_backward_read_f32_block_64

public :: unpack_forward_write_f32, unpack_forward_read_f32
public :: unpack_forward_write_f32_block_4, unpack_forward_read_f32_block_4
public :: unpack_forward_write_f32_block_8, unpack_forward_read_f32_block_8
public :: unpack_forward_write_f32_block_16, unpack_forward_read_f32_block_16
public :: unpack_forward_write_f32_block_32, unpack_forward_read_f32_block_32
public :: unpack_forward_write_f32_block_64, unpack_forward_read_f32_block_64

public :: unpack_backward_write_f32, unpack_backward_read_f32
public :: unpack_backward_write_f32_block_4, unpack_backward_read_f32_block_4
public :: unpack_backward_write_f32_block_8, unpack_backward_read_f32_block_8
public :: unpack_backward_write_f32_block_16, unpack_backward_read_f32_block_16
public :: unpack_backward_write_f32_block_32, unpack_backward_read_f32_block_32
public :: unpack_backward_write_f32_block_64, unpack_backward_read_f32_block_64

public :: pack_f32
public :: pack_f32_block_4, pack_f32_block_8
public :: pack_f32_block_16
public :: pack_f32_block_32, pack_f32_block_64

#ifdef DTFFT_WITH_MOCK_ENABLED
public :: permute_forward_write_f64, permute_forward_write_f128
public :: permute_backward_end_pipelined_write_f32, permute_backward_write_f64, permute_backward_write_f128
public :: permute_backward_start_write_f64, permute_backward_start_write_f128
public :: permute_backward_end_pipelined_write_f64, permute_backward_end_pipelined_write_f128
public :: permute_backward_end_write_f64, permute_backward_end_write_f128
public :: unpack_pipelined_f32, unpack_pipelined_f64, unpack_pipelined_f128
public :: pack_pipelined_f32, pack_pipelined_f64, pack_pipelined_f128
public :: pack_forward_write_f64, pack_forward_write_f128
public :: pack_backward_write_f64, pack_backward_write_f128
public :: copy_f32, copy_f64, copy_f128
#endif

    type :: host_kernel_t
    !! Identifier of a host kernel variant.
    !! Valid identifiers are the `HOST_KERNEL_*` parameters declared in this module.
        integer(int8) :: val    !! Numeric id of the variant
    end type host_kernel_t

    type(host_kernel_t), parameter, public :: HOST_KERNEL_UNSET = host_kernel_t(-1_int8)
    !! Placeholder id; its value lies outside the range of the selectable kernels (1..6)

    type(host_kernel_t), parameter, public :: HOST_KERNEL_BASE = host_kernel_t(1_int8)
    !! Base host kernel type
    type(host_kernel_t), parameter, public :: HOST_KERNEL_BLOCK_4 = host_kernel_t(2_int8)
    !! Host kernel with block size of 4
    type(host_kernel_t), parameter, public :: HOST_KERNEL_BLOCK_8 = host_kernel_t(3_int8)
    !! Host kernel with block size of 8
    type(host_kernel_t), parameter, public :: HOST_KERNEL_BLOCK_16 = host_kernel_t(4_int8)
    !! Host kernel with block size of 16
    type(host_kernel_t), parameter, public :: HOST_KERNEL_BLOCK_32 = host_kernel_t(5_int8)
    !! Host kernel with block size of 32
    type(host_kernel_t), parameter, public :: HOST_KERNEL_BLOCK_64 = host_kernel_t(6_int8)
    !! Host kernel with block size of 64

    interface operator(==)
    !! Equality comparison for `host_kernel_t` values, implemented by `host_kernel_eq`
        module procedure host_kernel_eq
    end interface

    type, extends(abstract_kernel) :: kernel_host
    !! Host kernel implementation
    private
        type(dtfft_access_mode_t) :: access_mode
        !! Access mode for kernel execution
        procedure(execute_host_interface), pointer :: execute_impl => null()
        !! Pointer to the execute implementation
    contains
        procedure :: create_private => create_host    !! Creates kernel
        procedure :: execute_private => execute_host  !! Executes kernel
        procedure :: destroy_private => destroy_host  !! Destroys kernel
        procedure :: execute_benchmark                !! Times the currently selected execute implementation
        procedure :: select_access_mode_f32
        !! Access mode selection used by `create_host` when `base_storage` is `FLOAT_STORAGE_SIZE`.
        !! The body is not part of this file; presumably it comes from the included routines file.
        procedure :: select_access_mode_f64
        !! Same as above for `DOUBLE_STORAGE_SIZE`
        procedure :: select_access_mode_f128
        !! Same as above for `DOUBLE_COMPLEX_STORAGE_SIZE`
    end type kernel_host

    abstract interface
        subroutine execute_host_interface(self, in, out, neighbor)
        !! Executes the given kernel on host.
        !! This is the common signature of every implementation that `select_kernel` can return
        !! and that is stored in `kernel_host%execute_impl`.
            import
            class(kernel_host),         intent(in)      :: self     !! Host kernel class
            type(c_ptr),                intent(in)      :: in       !! Source host-allocated buffer
            type(c_ptr),                intent(in)      :: out      !! Target host-allocated buffer
            integer(int32), optional,   intent(in)      :: neighbor !! Source rank for pipelined unpacking
        end subroutine execute_host_interface
    end interface

contains

    subroutine create_host(self, effort, base_storage, force_effort)
    !! Creates host kernel
    !!
    !! The base implementation is selected immediately when benchmarking is not requested
    !! or when the kernel is one of the copy kernels. Otherwise every host kernel variant
    !! (base and blocked with block sizes 4, 8, 16, 32 and 64) is timed on temporary
    !! host buffers and the fastest one is kept.
        class(kernel_host),     intent(inout)   :: self         !! Host kernel class
        type(dtfft_effort_t),   intent(in)      :: effort       !! Effort level for generating transpose kernels
        integer(int64),         intent(in)      :: base_storage !! Number of bytes needed to store single element
        logical,    optional,   intent(in)      :: force_effort !! Should effort be forced or not
        logical                         :: force_effort_    !! Local copy of force_effort
        integer(int32)                  :: n_iters          !! Number of iterations to perform when testing kernel
        integer(int32)                  :: n_warmup_iters   !! Number of warmup iterations to perform before testing kernel
        real(real64)                    :: best_time        !! Best execution time seen so far, [ms]
        real(real64)                    :: execution_time   !! Execution time of the current candidate, [ms]
        integer(int8)                   :: test_id          !! Current test configuration id
        type(host_kernel_t)             :: current_kernel   !! Current test configuration
        type(host_kernel_t)             :: best_kernel      !! Best kernel type
        character(len=:),   allocatable :: global_phase     !! Global phase name for profiling
        character(len=:),   allocatable :: local_phase      !! Local phase name for profiling
        real(real32)                    :: bandwidth        !! Bandwidth for kernel execution
        integer(int32)                  :: ndims            !! Number of dimensions
        integer(int32),     allocatable :: fixed_dims(:)    !! Fixed dimensions for bandwidth calculation
        integer(int64)                  :: buffer_bytes     !! Size in bytes of each temporary benchmark buffer
        integer(int64)                  :: payload_bytes    !! Bytes covered by `fixed_dims`, used for the bandwidth report
        type(c_ptr)                     :: in               !! Temporary source buffer for benchmarking
        type(c_ptr)                     :: out              !! Temporary target buffer for benchmarking
        type(kernel_type_t)             :: temp_kernel_type !! Temporary storage for kernel type

        self%access_mode = get_conf_access_mode()

        force_effort_ = .false.
        if (present(force_effort)) force_effort_ = force_effort

        ! No benchmarking: use the base implementation right away
        if ((effort == DTFFT_ESTIMATE .and. force_effort_) &
            .or. .not. ( effort == DTFFT_EXHAUSTIVE .or. get_conf_kernel_autotune_enabled()) &
            .or. any(self%kernel_type == [KERNEL_COPY, KERNEL_COPY_PIPELINED]) ) then
            self%execute_impl => select_kernel(HOST_KERNEL_BASE, base_storage)
            return
        end if

        n_warmup_iters = get_conf_measure_warmup_iters()
        n_iters = get_conf_measure_iters()
        best_time = huge(best_time)
        ! Fall back to the base kernel if no candidate ever compares below best_time
        ! (for example when a timing is not a finite number); previously best_kernel
        ! would have been used while undefined in that case.
        best_kernel = HOST_KERNEL_BASE

        ndims = size(self%dims)
        allocate (fixed_dims(ndims))
        fixed_dims(1:ndims) = self%dims(1:ndims)
        if (is_unpack_kernel(self%kernel_type) .or. is_pack_kernel(self%kernel_type)) then
            fixed_dims(1:ndims) = self%neighbor_data(1:ndims, 1)
        end if

        ! Element counts are accumulated in 64-bit integers so that large local sizes
        ! do not overflow before being multiplied by the element size.
        buffer_bytes = base_storage * product(int(self%dims, int64))
        payload_bytes = base_storage * product(int(fixed_dims, int64))

        in = mem_alloc_host(buffer_bytes)
        out = mem_alloc_host(buffer_bytes)

        ! Benchmark the pipelined flavour of these kernels; the original type is restored below
        temp_kernel_type = self%kernel_type
        select case ( self%kernel_type%val )
        case ( KERNEL_PERMUTE_BACKWARD_END%val )
            self%kernel_type = KERNEL_PERMUTE_BACKWARD_END_PIPELINED
        case ( KERNEL_UNPACK%val )
            self%kernel_type = KERNEL_UNPACK_PIPELINED
        case ( KERNEL_PACK%val )
            self%kernel_type = KERNEL_PACK_PIPELINED
        case ( KERNEL_UNPACK_FORWARD%val )
            self%kernel_type = KERNEL_UNPACK_FORWARD_PIPELINED
        case ( KERNEL_UNPACK_BACKWARD%val )
            self%kernel_type = KERNEL_UNPACK_BACKWARD_PIPELINED
        endselect

        allocate( global_phase, source="Benchmarking kernel: '"//self%kernel_string%raw//"'" )
        PHASE_BEGIN(global_phase, COLOR_STEEL_BLUE)
        WRITE_INFO(global_phase)

        do test_id = HOST_KERNEL_BASE%val, HOST_KERNEL_BLOCK_64%val
            current_kernel = host_kernel_t(test_id)

            self%execute_impl => select_kernel(current_kernel, base_storage)

            if (current_kernel == HOST_KERNEL_BASE &
                .and. .not. (any(self%kernel_type == [KERNEL_PACK_PIPELINED, KERNEL_UNPACK_PIPELINED]))) then
                ! The base kernel is timed through the access mode selection routines
                allocate( local_phase, source="Selecting access mode" )
                REGION_BEGIN(local_phase, COLOR_ORCHID)
                WRITE_INFO("    "//local_phase)

                select case (base_storage)
                case (FLOAT_STORAGE_SIZE)
                    call self%select_access_mode_f32(in, out, n_warmup_iters, n_iters, execution_time)
                case (DOUBLE_STORAGE_SIZE)
                    call self%select_access_mode_f64(in, out, n_warmup_iters, n_iters, execution_time)
                case (DOUBLE_COMPLEX_STORAGE_SIZE)
                    call self%select_access_mode_f128(in, out, n_warmup_iters, n_iters, execution_time)
                end select
            else
                allocate( local_phase, source="Testing kernel "//get_host_kernel_string(current_kernel) )
                REGION_BEGIN(local_phase, COLOR_AUTOTUNE2)
                WRITE_INFO("    "//local_phase)

                call self%execute_benchmark(in, out, n_warmup_iters, n_iters, execution_time)
            end if

            WRITE_INFO("        Average execution time = "//to_str(execution_time)//" [ms]")
            if (execution_time > 0._real64) then
                bandwidth = 2._real32 * 1000._real32 * real(payload_bytes, real32) &
                            / real(1024 * 1024 * 1024, real32) / real(execution_time, real32)
                WRITE_INFO("        Bandwidth = "//to_str(bandwidth)//" [GB/s]")
            end if

            ! Timings are compared in their native double precision
            if (execution_time < best_time) then
                best_time = execution_time
                best_kernel = current_kernel
            end if

            REGION_END(local_phase)
            deallocate( local_phase )
        end do
        WRITE_INFO("  Selected kernel: "//get_host_kernel_string(best_kernel))

        self%kernel_type = temp_kernel_type
        self%execute_impl => select_kernel(best_kernel, base_storage)

        PHASE_END(global_phase)
        deallocate (fixed_dims)
        call mem_free_host(in)
        call mem_free_host(out)
        deallocate (global_phase)
    end subroutine create_host

    subroutine execute_benchmark(self, in, out, n_warmup_iters, n_iters, execution_time)
    !! Times the currently selected kernel implementation.
    !!
    !! Runs `n_warmup_iters` untimed calls followed by `n_iters` timed calls of
    !! `self%execute_impl`, always passing `1` as the neighbor argument.
        class(kernel_host), intent(inout)   :: self             !! Host kernel class
        type(c_ptr),        intent(in)      :: in               !! Source host-allocated buffer
        type(c_ptr),        intent(in)      :: out              !! Target host-allocated buffer
        integer(int32),     intent(in)      :: n_warmup_iters   !! Number of untimed warmup calls
        integer(int32),     intent(in)      :: n_iters          !! Number of timed calls
        real(real64),       intent(out)     :: execution_time   !! Average time of a single timed call, in milliseconds
        integer(int32) :: i_call        !! Loop counter shared by both loops
        real(real64) :: t_begin         !! Wall clock reading before the timed loop
        real(real64) :: t_finish        !! Wall clock reading after the timed loop

#ifdef DTFFT_DEBUG
        if (.not. associated(self%execute_impl)) then
            INTERNAL_ERROR("kernel_host%execute_benchmark: Kernel execute implementation is not associated!")
        end if
#endif

        REGION_BEGIN("Warmup", COLOR_VIOLET)
        do i_call = 1, n_warmup_iters
            call self%execute_impl(in, out, 1)
        end do
        REGION_END("Warmup")

        REGION_BEGIN("Measure", COLOR_DODGER_BLUE)
        t_begin = MPI_Wtime()
        do i_call = 1, n_iters
            call self%execute_impl(in, out, 1)
        end do
        t_finish = MPI_Wtime()
        ! Seconds are converted to milliseconds and averaged over the timed calls
        execution_time = 1000._real64 * (t_finish - t_begin) / real(n_iters, real64)
        REGION_END("Measure")
    end subroutine execute_benchmark

    subroutine execute_host(self, in, out, stream, sync, neighbor)
    !! Runs the host kernel by forwarding to the implementation stored in `execute_impl`.
    !! `stream` and `sync` are accepted but not used by this routine.
        class(kernel_host),         intent(inout)   :: self         !! Host kernel class
        type(c_ptr),                intent(in)      :: in           !! Source host-allocated buffer
        type(c_ptr),                intent(in)      :: out          !! Target host-allocated buffer
        type(dtfft_stream_t),       intent(in)      :: stream       !! Stream to execute on, unused here
        logical,                    intent(in)      :: sync         !! Sync stream after kernel execution, unused here
        integer(int32), optional,   intent(in)      :: neighbor     !! Source rank for pipelined unpacking

#ifdef DTFFT_DEBUG
        if (.not. associated(self%execute_impl)) then
            INTERNAL_ERROR("kernel_host%execute_host: Kernel execute implementation is not associated!")
        end if
#endif

        ! An absent `neighbor` is passed through as an absent optional argument
        call self%execute_impl(in, out, neighbor=neighbor)
    end subroutine execute_host

    subroutine destroy_host(self)
    !! Destroys host kernel by disassociating the stored execute implementation
        class(kernel_host), intent(inout) :: self !! Host kernel class

        self%execute_impl => null()
    end subroutine destroy_host

    function select_kernel(kernel, base_storage) result(fun)
    !! Picks the execute implementation matching a kernel id and an element size.
    !!
    !! The lookup is done by element size first and by kernel id second. When the
    !! combination is not known the result stays disassociated and an internal error
    !! is reported.
        type(host_kernel_t), intent(in) :: kernel           !! Kernel id
        integer(int64),      intent(in) :: base_storage     !! Size of single element in bytes
        procedure(execute_host_interface), pointer :: fun   !! Selected kernel implementation

        nullify (fun)

        select case (base_storage)
        case (FLOAT_STORAGE_SIZE)
            select case (kernel%val)
            case (HOST_KERNEL_BASE%val)
                fun => execute_f32
            case (HOST_KERNEL_BLOCK_4%val)
                fun => execute_f32_block_4
            case (HOST_KERNEL_BLOCK_8%val)
                fun => execute_f32_block_8
            case (HOST_KERNEL_BLOCK_16%val)
                fun => execute_f32_block_16
            case (HOST_KERNEL_BLOCK_32%val)
                fun => execute_f32_block_32
            case (HOST_KERNEL_BLOCK_64%val)
                fun => execute_f32_block_64
            end select
        case (DOUBLE_STORAGE_SIZE)
            select case (kernel%val)
            case (HOST_KERNEL_BASE%val)
                fun => execute_f64
            case (HOST_KERNEL_BLOCK_4%val)
                fun => execute_f64_block_4
            case (HOST_KERNEL_BLOCK_8%val)
                fun => execute_f64_block_8
            case (HOST_KERNEL_BLOCK_16%val)
                fun => execute_f64_block_16
            case (HOST_KERNEL_BLOCK_32%val)
                fun => execute_f64_block_32
            case (HOST_KERNEL_BLOCK_64%val)
                fun => execute_f64_block_64
            end select
        case (DOUBLE_COMPLEX_STORAGE_SIZE)
            select case (kernel%val)
            case (HOST_KERNEL_BASE%val)
                fun => execute_f128
            case (HOST_KERNEL_BLOCK_4%val)
                fun => execute_f128_block_4
            case (HOST_KERNEL_BLOCK_8%val)
                fun => execute_f128_block_8
            case (HOST_KERNEL_BLOCK_16%val)
                fun => execute_f128_block_16
            case (HOST_KERNEL_BLOCK_32%val)
                fun => execute_f128_block_32
            case (HOST_KERNEL_BLOCK_64%val)
                fun => execute_f128_block_64
            end select
        end select

        ! Any unknown kernel id or element size leaves the pointer disassociated
        if (.not. associated(fun)) then
            INTERNAL_ERROR("select_kernel: Unknown kernel "//to_str(kernel%val)//" "//to_str(base_storage))
        end if
    end function select_kernel

    function get_host_kernel_string(kernel) result(kernel_string)
    !! Maps a host kernel id to its printable name; ids that match none of the
    !! selectable kernels yield "UNKNOWN"
        type(host_kernel_t), intent(in) :: kernel       !! Host kernel type
        character(len=:),   allocatable :: kernel_string !! String representation of the kernel

        if (kernel == HOST_KERNEL_BASE) then
            allocate( kernel_string, source="BASE" )
        else if (kernel == HOST_KERNEL_BLOCK_4) then
            allocate( kernel_string, source="BLOCK_4" )
        else if (kernel == HOST_KERNEL_BLOCK_8) then
            allocate( kernel_string, source="BLOCK_8" )
        else if (kernel == HOST_KERNEL_BLOCK_16) then
            allocate( kernel_string, source="BLOCK_16" )
        else if (kernel == HOST_KERNEL_BLOCK_32) then
            allocate( kernel_string, source="BLOCK_32" )
        else if (kernel == HOST_KERNEL_BLOCK_64) then
            allocate( kernel_string, source="BLOCK_64" )
        else
            allocate( kernel_string, source="UNKNOWN" )
        end if
    end function get_host_kernel_string

    ! Macro invocation; presumably expands to the definition of `host_kernel_eq`,
    ! the procedure bound to the equality operator declared for `host_kernel_t` above.
    ! The macro itself is defined outside of this file.
    MAKE_EQ_FUN(host_kernel_t, host_kernel_eq)

! The same routines file is included three times, once per supported element type.
! Before each inclusion three macros are set: a name suffix, the Fortran type of the
! buffers and the matching element size constant. The included file is not visible
! here; presumably it provides the execute and access mode selection procedures
! referenced in this module and undefines the three macros again at its end.

! 16-byte elements: double precision complex
#define PREC _f128
#define BUFFER_TYPE complex(real64)
#define STORAGE_BYTES DOUBLE_COMPLEX_STORAGE_SIZE
#include "_dtfft_kernel_host_routines.inc"

! 8-byte elements: double precision real
#define PREC _f64
#define BUFFER_TYPE real(real64)
#define STORAGE_BYTES DOUBLE_STORAGE_SIZE
#include "_dtfft_kernel_host_routines.inc"

! 4-byte elements: single precision real
#define PREC _f32
#define BUFFER_TYPE real(real32)
#define STORAGE_BYTES FLOAT_STORAGE_SIZE
#include "_dtfft_kernel_host_routines.inc"
end module dtfft_kernel_host