dtfft_kernel_host.F90 Source File


This file depends on

sourcefile~~dtfft_kernel_host.f90~~EfferentGraph sourcefile~dtfft_kernel_host.f90 dtfft_kernel_host.F90 sourcefile~dtfft_abstract_kernel.f90 dtfft_abstract_kernel.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_abstract_kernel.f90 sourcefile~dtfft_config.f90 dtfft_config.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_config.f90 sourcefile~dtfft_interface_nvtx.f90 dtfft_interface_nvtx.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_parameters.f90 dtfft_parameters.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_utils.f90 dtfft_utils.F90 sourcefile~dtfft_kernel_host.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_abstract_kernel.f90->sourcefile~dtfft_interface_nvtx.f90 sourcefile~dtfft_abstract_kernel.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_abstract_kernel.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_config.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_errors.f90 dtfft_errors.F90 sourcefile~dtfft_config.f90->sourcefile~dtfft_errors.f90 sourcefile~dtfft_interface_cuda_runtime.f90 dtfft_interface_cuda_runtime.F90 sourcefile~dtfft_config.f90->sourcefile~dtfft_interface_cuda_runtime.f90 sourcefile~dtfft_interface_nvtx.f90->sourcefile~dtfft_utils.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_utils.f90->sourcefile~dtfft_errors.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_parameters.f90 sourcefile~dtfft_interface_cuda_runtime.f90->sourcefile~dtfft_utils.f90

Files dependent on this one

sourcefile~~dtfft_kernel_host.f90~~AfferentGraph sourcefile~dtfft_kernel_host.f90 dtfft_kernel_host.F90 sourcefile~dtfft_transpose_handle_generic.f90 dtfft_transpose_handle_generic.F90 sourcefile~dtfft_transpose_handle_generic.f90->sourcefile~dtfft_kernel_host.f90 sourcefile~test_host_kernels.f90 test_host_kernels.F90 sourcefile~test_host_kernels.f90->sourcefile~dtfft_kernel_host.f90 sourcefile~dtfft_transpose_plan.f90 dtfft_transpose_plan.F90 sourcefile~dtfft_transpose_plan.f90->sourcefile~dtfft_transpose_handle_generic.f90 sourcefile~dtfft_plan.f90 dtfft_plan.F90 sourcefile~dtfft_plan.f90->sourcefile~dtfft_transpose_plan.f90 sourcefile~dtfft.f90 dtfft.F90 sourcefile~dtfft.f90->sourcefile~dtfft_plan.f90 sourcefile~dtfft_api.f90 dtfft_api.F90 sourcefile~dtfft_api.f90->sourcefile~dtfft_plan.f90

Source Code

!------------------------------------------------------------------------------------------------
! Copyright (c) 2021 - 2025, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program.  If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
#include "dtfft_config.h"
module dtfft_kernel_host
!! This module defines `kernel_host` type and its type bound procedures.
!! The host kernel is an implementation of the `abstract_kernel` type
!! that runs on the host CPU.
use iso_c_binding
use iso_fortran_env
use dtfft_abstract_kernel
use dtfft_config
use dtfft_parameters
use dtfft_utils
#include "_dtfft_mpi.h"
#include "_dtfft_private.h"
#include "_dtfft_profile.h"
implicit none
private
public :: kernel_host

! Exporting internal kernels for testing purposes
public :: unpack_pipelined_f32
public :: permute_forward_write_f32, permute_backward_write_f32
public :: permute_backward_start_write_f32
public :: permute_forward_write_f32_block_16, permute_backward_write_f32_block_16
public :: permute_backward_start_write_f32_block_16
public :: permute_forward_write_f32_block_32, permute_backward_write_f32_block_32
public :: permute_backward_start_write_f32_block_32
public :: permute_forward_write_f32_block_64, permute_backward_write_f32_block_64
public :: permute_backward_start_write_f32_block_64
public :: permute_forward_read_f32, permute_backward_read_f32
public :: permute_backward_start_read_f32
public :: permute_forward_read_f32_block_16, permute_backward_read_f32_block_16
public :: permute_backward_start_read_f32_block_16
public :: permute_forward_read_f32_block_32, permute_backward_read_f32_block_32
public :: permute_backward_start_read_f32_block_32
public :: permute_forward_read_f32_block_64, permute_backward_read_f32_block_64
public :: permute_backward_start_read_f32_block_64
public :: permute_backward_end_pipelined_write_f32, permute_backward_end_pipelined_read_f32
public :: permute_backward_end_pipelined_write_f32_block_16
public :: permute_backward_end_pipelined_read_f32_block_16
public :: permute_backward_end_pipelined_write_f32_block_32
public :: permute_backward_end_pipelined_read_f32_block_32
public :: permute_backward_end_pipelined_write_f32_block_64
public :: permute_backward_end_pipelined_read_f32_block_64
public :: unpack_pipelined_f32_block_16
public :: unpack_pipelined_f32_block_32
public :: unpack_pipelined_f32_block_64

integer(int8), parameter :: ACCESS_MODE_WRITE = -1_int8
    !! Aligned writing
integer(int8), parameter :: ACCESS_MODE_READ  = +1_int8
    !! Aligned reading

integer(int8), parameter :: DEFAULT_ACCESS_MODE = ACCESS_MODE_WRITE
    !! Assuming that aligned writing is more important then aligned reading

type :: host_kernel_t
    integer(int8) :: val
end type host_kernel_t

type(host_kernel_t), parameter :: HOST_KERNEL_BASE = host_kernel_t(1_int8)
    !! Base host kernel type
type(host_kernel_t), parameter :: HOST_KERNEL_BLOCK_16 = host_kernel_t(2_int8)
    !! Host kernel with block size of 16
type(host_kernel_t), parameter :: HOST_KERNEL_BLOCK_32 = host_kernel_t(3_int8)
    !! Host kernel with block size of 32
type(host_kernel_t), parameter :: HOST_KERNEL_BLOCK_64 = host_kernel_t(4_int8)
    !! Host kernel with block size of 64

interface operator(==)
    module procedure host_kernel_eq
end interface

type, extends(abstract_kernel) :: kernel_host
  !! Host kernel implementation
    integer(int8) :: access_mode
      !! Access mode for kernel execution
    procedure(execute_host_interface), pointer :: execute_impl => null()
      !! Pointer to the execute implementation
contains
    procedure :: create_private => create_host    !! Creates kernel
    procedure :: execute_private => execute_host  !! Executes kernel
    procedure :: destroy_private => destroy_host  !! Destroys kernel
    procedure :: execute_benchmark
    procedure :: select_access_mode_f32
    procedure :: select_access_mode_f64
    procedure :: select_access_mode_f128
end type kernel_host

abstract interface
    subroutine execute_host_interface(self, in, out, neighbor)
    !! Executes the given kernel on host
        import
        class(kernel_host),         intent(in)      :: self     !! Host kernel class
        real(real32),     target,   intent(in)    :: in(:)      !! Source host-allocated buffer
        real(real32),     target,   intent(inout) :: out(:)     !! Target host-allocated buffer
        integer(int32), optional,   intent(in)      :: neighbor !! Source rank for pipelined unpacking
    end subroutine execute_host_interface
end interface

contains

subroutine create_host(self, effort, base_storage, force_effort)
!! Creates host kernel
    class(kernel_host),     intent(inout)   :: self         !! Host kernel class
    type(dtfft_effort_t),   intent(in)      :: effort       !! Effort level for generating transpose kernels
    integer(int64),         intent(in)      :: base_storage !! Number of bytes needed to store single element
    logical,    optional,   intent(in)      :: force_effort !! Should effort be forced or not
    logical                         :: force_effort_    !! Local copy of force_effort
    integer(int32)                  :: n_iters          !! Number of iterations to perform when testing kernel
    integer(int32)                  :: n_warmup_iters   !! Number of warmup iterations to perform before testing kernel
    real(real32)                    :: best_time        !! Best execution time
    real(real64)                    :: execution_time   !! Execution time
    integer(int8)                   :: test_id          !! Current test configuration id
    integer(int8)                   :: max_tests        !! Maximum number of tests to perform
    type(host_kernel_t)             :: current_kernel   !! Current test configuration
    type(host_kernel_t)             :: best_kernel      !! Best kernel type
    character(len=:),   allocatable :: global_phase     !! Global phase name for profiling
    character(len=:),   allocatable :: local_phase      !! Local phase name for profiling
    real(real32)                    :: bandwidth        !! Bandwidth for kernel execution
    integer(int32)                  :: ndims            !! Number of dimensions
    integer(int32),     allocatable :: fixed_dims(:)    !! Fixed dimensions for bandwidth calculation
    real(real32),       allocatable :: in(:), out(:)    !! Host buffers for benchmarking
    type(kernel_type_t)             :: temp_kernel_type !! Temporary storage for kernel type

    self%access_mode = DEFAULT_ACCESS_MODE

    force_effort_ = .false.; if (present(force_effort)) force_effort_ = force_effort
    if ((effort == DTFFT_ESTIMATE .and. force_effort_) .or. &
          .not. ( (effort == DTFFT_PATIENT .and. get_conf_kernel_optimization_enabled()) .or. get_conf_forced_kernel_optimization()) ) then
        self%execute_impl => select_kernel(HOST_KERNEL_BASE, base_storage)
        return
    end if

    n_warmup_iters = get_conf_measure_warmup_iters()
    n_iters = get_conf_measure_iters()
    best_time = MAX_REAL32

    ndims = size(self%dims)
    allocate (fixed_dims(ndims))
    fixed_dims(1:ndims) = self%dims(1:ndims)
    if (is_unpack_kernel(self%kernel_type)) fixed_dims(1:ndims) = self%neighbor_data(1:ndims, 1)

    allocate (in(base_storage * product(self%dims) / FLOAT_STORAGE_SIZE))
    allocate (out(base_storage * product(self%dims) / FLOAT_STORAGE_SIZE))

    temp_kernel_type = self%kernel_type
    if (self%kernel_type == KERNEL_PERMUTE_BACKWARD_END) then
        self%kernel_type = KERNEL_PERMUTE_BACKWARD_END_PIPELINED
    else if (self%kernel_type == KERNEL_UNPACK) then
        self%kernel_type = KERNEL_UNPACK_PIPELINED
    end if

    global_phase = "Benchmarking kernel: '"//self%kernel_string//"'"
    PHASE_BEGIN(global_phase, COLOR_AUTOTUNE)
    WRITE_INFO(global_phase)

    max_tests = int(get_conf_configs_to_test(), int8)

    do test_id = 1_int8, min(max_tests, 4_int8)
        current_kernel = host_kernel_t(test_id)

        self%execute_impl => select_kernel(current_kernel, base_storage)

        if (current_kernel == HOST_KERNEL_BASE .and. .not. (any(self%kernel_type == [KERNEL_UNPACK, KERNEL_UNPACK_PIPELINED]))) then
            local_phase = "Selecting access mode"
            REGION_BEGIN(local_phase, COLOR_AUTOTUNE2)
            WRITE_INFO("    "//local_phase)

            select case (base_storage)
            case (FLOAT_STORAGE_SIZE)
                call self%select_access_mode_f32(in, out, n_warmup_iters, n_iters, execution_time)
            case (DOUBLE_STORAGE_SIZE)
                call self%select_access_mode_f64(in, out, n_warmup_iters, n_iters, execution_time)
            case (DOUBLE_COMPLEX_STORAGE_SIZE)
                call self%select_access_mode_f128(in, out, n_warmup_iters, n_iters, execution_time)
            end select
        else
            local_phase = "Testing kernel "//get_host_kernel_string(current_kernel)
            REGION_BEGIN(local_phase, COLOR_AUTOTUNE2)
            WRITE_INFO("    "//local_phase)

            call self%execute_benchmark(in, out, n_warmup_iters, n_iters, execution_time)
        end if

        WRITE_INFO("        Average execution time = "//to_str(execution_time)//" [ms]")
        if (execution_time > 0._real64) then
        bandwidth = 2._real32 * 1000._real32 * real(base_storage * product(fixed_dims), real32) / real(1024 * 1024 * 1024, real32) / real(execution_time, real32)
            WRITE_INFO("        Bandwidth = "//to_str(bandwidth)//" [GB/s]")
        end if

        if (execution_time < best_time) then
            best_time = real(execution_time, real32)
            best_kernel = current_kernel
        end if

        REGION_END(local_phase)
    end do
    WRITE_INFO("  Selected kernel: "//get_host_kernel_string(best_kernel))

    self%kernel_type = temp_kernel_type
    self%execute_impl => select_kernel(best_kernel, base_storage)

    PHASE_END(global_phase)
    deallocate (fixed_dims)
    deallocate (in, out)
    deallocate (global_phase, local_phase)
end subroutine create_host

subroutine execute_benchmark(self, in, out, n_warmup_iters, n_iters, execution_time)
!! Executes benchmark for the given kernel
    class(kernel_host), intent(inout)   :: self           !! Host kernel class
    real(real32),       intent(in)      :: in(:)          !! Source host-allocated buffer
    real(real32),       intent(inout)   :: out(:)         !! Target host-allocated buffer
    integer(int32),     intent(in)      :: n_warmup_iters !! Number of warmup iterations to perform before testing kernel
    integer(int32),     intent(in)      :: n_iters        !! Number of iterations to perform when testing kernel
    real(real64),       intent(out)     :: execution_time !! Execution time of the selected access
    integer(int32) :: iter
    real(real64) :: start_time, end_time

#ifdef DTFFT_DEBUG
    if (.not. associated(self%execute_impl)) then
        INTERNAL_ERROR("kernel_host%execute_benchmark: Kernel execute implementation is not associated!")
    end if
#endif

    REGION_BEGIN("Warmup", COLOR_TRANSPOSE)
    do iter = 1, n_warmup_iters
        call self%execute_impl(in, out, 1)
    end do
    REGION_END("Warmup")

    REGION_BEGIN("Measure", COLOR_EXECUTE)
    call cpu_time(start_time)
    do iter = 1, n_iters
        call self%execute_impl(in, out, 1)
    end do
    call cpu_time(end_time)
    execution_time = 1000._real64 * (end_time - start_time) / real(n_iters, real64)
    REGION_END("Measure")
end subroutine execute_benchmark

subroutine execute_host(self, in, out, stream, neighbor)
!! Executes host kernel
    class(kernel_host),         intent(inout)   :: self       !! Host kernel class
    real(real32),   target,     intent(in)      :: in(:)      !! Source host-allocated buffer
    real(real32),   target,     intent(inout)   :: out(:)     !! Target host-allocated buffer
    type(dtfft_stream_t),       intent(in)      :: stream     !! Stream to execute on, unused here
    integer(int32), optional,   intent(in)      :: neighbor   !! Source rank for pipelined unpacking

#ifdef DTFFT_DEBUG
    if (.not. associated(self%execute_impl)) then
        INTERNAL_ERROR("kernel_host%execute_host: Kernel execute implementation is not associated!")
    end if
#endif

    call self%execute_impl(in, out, neighbor)
end subroutine execute_host

subroutine destroy_host(self)
!! Destroys host kernel
    class(kernel_host), intent(inout) :: self !! Host kernel class

    nullify (self%execute_impl)
end subroutine destroy_host

function select_kernel(kernel, base_storage) result(fun)
!! Selects the kernel implementation based on the given id and base storage size
    type(host_kernel_t), intent(in) :: kernel           !! Kernel id
    integer(int64),      intent(in) :: base_storage     !! Size of single element in bytes
    procedure(execute_host_interface), pointer :: fun   !! Selected kernel implementation

    select case (kernel%val)
    case (HOST_KERNEL_BASE%val)
        select case (base_storage)
        case (FLOAT_STORAGE_SIZE)
            fun => execute_f32
        case (DOUBLE_STORAGE_SIZE)
            fun => execute_f64
        case (DOUBLE_COMPLEX_STORAGE_SIZE)
            fun => execute_f128
        end select
    case (HOST_KERNEL_BLOCK_16%val)
        select case (base_storage)
        case (FLOAT_STORAGE_SIZE)
            fun => execute_f32_block_16
        case (DOUBLE_STORAGE_SIZE)
            fun => execute_f64_block_16
        case (DOUBLE_COMPLEX_STORAGE_SIZE)
            fun => execute_f128_block_16
        end select
    case (HOST_KERNEL_BLOCK_32%val)
        select case (base_storage)
        case (FLOAT_STORAGE_SIZE)
            fun => execute_f32_block_32
        case (DOUBLE_STORAGE_SIZE)
            fun => execute_f64_block_32
        case (DOUBLE_COMPLEX_STORAGE_SIZE)
            fun => execute_f128_block_32
        end select
    case (HOST_KERNEL_BLOCK_64%val)
        select case (base_storage)
        case (FLOAT_STORAGE_SIZE)
            fun => execute_f32_block_64
        case (DOUBLE_STORAGE_SIZE)
            fun => execute_f64_block_64
        case (DOUBLE_COMPLEX_STORAGE_SIZE)
            fun => execute_f128_block_64
        end select
    end select
end function select_kernel

function get_host_kernel_string(kernel) result(kernel_string)
!! Returns string representation of the given host kernel type
    type(host_kernel_t), intent(in) :: kernel       !! Host kernel type
    character(len=:),   allocatable :: kernel_string !! String representation of the kernel

    select case (kernel%val)
    case (HOST_KERNEL_BASE%val)
        kernel_string = "BASE"
    case (HOST_KERNEL_BLOCK_16%val)
        kernel_string = "BLOCK_16"
    case (HOST_KERNEL_BLOCK_32%val)
        kernel_string = "BLOCK_32"
    case (HOST_KERNEL_BLOCK_64%val)
        kernel_string = "BLOCK_64"
    case default
        kernel_string = "UNKNOWN"
    end select
end function get_host_kernel_string

MAKE_EQ_FUN(host_kernel_t, host_kernel_eq)

#define PREC _f128
#define BUFFER_TYPE complex(real64)
#define STORAGE_BYTES DOUBLE_COMPLEX_STORAGE_SIZE
#include "_dtfft_kernel_host_routines.inc"

#define PREC _f64
#define BUFFER_TYPE real(real64)
#define STORAGE_BYTES DOUBLE_STORAGE_SIZE
#include "_dtfft_kernel_host_routines.inc"

#define PREC _f32
#define BUFFER_TYPE real(real32)
#define STORAGE_BYTES FLOAT_STORAGE_SIZE
#include "_dtfft_kernel_host_routines.inc"
end module dtfft_kernel_host