!------------------------------------------------------------------------------------------------
! Copyright (c) 2021 - 2025, Oleg Shatrov
! All rights reserved.
! This file is part of dtFFT library.

! dtFFT is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.

! dtFFT is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU General Public License for more details.

! You should have received a copy of the GNU General Public License
! along with this program. If not, see <https://www.gnu.org/licenses/>.
!------------------------------------------------------------------------------------------------
#include "dtfft_config.h"
module dtfft_abstract_backend
!! This module describes Abstraction for all Backends: [[abstract_backend]]
use iso_c_binding
use iso_fortran_env
use dtfft_abstract_kernel, only: abstract_kernel
use dtfft_errors
#ifdef DTFFT_WITH_CUDA
use dtfft_interface_cuda_runtime
#endif
#ifdef DTFFT_WITH_NCCL
use dtfft_interface_nccl
#endif
use dtfft_parameters
use dtfft_pencil, only: pencil
use dtfft_utils
#include "_dtfft_mpi.h"
#include "_dtfft_cuda.h"
#include "_dtfft_private.h"
#include "_dtfft_profile.h"
implicit none
private
public :: abstract_backend, backend_helper

#ifdef NCCL_HAVE_COMMREGISTER
  integer(int32), parameter, public :: NCCL_REGISTER_PREALLOC_SIZE = 8
    !! Number of register elements to preallocate
#endif

  type :: backend_helper
  !! Helper with nccl, mpi and nvshmem communicators
    logical :: is_nccl_created = .false. !! Flag is `nccl_comm` has been created
#ifdef DTFFT_WITH_NCCL
    type(ncclComm) :: nccl_comm !! NCCL communicator
#endif
#ifdef NCCL_HAVE_COMMREGISTER
    logical :: should_register !! If NCCL buffer should be registered
    type(c_ptr), allocatable :: nccl_register(:, :) !! NCCL register cache
    ! Default-initialized so `destroy_helper` (called from `create_helper`
    ! before any assignment) never reads an undefined value in DEBUG builds.
    integer(int32) :: nccl_register_size = 0 !! Number of elements in `nccl_register`
#endif
    TYPE_MPI_COMM, allocatable :: comms(:) !! MPI communicators
    integer(int32), allocatable :: comm_mappings(:, :) !! Mapping of 1d comm ranks to global comm
    type(dtfft_transpose_t) :: transpose_type !! Type of transpose to create
    type(dtfft_reshape_t) :: reshape_type !! Type of reshape to create
    ! `=> null()` guarantees a defined association status before `create_helper` runs.
    type(pencil), pointer :: pencils(:) => null() !! Pencils
  contains
    procedure, pass(self) :: create => create_helper !! Creates helper
    procedure, pass(self) :: destroy => destroy_helper !! Destroys helper
  end type backend_helper

  type, abstract :: abstract_backend
  !! The most Abstract Backend
    type(dtfft_backend_t) :: backend !! Backend type
    type(dtfft_platform_t) :: platform !! Platform to use
    logical :: is_selfcopy !! If backend is self-copying
    logical :: is_pipelined !! If backend is pipelined
    logical :: is_fused !! If backend is fused
    logical :: is_even !! If all processes send/recv same amount of memory
    integer(int64) :: aux_bytes !! Number of bytes required by aux buffer
    integer(int64) :: buffer_size !! Buffer size in float elements: max of total send and total recv counts
    TYPE_MPI_COMM :: comm !! MPI Communicator
    integer(int32), allocatable :: comm_mapping(:) !! Mapping of 1d comm ranks to global comm
    integer(int32) :: comm_size !! Size of MPI Comm
    integer(int32) :: comm_rank !! Rank in MPI Comm
    integer(int64), allocatable :: send_displs(:) !! Send data displacements, in float elements
    integer(int64), allocatable :: send_floats(:) !! Send data elements, in float elements
    integer(int64), allocatable :: recv_displs(:) !! Recv data displacements, in float elements
    integer(int64), allocatable :: recv_floats(:) !! Recv data elements, in float elements
    ! Self copy params
#ifdef DTFFT_WITH_CUDA
    type(cudaEvent) :: execution_event !! Event for main execution stream
    type(cudaEvent) :: copy_event !! Event for copy stream
#endif
    type(dtfft_stream_t) :: copy_stream !! Stream for copy operations
    integer(int64) :: self_copy_bytes !! Number of bytes to copy it itself
    integer(int64) :: self_send_displ !! Displacement for send buffer
    integer(int64) :: self_recv_displ !! Displacement for recv buffer
    ! Both kernel pointers are default-initialized to null() so the
    ! `associated()` checks in `execute` are valid even when the setters
    ! were never called.
    class(abstract_kernel), pointer :: pack_kernel => null() !! Kernel for packing data
    class(abstract_kernel), pointer :: unpack_kernel => null() !! Kernel for unpacking data
  contains
    procedure, non_overridable, pass(self) :: create !! Creates Abstract Backend
    procedure, non_overridable, pass(self) :: execute !! Executes Backend
    procedure, non_overridable, pass(self) :: destroy !! Destroys Abstract Backend
    procedure, non_overridable, pass(self) :: get_aux_bytes !! Returns number of bytes required by aux buffer
    procedure, non_overridable, pass(self) :: set_pack_kernel !! Sets pack kernel for fused backend
    procedure, non_overridable, pass(self) :: set_unpack_kernel !! Sets unpack kernel for pipelined backend
    procedure, non_overridable, pass(self) :: wait
    procedure, pass(self) :: execute_end !! Ends execution of Backend
    procedure, pass(self) :: execute_self_copy
    procedure, pass(self) :: get_async_active !! Returns if async execution is active
    procedure(create_interface), deferred, pass(self) :: create_private !! Creates overriding class
    procedure(execute_interface), deferred, pass(self) :: execute_private !! Executes Backend
    procedure(destroy_interface), deferred, pass(self) :: destroy_private !! Destroys overriding class
  end type abstract_backend

  abstract interface
    subroutine create_interface(self, helper, base_storage)
    !! Creates overriding class
    import
      class(abstract_backend), intent(inout) :: self !! Abstract Backend
      type(backend_helper), intent(in) :: helper !! Backend helper
      integer(int64), intent(in) :: base_storage !! Number of bytes to store single element
    end subroutine create_interface

    subroutine execute_interface(self, in, out, stream, aux, error_code)
    !! Executes Backend
    import
      class(abstract_backend), intent(inout) :: self !! Abstract Backend
      real(real32), target, contiguous, intent(inout) :: in(:) !! Send pointer
      real(real32), target, contiguous, intent(inout) :: out(:) !! Recv pointer
      type(dtfft_stream_t), intent(in) :: stream !! Main execution CUDA stream
      real(real32), target, contiguous, intent(inout) :: aux(:) !! Aux pointer
      integer(int32), intent(out) :: error_code !! Error code
    end subroutine execute_interface

    subroutine destroy_interface(self)
    !! Destroys overriding class
    import
      class(abstract_backend), intent(inout) :: self !! Abstract Backend
    end subroutine destroy_interface
  end interface

contains

  subroutine create(self, backend, helper, platform, comm_id, send_displs, send_counts, recv_displs, recv_counts, base_storage)
  !! Creates Abstract Backend
    class(abstract_backend), intent(inout) :: self !! Abstract Backend
    type(dtfft_backend_t), intent(in) :: backend !! Backend type
    type(backend_helper), intent(in) :: helper !! Backend helper
    type(dtfft_platform_t), intent(in) :: platform !! Platform to use
    integer(int8), intent(in) :: comm_id !! Id of communicator to use
    integer(int32), intent(in) :: send_displs(:) !! Send data displacements, in original elements
    integer(int32), intent(in) :: send_counts(:) !! Send data elements, in float elements
    integer(int32), intent(in) :: recv_displs(:) !! Recv data displacements, in float elements
    integer(int32), intent(in) :: recv_counts(:) !! Recv data elements, in float elements
    integer(int64), intent(in) :: base_storage !! Number of bytes to store single element
    integer(int64) :: send_size !! Total number of floats to send
    integer(int64) :: recv_size !! Total number of floats to recv
    integer(int32) :: ierr !! MPI Error code
    integer(int64) :: scaler !! Scaling data amount to float size

    scaler = base_storage / FLOAT_STORAGE_SIZE
    self%platform = platform
    self%comm = helper%comms(comm_id)
    call MPI_Comm_size(self%comm, self%comm_size, ierr)
    call MPI_Comm_rank(self%comm, self%comm_rank, ierr)
    if (allocated(helper%comm_mappings)) then
      allocate (self%comm_mapping(0:self%comm_size - 1), source=helper%comm_mappings(0:self%comm_size - 1, comm_id))
    end if

    ! Displacements are converted to float units and shifted to 1-based
    ! indexing, matching the real(real32) buffer views used in `execute`.
    allocate (self%send_displs(0:self%comm_size - 1))
    allocate (self%send_floats(0:self%comm_size - 1))
    self%send_displs(:) = int(send_displs(:), int64) * scaler
    self%send_displs(:) = self%send_displs(:) + 1
    self%send_floats(:) = int(send_counts(:), int64) * scaler

    allocate (self%recv_displs(0:self%comm_size - 1))
    allocate (self%recv_floats(0:self%comm_size - 1))
    self%recv_displs(:) = int(recv_displs(:), int64) * scaler
    self%recv_displs(:) = self%recv_displs(:) + 1
    self%recv_floats(:) = int(recv_counts(:), int64) * scaler

    self%buffer_size = max(sum(self%send_floats), sum(self%recv_floats))
    self%backend = backend
    self%is_fused = is_backend_fused(backend) .or. is_backend_compressed(backend)
    self%is_pipelined = is_backend_pipelined(backend) .or. self%is_fused
    self%is_selfcopy = (self%is_pipelined .or. is_backend_mpi(backend))               &
                       .and. backend /= DTFFT_BACKEND_CUFFTMP_PIPELINED              &
                       .and. backend /= DTFFT_BACKEND_NCCL_COMPRESSED                &
                       .and. .not. self%is_fused
    self%is_even = all(send_counts(1) == send_counts(:)) .and. all(recv_counts(1) == recv_counts(:))
    ! Host MPI_Alltoall with even counts handles the local chunk itself.
    if (self%is_selfcopy .and. self%is_even .and. backend == DTFFT_BACKEND_MPI_A2A   &
        .and. platform == DTFFT_PLATFORM_HOST) then
      self%is_selfcopy = .false.
    endif

    self%aux_bytes = 0_int64
    if (self%is_pipelined) then
      send_size = sum(send_counts) * scaler
      recv_size = sum(recv_counts) * scaler
      self%aux_bytes = max(send_size, recv_size) * FLOAT_STORAGE_SIZE
    end if
    ! Fused backends (except NCCL compressed) need two aux areas: pack and unpack.
    if ( self%is_fused .and. backend /= DTFFT_BACKEND_NCCL_COMPRESSED) self%aux_bytes = self%aux_bytes * 2

    self%self_copy_bytes = 0_int64
    if (self%is_selfcopy .or. backend == DTFFT_BACKEND_NCCL_COMPRESSED .or. self%is_fused) then
      ! Exclude the rank-local chunk from the communication counts; it is
      ! handled separately by `execute_self_copy`.
      self%self_send_displ = self%send_displs(self%comm_rank)
      self%self_recv_displ = self%recv_displs(self%comm_rank)
      self%self_copy_bytes = self%send_floats(self%comm_rank) * FLOAT_STORAGE_SIZE
      self%send_floats(self%comm_rank) = 0
      self%recv_floats(self%comm_rank) = 0
#ifdef DTFFT_WITH_CUDA
      if (platform == DTFFT_PLATFORM_CUDA) then
        CUDA_CALL( cudaEventCreateWithFlags(self%execution_event, cudaEventDisableTiming) )
        CUDA_CALL( cudaEventCreateWithFlags(self%copy_event, cudaEventDisableTiming) )
        CUDA_CALL( cudaStreamCreate(self%copy_stream) )
      end if
#endif
    end if
    call self%create_private(helper, base_storage)
  end subroutine create

  subroutine execute(self, in, out, stream, aux, exec_type, error_code)
  !! Executes Backend
    class(abstract_backend), intent(inout) :: self !! Self-copying backend
    type(c_ptr), intent(in) :: in !! Send pointer
    type(c_ptr), intent(in) :: out !! Recv pointer
    type(dtfft_stream_t), intent(in) :: stream !! CUDA stream
    type(c_ptr), intent(in) :: aux !! Aux pointer
    type(async_exec_t), intent(in) :: exec_type !! Type of async execution
    integer(int32), intent(out) :: error_code !! Error code
    integer(int32) :: dummy
    real(real32), pointer, contiguous :: pin(:), pout(:), paux(:)
    integer(int64) :: aux_size

#if defined(DTFFT_DEBUG)
    if ( self%is_fused .and. ( .not.associated(self%pack_kernel) .or. .not.associated(self%unpack_kernel)) ) then
      INTERNAL_ERROR("abstract_backend: .not.associated kernel")
    endif
    if ( self%is_pipelined .and. is_null_ptr(aux) ) then
      INTERNAL_ERROR("abstract_backend: self%is_pipelined .and. is_null_ptr(aux)")
    endif
#endif
    call c_f_pointer(in, pin, [self%buffer_size])
    call c_f_pointer(out, pout, [self%buffer_size])
    if ( is_null_ptr(aux) ) then
      ! No aux buffer supplied: alias the input so `execute_private`
      ! always receives a valid array argument.
      paux => pin
    else
      aux_size = self%buffer_size; if( self%is_fused ) aux_size = aux_size * 2
      call c_f_pointer(aux, paux, [aux_size])
    endif

    if (.not. self%is_selfcopy) then
      call self%execute_private(pin, pout, stream, paux, error_code)
      if ( error_code /= DTFFT_SUCCESS ) return
      if ( exec_type == EXEC_BLOCKING .or. self%platform == DTFFT_PLATFORM_CUDA) then
        call self%execute_end(error_code)
      endif
#if defined(DTFFT_DEBUG) && defined(DTFFT_WITH_CUDA)
      if (self%platform == DTFFT_PLATFORM_CUDA) then
        CUDA_CALL( cudaStreamSynchronize(stream) )
      end if
#endif
      return
    end if

#ifdef DTFFT_WITH_CUDA
    if ( self%self_copy_bytes > 0 .and. self%platform == DTFFT_PLATFORM_CUDA ) then
      if ( self%is_pipelined ) then
        call self%execute_self_copy(pin, paux, stream)
        call self%unpack_kernel%execute(aux, out, self%copy_stream, self%comm_rank + 1)
      else
        call self%execute_self_copy(pin, pout, stream)
      endif
    endif
#endif
    call self%execute_private(pin, pout, stream, paux, error_code)
    if ( error_code /= DTFFT_SUCCESS ) return
    if ( exec_type == EXEC_BLOCKING .or. self%platform == DTFFT_PLATFORM_CUDA) then
      ! Do not want to return error in case execution is not active, like in mpi scheduled
      call self%execute_end(dummy)
    endif
    call self%wait(stream)
#if defined(DTFFT_DEBUG) && defined(DTFFT_WITH_CUDA)
    if ( self%platform == DTFFT_PLATFORM_CUDA ) then
      CUDA_CALL( cudaStreamSynchronize(stream) )
    end if
#endif
  end subroutine execute

  subroutine execute_end(self, error_code)
  !! Ends execution of Backend
    class(abstract_backend), intent(inout) :: self !! Abstract backend
    integer(int32), intent(out) :: error_code !! Error code
    error_code = DTFFT_SUCCESS
    if (self%platform == DTFFT_PLATFORM_HOST) return
  end subroutine execute_end

  subroutine execute_self_copy(self, in, out, stream)
  !! Executes self copy: moves the rank-local chunk from `in` to `out`
  !! without going through the communication backend.
    class(abstract_backend), intent(inout) :: self !! Abstract backend
    real(real32), intent(in) :: in(BUFFER_SPEC) !! Send pointer
    real(real32), intent(inout) :: out(BUFFER_SPEC) !! Recv pointer
    type(dtfft_stream_t), intent(in) :: stream !! CUDA stream
    integer(int32) :: float_count, i

    if (self%self_copy_bytes == 0) return
    REGION_BEGIN("self_copy", COLOR_TOMATO)
    if (self%platform == DTFFT_PLATFORM_HOST) then
      float_count = int(self%self_copy_bytes / FLOAT_STORAGE_SIZE, int32)
!$omp parallel do simd
      do i = 0, float_count - 1
        out(self%self_recv_displ + i) = in(self%self_send_displ + i)
      enddo
!$omp end parallel do simd
#ifdef DTFFT_WITH_CUDA
    else
      CUDA_CALL( cudaEventRecord(self%execution_event, stream) )
      ! Waiting for transpose kernel to finish execution on stream `stream`
      CUDA_CALL( cudaStreamWaitEvent(self%copy_stream, self%execution_event, 0) )
      CUDA_CALL( cudaMemcpyAsync(out( self%self_recv_displ ), in( self%self_send_displ ),     &
                                 self%self_copy_bytes, cudaMemcpyDeviceToDevice, self%copy_stream) )
#endif
    endif
    REGION_END("self_copy")
  end subroutine execute_self_copy

  subroutine wait(self, stream)
  !! Makes `stream` wait until work queued on the internal copy stream completes.
    class(abstract_backend), intent(inout) :: self !! Abstract backend
    type(dtfft_stream_t), intent(in) :: stream !! CUDA stream
    if ( self%platform == DTFFT_PLATFORM_HOST ) return
#ifdef DTFFT_WITH_CUDA
    ! Making future events, like FFT, on `stream` to wait for `copy_event`
    CUDA_CALL( cudaEventRecord(self%copy_event, self%copy_stream) )
    CUDA_CALL( cudaStreamWaitEvent(stream, self%copy_event, 0) )
#endif
  end subroutine wait

  elemental logical function get_async_active(self)
  !! Returns if async execution is active.
  !! Base implementation is always `.false.`; overriding backends may report otherwise.
    class(abstract_backend), intent(in) :: self !! Abstract backend
    get_async_active = .false.
  end function get_async_active

  subroutine destroy(self)
  !! Destroys Abstract Backend
    class(abstract_backend), intent(inout) :: self !! Abstract backend

    if (allocated(self%send_displs)) deallocate (self%send_displs)
    if (allocated(self%send_floats)) deallocate (self%send_floats)
    if (allocated(self%recv_displs)) deallocate (self%recv_displs)
    if (allocated(self%recv_floats)) deallocate (self%recv_floats)
    if (allocated(self%comm_mapping)) deallocate (self%comm_mapping)
    self%comm = MPI_COMM_NULL
#ifdef DTFFT_WITH_CUDA
    ! Mirrors the allocation condition in `create`: events and the copy stream
    ! are also created for fused backends, so they must be released here too
    ! (the previous check missed fused backends other than NCCL compressed,
    ! leaking both events and the stream).
    if ((self%is_selfcopy .or. self%backend == DTFFT_BACKEND_NCCL_COMPRESSED .or. self%is_fused)   &
        .and. self%platform == DTFFT_PLATFORM_CUDA) then
      CUDA_CALL( cudaEventDestroy(self%execution_event) )
      CUDA_CALL( cudaEventDestroy(self%copy_event) )
      CUDA_CALL( cudaStreamDestroy(self%copy_stream) )
    end if
#endif
    ! Drop both kernel associations; previously `pack_kernel` was left dangling.
    nullify (self%pack_kernel)
    nullify (self%unpack_kernel)
    self%is_pipelined = .false.
    self%is_fused = .false.
    self%is_selfcopy = .false.
    call self%destroy_private()
  end subroutine destroy

  pure integer(int64) function get_aux_bytes(self)
  !! Returns number of bytes required by aux buffer
    class(abstract_backend), intent(in) :: self !! Abstract backend
    get_aux_bytes = self%aux_bytes
  end function get_aux_bytes

  subroutine set_unpack_kernel(self, unpack_kernel)
  !! Sets unpack kernel for pipelined backend
    class(abstract_backend), intent(inout) :: self !! Pipelined backend
    class(abstract_kernel), target, intent(in) :: unpack_kernel !! Kernel for unpacking data
    self%unpack_kernel => unpack_kernel
  end subroutine set_unpack_kernel

  subroutine set_pack_kernel(self, pack_kernel)
  !! Sets pack kernel for fused backend
    class(abstract_backend), intent(inout) :: self !! Pipelined backend
    class(abstract_kernel), target, intent(in) :: pack_kernel !! Kernel for packing data
    self%pack_kernel => pack_kernel
  end subroutine set_pack_kernel

  subroutine create_helper(self, platform, base_comm, comms, is_nccl_needed, pencils)
  !! Creates helper
    class(backend_helper), intent(inout) :: self !! Backend helper
    type(dtfft_platform_t), intent(in) :: platform !! Platform to use
    TYPE_MPI_COMM, intent(in) :: base_comm !! MPI communicator
    TYPE_MPI_COMM, intent(in) :: comms(:) !! 1D Communicators
    logical, intent(in) :: is_nccl_needed !! If nccl communicator will be needed
    type(pencil), target, intent(in) :: pencils(:) !! Pencils
    integer(int32) :: i, n_comms

    call self%destroy()

    self%pencils => pencils(:)
    n_comms = size(comms)
    allocate (self%comms(n_comms))
    ! Slot 1 holds the global communicator; the rest are the 1D communicators.
    self%comms(1) = base_comm
    do i = 2, n_comms
      self%comms(i) = comms(i)
    end do
    self%is_nccl_created = .false.
    if (.not. is_nccl_needed) return
    if (platform == DTFFT_PLATFORM_HOST) return
#ifdef DTFFT_WITH_NCCL
    block
      type(ncclUniqueId) :: id ! NCCL unique id
      integer(int32) :: max_size, comm_size, comm_rank, ierr

      max_size = -1
      do i = 1, n_comms
        call MPI_Comm_size(self%comms(i), comm_size, ierr)
        max_size = max(max_size, comm_size)
      end do
      call MPI_Comm_rank(base_comm, comm_rank, ierr)
      ! Gather, for every communicator, which global rank sits at each local rank.
      allocate (self%comm_mappings(0:max_size - 1, n_comms), source=-1)
      do i = 1, n_comms
        call MPI_Allgather(comm_rank, 1, MPI_INTEGER, self%comm_mappings(:, i), 1, MPI_INTEGER, self%comms(i), ierr)
      end do

      if (comm_rank == 0) then
        NCCL_CALL( ncclGetUniqueId(id) )
      end if
      call MPI_Bcast(id, int(c_sizeof(id)), MPI_BYTE, 0, base_comm, ierr)
      NCCL_CALL( ncclCommInitRank(self%nccl_comm, max_size, id, comm_rank) )
      self%is_nccl_created = .true.
    end block
# ifdef NCCL_HAVE_COMMREGISTER
    self%should_register = get_env("NCCL_BUFFER_REGISTER", .true.)
    self%nccl_register_size = 0
    if (self%should_register) then
      allocate (self%nccl_register(2, NCCL_REGISTER_PREALLOC_SIZE))
      do i = 1, NCCL_REGISTER_PREALLOC_SIZE
        self%nccl_register(1, i) = c_null_ptr
        self%nccl_register(2, i) = c_null_ptr
      end do
    end if
# endif
#endif
  end subroutine create_helper

  subroutine destroy_helper(self)
  !! Destroys helper
    class(backend_helper), intent(inout) :: self !! Backend helper

    if (allocated(self%comms)) deallocate (self%comms)
    if (allocated(self%comm_mappings)) deallocate (self%comm_mappings)
    nullify (self%pencils)
#ifdef DTFFT_WITH_NCCL
    if (self%is_nccl_created) then
      NCCL_CALL( ncclCommDestroy(self%nccl_comm) )
    end if
    self%is_nccl_created = .false.
#endif
#ifdef NCCL_HAVE_COMMREGISTER
# ifdef DTFFT_DEBUG
    if (self%nccl_register_size > 0) then
      WRITE_ERROR("NCCL register is not empty")
    end if
# endif
    if (allocated(self%nccl_register)) deallocate (self%nccl_register)
    self%nccl_register_size = 0
#endif
  end subroutine destroy_helper
end module dtfft_abstract_backend