test_device_kernels.F90

This file depends on

Help
Source Code

program test_device_kernels
use iso_c_binding
use iso_fortran_env
use dtfft_abstract_kernel
use dtfft_interface_cuda_runtime
use dtfft_interface_cuda
use dtfft_interface_nvrtc
use dtfft_config
use dtfft_kernel_device
use dtfft_kernel_host
use dtfft_parameters
use dtfft_utils
#include "_dtfft_cuda.h"
#include "_dtfft_mpi.h"
#include "_dtfft_private.h"
implicit none
integer(int32) :: ierror
type(dtfft_stream_t) :: stream
 
    call MPI_Init(ierror)
    ierror = init_internal()
    ierror = load_cuda()
    ierror = load_nvrtc()

    CUDA_CALL( cudaStreamCreate(stream) )

    call execute_test([18, 155], KERNEL_PERMUTE_FORWARD)
    call execute_test([18, 155], KERNEL_PACK)
    call execute_test([18, 155], KERNEL_UNPACK)
    call execute_test([18, 155], KERNEL_PACK_FORWARD)


    call execute_test([18, 33, 155], KERNEL_PERMUTE_FORWARD)
    call execute_test([18, 33, 155], KERNEL_PERMUTE_BACKWARD)
    call execute_test([18, 33, 155], KERNEL_PERMUTE_BACKWARD_START)
    call execute_test([18, 33, 155], KERNEL_PERMUTE_BACKWARD_END)
    call execute_test([18, 33, 155], KERNEL_PACK)
    call execute_test([18, 33, 155], KERNEL_UNPACK)
    call execute_test([18, 33, 155], KERNEL_PACK_FORWARD)
    call execute_test([18, 33, 155], KERNEL_PACK_BACKWARD)

    CUDA_CALL( cudaStreamDestroy(stream) )

    call MPI_Finalize(ierror)
contains

    subroutine execute_test(dims, kernel_type)
        integer(int32),         intent(in) :: dims(:)
        type(kernel_type_t),    intent(in) :: kernel_type
        type(c_ptr) :: p1, p2, p2_host, p3, gold
        integer(int32) :: locals(5, 1)
        type(kernel_device) :: dev
        type(kernel_host) :: host
        real(real32), pointer, contiguous :: p2_host_(:), p3_(:), gold_(:)
        integer(int32) :: i
        real(real32) :: rnd
        type(string) :: kernel_string

        locals(1:size(dims), 1) = dims(:)
        if ( size(dims) == 2 ) locals(3, 1) = 1
        locals(4, 1) = 0
        locals(5, 1) = 0

        kernel_string = get_kernel_string(kernel_type)
        print*,'Testing kernel '//kernel_string%raw//', ndims = '//to_str(size(dims))

        call dev%create(dims, DTFFT_ESTIMATE, FLOAT_STORAGE_SIZE, kernel_type, locals)
        call host%create(dims, DTFFT_ESTIMATE, FLOAT_STORAGE_SIZE, kernel_type, locals)

        CUDA_CALL( cudaMalloc(p1, FLOAT_STORAGE_SIZE * product(dims)) )
        CUDA_CALL( cudaMalloc(p2, FLOAT_STORAGE_SIZE * product(dims)) )
        p2_host = mem_alloc_host(FLOAT_STORAGE_SIZE * product(dims))
        p3 = mem_alloc_host(FLOAT_STORAGE_SIZE * product(dims))
        gold = mem_alloc_host(FLOAT_STORAGE_SIZE * product(dims))

        call c_f_pointer(p3, p3_, [product(dims)])
        call c_f_pointer(gold, gold_, [product(dims)])
        do i = 1, product(dims)
            call random_number(rnd)
            p3_(i) = rnd * real(i, real32)
        enddo

        CUDA_CALL( cudaMemcpy(p1, p3, FLOAT_STORAGE_SIZE * product(dims), cudaMemcpyHostToDevice) )
        CUDA_CALL( cudaDeviceSynchronize() )

        call host%execute(p3, gold, stream, 1)
        call dev%execute(p1, p2, stream, 1)

        CUDA_CALL( cudaDeviceSynchronize() )
        CUDA_CALL( cudaMemcpy(p2_host, p2, FLOAT_STORAGE_SIZE * product(dims), cudaMemcpyDeviceToHost) )

        call c_f_pointer(p2_host, p2_host_, [product(dims)])
        do i = 1, product(dims)
            if (abs(p2_host_(i) - gold_(i)) > 1e-6) then
                print*,'i = ',i, 'gold = ',gold_(i), 'dev = ',p2_host_(i)
                INTERNAL_ERROR("Mismatch between device and host results")
            end if
        enddo
        call mem_free_host(p2_host)
        call mem_free_host(p3)
        call mem_free_host(gold)
        CUDA_CALL( cudaFree(p1) )
        CUDA_CALL( cudaFree(p2) )
        call dev%destroy()
        call host%destroy()
        print*,'SUCCESS'
    end subroutine execute_test
end program test_device_kernels
test_device_kernels.F90 Source File

Contents

Programs

Source Code

This file depends on

Source Code