Procedures

Procedure | Location | Procedure Type | Description
add dtfft_nvrtc_module_cache Subroutine

Adds new entry to cache

add_line dtfft_nvrtc_module Subroutine

Adds new line to CUDA code

aligned_alloc dtfft_utils Interface
alloc_and_set_aux dtfft_transpose_plan Subroutine

Allocates auxiliary memory according to the backend and sets it to the plans

alloc_fft_plans dtfft_plan Subroutine

Allocates abstract_executor with required FFT class and populates fft_mapping with similar FFT ids

alloc_mem dtfft_transpose_plan Subroutine

Allocates memory based on backend

allocate_plans dtfft_transpose_plan Subroutine

Allocates array of plans

astring_f2c dtfft_utils Subroutine

Convert Fortran string to C allocatable string

autotune_grid dtfft_transpose_plan Subroutine

Creates cartesian grid and runs various backends on it. Returns best backend and execution time

autotune_grid_decomposition dtfft_transpose_plan Subroutine

Runs through all possible grid decompositions and selects the best one based on the lowest average execution time

autotune_transpose_id dtfft_transpose_plan Function

Creates forward and backward transpose plans for backend DTFFT_BACKEND_MPI_DATATYPE based on source and target data distributions and executes them DTFFT_MEASURE_ITERS times (4 * DTFFT_MEASURE_ITERS iterations total), plus 4 * DTFFT_MEASURE_WARMUP_ITERS warmup iterations

Read more…
backend_eq dtfft_parameters Function
backend_ne dtfft_parameters Function
check_aux dtfft_plan Subroutine

Checks if aux buffer was passed by user and if not will allocate one internally

check_continuity dtfft_pencil Function

Check if the local pencils cover the global space without gaps

check_create_args dtfft_plan Function

Checks arguments provided by user and sets private variables

check_device_pointers dtfft_plan Function

Checks if device pointers are provided by user

check_if_even dtfft_pencil Function

Checks if data is evenly distributed across processes

check_if_overflow dtfft_transpose_handle_generic Subroutine

Checks if product of sizes fits into integer(int32)

check_instance dtfft_nvrtc_module Function

Checks if kernel with given parameters is available in this module

check_module dtfft_nvrtc_module Function

Basic check that this module provides kernels of given type

check_overlap dtfft_pencil Function

Check if two pencils overlap in ndims-dimensional space

Comm_f2c dtfft_utils Interface
compare test_host_kernels Subroutine
compile_program dtfft_nvrtc_module Function

Compiles nvRTC program with given configurations

compute_alltoall_schedule dtfft_backend_mpi Subroutine

Generate optimal round-robin communication schedule for all-to-all pattern

config_constructor dtfft_config Function

Creates a new configuration

count_bank_conflicts dtfft_nvrtc_block_optimizer Function

Counts bank conflicts for a given tile size, padding, element size, and block rows.

count_unique dtfft_utils Function

Count the number of unique elements in the array

create dtfft_transpose_plan Function

Creates transposition plan

create dtfft_abstract_kernel Subroutine

Creates kernel

create dtfft_transpose_handle_generic Subroutine

Creates Generic Transpose Handle

create dtfft_backend_cufftmp_m Subroutine

Creates cuFFTMp GPU Backend

create dtfft_pencil Subroutine

Creates pencil

create dtfft_abstract_executor Function

Creates FFT plan

create dtfft_executor_fftw_m Subroutine

Creates FFT plan via FFTW3 Interface

create dtfft_kernel_device Subroutine

Creates kernel

create dtfft_nvrtc_module_cache Subroutine

Creates cache

create dtfft_abstract_transpose_handle Subroutine

Creates transpose handle

create dtfft_executor_mkl_m Subroutine

Creates FFT plan via MKL DFTI Interface

create dtfft_executor_cufft_m Subroutine

Creates FFT plan via cuFFT Interface

create dtfft_nvrtc_module Subroutine

Creates module with given parameters, compiles nvRTC program and loads it as CUDA module

create dtfft_executor_vkfft_m Subroutine

Creates FFT plan via vkFFT Interface

create dtfft_abstract_backend Subroutine

Creates Abstract Backend

create dtfft_transpose_handle_datatype Subroutine

Creates transpose_handle_datatype class

create_1d_comm dtfft_pencil Subroutine

Creates a new 1D communicator based on the fixed dimensions of the current pencil

create_back_permutation dtfft_transpose_handle_datatype Subroutine

Creates three-dimensional Y –> X and Z –> Y transposition datatypes

create_c2c dtfft_plan Subroutine

C2C Plan Constructor

create_c2c_core dtfft_plan Function

Creates plan for both C2C and R2C

create_c2c_internal dtfft_plan Function

Private method that combines common logic for C2C plan creation

create_c2c_pencil dtfft_plan Subroutine

C2C Plan Constructor

create_cart_comm dtfft_transpose_plan Subroutine

Creates cartesian communicator

create_data_handle dtfft_transpose_handle_generic Subroutine

Creates handle

create_forw_permutation dtfft_transpose_handle_datatype Subroutine

Creates three-dimensional X –> Y and Y –> Z transposition datatypes

create_handle dtfft_transpose_handle_datatype Subroutine

Creates transposition handle

create_helper dtfft_backend_mpi Subroutine

Creates MPI helper

create_helper dtfft_abstract_backend Subroutine

Creates helper

create_host dtfft_kernel_host Subroutine

Creates host kernel

create_mpi dtfft_backend_mpi Subroutine

Creates MPI backend

create_nccl dtfft_backend_nccl_m Subroutine

Creates NCCL backend

create_nvrtc_module dtfft_nvrtc_module_cache Subroutine

Creates and adds a new nvrtc module to the cache if it does not already exist

create_nvtx_domain dtfft_interface_nvtx Subroutine

Creates a new NVTX domain

create_pencil_init dtfft_pencil Function

Creates and validates pencil passed by user to plan constructors

create_pencil_t dtfft_pencil Function

Creates pencil object, that can be used to create dtFFT plans

create_pencils_and_comm dtfft_transpose_plan Subroutine

Creates cartesian communicator

create_private dtfft_plan Function

Creates core

create_r2c dtfft_plan Subroutine

R2C Generic Plan Constructor

create_r2c_internal dtfft_plan Function

Private method that combines common logic for R2C plan creation

create_r2c_pencil dtfft_plan Subroutine

R2C Plan Constructor with pencil

create_r2r dtfft_plan Subroutine

R2R Plan Constructor

create_r2r_internal dtfft_plan Function

Creates plan for R2R plans

create_r2r_pencil dtfft_plan Subroutine

R2R Plan Constructor

create_subcomm dtfft_utils Subroutine

Creates communicator with selected processes from old_comm

create_subcomm_include_all dtfft_utils Subroutine

Creates communicator including all processes from old_comm

create_transpose_2d dtfft_transpose_handle_datatype Subroutine

Creates two-dimensional transposition datatypes

create_transpose_XZ dtfft_transpose_handle_datatype Subroutine

Creates three-dimensional X –> Z transposition datatypes. Can only be used with 3D slab decomposition when slabs are distributed in Z direction

create_transpose_ZX dtfft_transpose_handle_datatype Subroutine

Creates three-dimensional Z –> X transposition datatypes. Can only be used with 3D slab decomposition when slabs are distributed in Z direction

cudaDeviceSynchronize dtfft_interface_cuda_runtime Interface
cudaEventCreate dtfft_interface_cuda_runtime Interface
cudaEventCreateWithFlags dtfft_interface_cuda_runtime Interface
cudaEventDestroy dtfft_interface_cuda_runtime Interface
cudaEventElapsedTime dtfft_interface_cuda_runtime Interface
cudaEventRecord dtfft_interface_cuda_runtime Interface
cudaEventSynchronize dtfft_interface_cuda_runtime Interface
cudaFree dtfft_interface_cuda_runtime Interface
cudaGetDevice dtfft_interface_cuda_runtime Interface
cudaGetDeviceCount dtfft_interface_cuda_runtime Interface
cudaGetErrorString dtfft_interface_cuda_runtime Function

Helper function that returns a string describing the given CUDA runtime error code. If the error code is not recognized, “unrecognized error code” is returned.

cudaGetErrorString_c dtfft_interface_cuda_runtime Interface
cudaGetLastError dtfft_interface_cuda_runtime Interface
cudaMalloc dtfft_interface_cuda_runtime Interface
cudaMemcpy dtfft_interface_cuda_runtime Interface

Copies data synchronously between host and device.

cudaMemcpyAsync dtfft_interface_cuda_runtime Interface

Copies data asynchronously between host and device.

cudaMemGetInfo dtfft_interface_cuda_runtime Interface
cudaMemset dtfft_interface_cuda_runtime Interface
cudaSetDevice dtfft_interface_cuda_runtime Interface
cudaStreamCreate dtfft_interface_cuda_runtime Interface
cudaStreamDestroy dtfft_interface_cuda_runtime Interface
cudaStreamQuery dtfft_interface_cuda_runtime Interface
cudaStreamSynchronize dtfft_interface_cuda_runtime Interface
cudaStreamWaitEvent dtfft_interface_cuda_runtime Interface
cufftDestroy dtfft_interface_cufft Interface

Frees all GPU resources associated with a cuFFT plan and destroys the internal plan data structure.

cufftGetErrorString dtfft_interface_cufft Function

Returns a string representation of the cuFFT error code.

cufftMpAttachReshapeComm dtfft_interface_cufft Interface

Attaches a communication handle to a reshape. This function is not collective.

cufftMpCreateReshape dtfft_interface_cufft Interface

Initializes a reshape handle for future use. This function is not collective.

cufftMpDestroyReshape dtfft_interface_cufft Interface

Destroys a reshape and all its associated data.

cufftMpExecReshapeAsync dtfft_interface_cufft Interface

Executes the reshape, redistributing data_in into data_out using the workspace in workspace.

cufftMpGetReshapeSize dtfft_interface_cufft Interface

Returns the amount (in bytes) of workspace required to execute the handle.

cufftMpMakeReshape dtfft_interface_cufft Interface

Creates a reshape intended to re-distribute a global array of 3D data.

cufftPlanMany dtfft_interface_cufft Interface

Creates a FFT plan configuration of dimension rank, with sizes specified in the array n.

cufftSetStream dtfft_interface_cufft Interface

Associates a CUDA stream with a cuFFT plan.

cufftXtExec dtfft_interface_cufft Interface

Executes any cuFFT transform regardless of precision and type. In case of complex-to-real and real-to-complex transforms, the direction parameter is ignored.

cuLaunchKernel dtfft_interface_cuda Function

Launches a CUDA kernel

destroy dtfft_transpose_plan Subroutine

Destroys transposition plans

destroy dtfft_abstract_kernel Subroutine

Destroys kernel

destroy dtfft_transpose_handle_generic Subroutine

Destroys Generic Transpose Handle

destroy dtfft_backend_cufftmp_m Subroutine

Destroys cuFFTMp GPU Backend

destroy dtfft_pencil Subroutine

Destroys pencil

destroy dtfft_abstract_executor Subroutine

Destroys plan

destroy dtfft_executor_fftw_m Subroutine

Destroys FFTW3 plan

destroy dtfft_kernel_device Subroutine

Destroys kernel

destroy dtfft_executor_mkl_m Subroutine

Destroys MKL plan

destroy dtfft_executor_cufft_m Subroutine

Destroys cuFFT plan

destroy dtfft_nvrtc_module Subroutine

Destroys module and frees resources

destroy dtfft_executor_vkfft_m Subroutine

Destroys vkFFT plan

destroy dtfft_plan Subroutine

Destroys plan, frees all memory

destroy dtfft_abstract_backend Subroutine

Destroys Abstract Backend

destroy dtfft_transpose_handle_datatype Subroutine

Destroys transpose_handle_datatype class

destroy_data_handle dtfft_transpose_handle_generic Subroutine

Destroys handle

destroy_handle dtfft_transpose_handle_datatype Subroutine

Destroys transposition handle

destroy_helper dtfft_backend_mpi Subroutine

Destroys MPI helper

destroy_helper dtfft_abstract_backend Subroutine

Destroys helper

destroy_host dtfft_kernel_host Subroutine

Destroys host kernel

destroy_mpi dtfft_backend_mpi Subroutine

Destroys MPI backend

destroy_nccl dtfft_backend_nccl_m Subroutine

Destroys NCCL backend

destroy_pencil_init dtfft_pencil Subroutine

Destroys pencil_init

destroy_pencil_t dtfft_pencil Subroutine

Destroys pencil

destroy_pencil_t_private dtfft_pencil Subroutine

Destroys pencil

destroy_plans dtfft_transpose_plan Subroutine

Destroys array of plans

destroy_stream dtfft_config Subroutine

Destroy the default stream if it was created

destroy_string dtfft_utils Subroutine
destroy_strings dtfft_utils Subroutine

Destroys array of string objects

DftiErrorMessage dtfft_interface_mkl_m Function

Generates an error message.

DftiErrorMessage_c dtfft_interface_mkl_m Interface
dl_error dtfft_utils Subroutine

Writes error message to the error unit

dlclose dtfft_utils Interface
dlerror dtfft_utils Interface
dlopen dtfft_utils Interface
dlsym dtfft_utils Interface
double_to_string dtfft_utils Function

Convert double to string

dtfft_config_t dtfft_config Interface

Interface to create a new configuration

dtfft_create_config dtfft_config Subroutine

Creates a new configuration and sets default values.

Read more…
dtfft_create_plan_c2c_c dtfft_api Function

Creates C2C dtFFT Plan, allocates all structures and prepares FFT, C interface

dtfft_create_plan_c2c_pencil_c dtfft_api Function

Creates C2C dtFFT plan from Pencil, allocates all structures and prepares FFT, C interface

dtfft_create_plan_r2r_c dtfft_api Function

Creates R2R dtFFT Plan, allocates all structures and prepares FFT, C interface

dtfft_create_plan_r2r_pencil_c dtfft_api Function

Creates R2R dtFFT Plan from Pencil, allocates all structures and prepares FFT, C interface

dtfft_destroy_c dtfft_api Function

Destroys dtFFT Plan, C interface

dtfft_execute_c dtfft_api Function

Executes dtFFT Plan, C interface. aux can be NULL.

dtfft_get_alloc_bytes_c dtfft_api Function

Returns minimum number of bytes required to execute plan, C interface

dtfft_get_alloc_size_c dtfft_api Function

Returns minimum number of bytes to be allocated for in and out buffers, C interface

dtfft_get_backend_c dtfft_api Function

Returns selected dtfft_backend_t during autotuning

dtfft_get_backend_string dtfft_parameters Function

Gets the string description of a backend

dtfft_get_backend_string_c dtfft_api Subroutine

Returns string representation of dtfft_backend_t

dtfft_get_cuda_stream dtfft_parameters Function

Returns the CUDA stream from dtfft_stream_t

dtfft_get_dims_c dtfft_api Function

Returns dimensions of plan, C interface

dtfft_get_element_size_c dtfft_api Function

Returns size of element in bytes, C interface

dtfft_get_error_string dtfft_errors Function

Gets the string description of an error code

dtfft_get_error_string_c dtfft_api Subroutine

Returns an explanation of error_code that could have been previously returned by one of dtFFT API calls, C interface

dtfft_get_executor_c dtfft_api Function

Returns executor type used in plan, C interface

dtfft_get_executor_string dtfft_parameters Function

Gets the string description of an executor

dtfft_get_executor_string_c dtfft_api Subroutine

Returns string representation of dtfft_executor_t, C interface

dtfft_get_grid_dims_c dtfft_api Function

Returns grid decomposition dimensions of plan, C interface

dtfft_get_local_sizes_c dtfft_api Function

Returns local sizes, counts in real and Fourier spaces and number of elements to be allocated for in and out buffers, C interface.

dtfft_get_pencil_c dtfft_api Function

Returns pencil decomposition info, C interface

dtfft_get_platform_c dtfft_api Function

Returns selected dtfft_platform_t during autotuning

dtfft_get_precision_c dtfft_api Function

Returns precision used in plan, C interface

dtfft_get_precision_string dtfft_parameters Function

Gets the string description of a precision

dtfft_get_precision_string_c dtfft_api Subroutine

Returns string representation of dtfft_precision_t, C interface

dtfft_get_stream_c dtfft_api Function

Returns Stream associated with plan

dtfft_get_version dtfft_parameters Interface

Get dtFFT version

dtfft_get_version_current dtfft_parameters Function

Returns the current version code

dtfft_get_version_required dtfft_parameters Function

Returns the version code required by the user

dtfft_get_y_slab_enabled_c dtfft_api Function

Checks if dtFFT Plan is using Y-slab optimization

dtfft_get_z_slab_enabled_c dtfft_api Function

Checks if dtFFT Plan is using Z-slab optimization

dtfft_mem_alloc_c dtfft_api Function

Allocates memory for dtFFT Plan, C interface

dtfft_mem_free_c dtfft_api Function

Frees memory for dtFFT Plan, C interface

dtfft_pencil_t dtfft_pencil Interface

Type bound constructor for dtfft_pencil_t

dtfft_report_c dtfft_api Function

Reports dtFFT Plan, C interface

dtfft_set_config dtfft_config Subroutine

Sets configuration parameters

dtfft_set_config_c dtfft_api Function

Sets dtFFT configuration, C interface

dtfft_stream_t dtfft_parameters Interface

Creates dtfft_stream_t from integer(cuda_stream_kind)

dtfft_transpose_c dtfft_api Function

Executes single transposition, C interface.

dtfft_transpose_end_c dtfft_api Function

Finishes asynchronous transposition, C interface.

dtfft_transpose_start_c dtfft_api Function

Starts asynchronous transposition, returns transpose handle, C interface.

dynamic_load dtfft_utils Function

Dynamically loads library and its symbols

effort_eq dtfft_parameters Function
effort_ne dtfft_parameters Function
estimate_bank_conflict_ratio dtfft_nvrtc_block_optimizer Function

Estimates the bank conflict ratio for a given kernel configuration

estimate_coalescing dtfft_nvrtc_block_optimizer Function

Estimate memory coalescing efficiency for a given kernel configuration and transpose type

estimate_memory_pressure dtfft_nvrtc_block_optimizer Function

Analytical estimation of memory pressure based on GPU architecture

estimate_occupancy dtfft_nvrtc_block_optimizer Function

Calculates theoretical occupancy for a given kernel configuration

estimate_optimal_padding dtfft_nvrtc_block_optimizer Function

Estimates the optimal padding for a given tile size and element size

evaluate_analytical_performance dtfft_nvrtc_block_optimizer Function

This function evaluates the performance of a kernel configuration based on various architectural and problem-specific parameters.

exec_eq dtfft_parameters Function
execute dtfft_transpose_plan Subroutine

Executes transposition

execute dtfft_abstract_kernel Subroutine

Executes kernel

execute dtfft_transpose_handle_generic Subroutine

Executes transpose - exchange - unpack

execute dtfft_backend_cufftmp_m Subroutine

Executes cuFFTMp GPU Backend

execute dtfft_abstract_executor Subroutine

Executes plan

execute dtfft_executor_fftw_m Subroutine

Executes FFTW3 plan

execute dtfft_kernel_device Subroutine

Executes kernel on stream

execute dtfft_executor_mkl_m Subroutine

Executes MKL plan

execute dtfft_executor_cufft_m Subroutine

Executes cuFFT plan

execute dtfft_executor_vkfft_m Subroutine

Executes vkFFT plan

execute dtfft_plan Subroutine

Executes plan

execute dtfft_abstract_backend Subroutine

Executes Backend

execute dtfft_transpose_handle_datatype Subroutine

Executes transposition

execute_2d dtfft_plan Subroutine

Executes plan with specified auxiliary buffer

execute_a2a dtfft_backend_mpi Subroutine
execute_benchmark dtfft_kernel_host Subroutine

Executes benchmark for the given kernel

execute_end dtfft_transpose_plan Subroutine

Finishes asynchronous transposition

execute_end dtfft_transpose_handle_generic Subroutine

Ends execution of transposition

execute_end dtfft_abstract_backend Subroutine

Ends execution of Backend

execute_end dtfft_transpose_handle_datatype Subroutine

Ends execution of transposition

execute_end_mpi dtfft_backend_mpi Subroutine
execute_f128 dtfft_kernel_host Subroutine

Executes kernel based on its type and access mode, complex(real64) version

execute_f128_block_16 dtfft_kernel_host Subroutine

Executes the given kernel on host

execute_f128_block_32 dtfft_kernel_host Subroutine

Executes the given kernel on host

execute_f128_block_64 dtfft_kernel_host Subroutine

Executes the given kernel on host

execute_f32 dtfft_kernel_host Subroutine

Executes kernel based on its type and access mode, real(real32) version

execute_f32_block_16 dtfft_kernel_host Subroutine

Executes the given kernel on host

execute_f32_block_32 dtfft_kernel_host Subroutine

Executes the given kernel on host

execute_f32_block_64 dtfft_kernel_host Subroutine

Executes the given kernel on host

execute_f64 dtfft_kernel_host Subroutine

Executes kernel based on its type and access mode, real(real64) version

execute_f64_block_16 dtfft_kernel_host Subroutine

Executes the given kernel on host

execute_f64_block_32 dtfft_kernel_host Subroutine

Executes the given kernel on host

execute_f64_block_64 dtfft_kernel_host Subroutine

Executes the given kernel on host

execute_generic dtfft_plan Subroutine

Executes plan with specified auxiliary buffer

execute_host dtfft_kernel_host Subroutine

Executes host kernel

execute_mpi dtfft_backend_mpi Subroutine

Executes MPI backend

execute_nccl dtfft_backend_nccl_m Subroutine

Executes NCCL backend

execute_p2p dtfft_backend_mpi Subroutine
execute_p2p_scheduled dtfft_backend_mpi Subroutine
execute_private dtfft_plan Subroutine

Executes plan with specified auxiliary buffer

execute_ptr dtfft_plan Subroutine

Executes plan using type(c_ptr) pointers instead of buffers

execute_self_copy dtfft_abstract_backend Subroutine
execute_type_eq dtfft_parameters Function
execute_type_ne dtfft_parameters Function
execute_z_slab dtfft_plan Subroutine

Executes plan with specified auxiliary buffer

executor_eq dtfft_parameters Function
executor_ne dtfft_parameters Function
fftw_execute_dft dtfft_interface_fftw_m Interface
fftw_execute_dft_c2r dtfft_interface_fftw_m Interface
fftw_execute_dft_r2c dtfft_interface_fftw_m Interface
fftw_execute_r2r dtfft_interface_fftw_m Interface
fftw_plan_many_dft dtfft_interface_fftw_m Interface
fftw_plan_many_dft_c2r dtfft_interface_fftw_m Interface
fftw_plan_many_dft_r2c dtfft_interface_fftw_m Interface
fftw_plan_many_r2r dtfft_interface_fftw_m Interface
fftwf_execute_dft dtfft_interface_fftw_m Interface
fftwf_execute_dft_c2r dtfft_interface_fftw_m Interface
fftwf_execute_dft_r2c dtfft_interface_fftw_m Interface
fftwf_execute_r2r dtfft_interface_fftw_m Interface
fftwf_plan_many_dft dtfft_interface_fftw_m Interface
fftwf_plan_many_dft_c2r dtfft_interface_fftw_m Interface
fftwf_plan_many_dft_r2c dtfft_interface_fftw_m Interface
fftwf_plan_many_r2r dtfft_interface_fftw_m Interface
find_valid_combination dtfft_nvrtc_block_optimizer Subroutine

This subroutine optimizes the tile size and number of rows for narrow matrices by adjusting them to be compatible with the warp size.

float_to_string dtfft_utils Function

Convert float to string

free_datatypes dtfft_transpose_handle_datatype Subroutine

Frees temporary datatypes

free_mem dtfft_transpose_plan Subroutine

Frees memory based on backend

generate_candidates dtfft_nvrtc_block_optimizer Subroutine

Generate kernel configuration candidates for given problem

get dtfft_nvrtc_module Function

Returns kernel ready to be executed

get_alloc_bytes dtfft_plan Function

Returns minimum number of bytes required to execute plan

get_alloc_size dtfft_plan Function

Wrapper around get_local_sizes to obtain number of elements only

get_ampere_architecture dtfft_nvrtc_block_optimizer Function

Ampere architecture (Compute Capability 8.0)

get_async_active dtfft_transpose_plan Function

Returns .true. if any of the plans is running asynchronously

get_async_active dtfft_transpose_handle_generic Function
get_async_active dtfft_backend_mpi Function

Returns if async transpose is active

get_async_active dtfft_abstract_backend Function

Returns if async execution is active

get_async_active dtfft_transpose_handle_datatype Function

Returns if async transpose is active

get_aux_size dtfft_transpose_plan Function

Returns maximum auxiliary memory size needed by transpose plan

get_aux_size dtfft_transpose_handle_generic Function

Returns number of bytes required by aux buffer

get_aux_size dtfft_abstract_transpose_handle Function

Returns number of bytes required by aux buffer

get_aux_size dtfft_abstract_backend Function

Returns number of bytes required by aux buffer

get_aux_size_generic dtfft_transpose_plan Function

Returns maximum auxiliary memory size needed by plans

get_backend dtfft_transpose_plan Function

Returns plan GPU backend

get_backend dtfft_plan Function

Returns selected GPU backend during autotuning

get_code dtfft_nvrtc_module Function

Generates code that will be used to locally transpose data and prepares to send it to other processes

get_comm dtfft_api Function

Converts C communicator to Fortran communicator

get_conf_backend dtfft_config Function

Returns backend set by the user or default one

get_conf_configs_to_test dtfft_config Function

Returns the number of configurations to test

get_conf_datatype_enabled dtfft_config Function

Whether MPI Datatype backend is enabled or not

get_conf_forced_kernel_optimization dtfft_config Function

Whether forced kernel optimization is enabled or not

get_conf_internal dtfft_config Interface

Returns value from configuration unless environment variable is set

get_conf_internal_int32 dtfft_config Function

Returns value from configuration unless environment variable is set

get_conf_internal_logical dtfft_config Function

Returns value from configuration unless environment variable is set

get_conf_kernel_optimization_enabled dtfft_config Function

Whether kernel optimization is enabled or not

get_conf_log_enabled dtfft_config Function

Whether logging is enabled or not

get_conf_measure_iters dtfft_config Function

Returns the number of measurement iterations

get_conf_measure_warmup_iters dtfft_config Function

Returns the number of warmup iterations

get_conf_mpi_enabled dtfft_config Function

Whether MPI backends are enabled or not

get_conf_nccl_enabled dtfft_config Function

Whether NCCL backends are enabled or not

get_conf_nvshmem_enabled dtfft_config Function

Whether nvshmem backends are enabled or not

get_conf_pipelined_enabled dtfft_config Function

Whether pipelined backends are enabled or not

get_conf_platform dtfft_config Function

Returns platform set by the user or default one

get_conf_stream dtfft_config Function

Returns either the custom stream provided by the user or creates a new one

get_conf_y_slab_enabled dtfft_config Function

Whether Y-slab optimization is enabled or not

get_conf_z_slab_enabled dtfft_config Function

Whether Z-slab optimization is enabled or not

get_correct_backend dtfft_config Function
get_datatype_from_env dtfft_config Function

Obtains datatype id from environment variable

get_device_props dtfft_interface_cuda_runtime Interface
get_dims dtfft_plan Subroutine

Returns global dimensions

get_element_size dtfft_plan Function

Returns number of bytes required to store single element.

get_env dtfft_config Interface

Obtains environment variable

get_env_base dtfft_config Function

Base function of obtaining dtFFT environment variable

get_env_int32 dtfft_config Function

Base Integer function of obtaining dtFFT environment variable

get_env_int8 dtfft_config Function

Obtains int8 environment variable

get_env_logical dtfft_config Function

Obtains logical environment variable

get_env_string dtfft_config Function

Obtains string environment variable

get_executor dtfft_plan Function

Returns FFT Executor associated with plan

get_grid_dims dtfft_plan Subroutine

Returns grid decomposition dimensions

get_host_kernel_string dtfft_kernel_host Function

Returns string representation of the given host kernel type

get_inverse_kind dtfft_utils Function

Get the inverse R2R kind of transform for the given R2R kind

get_kernel dtfft_kernel_device Subroutine

Compiles kernel and caches it. Returns compiled kernel.

get_kernel_args dtfft_kernel_device Subroutine

Populates kernel arguments based on kernel type

get_kernel_instance dtfft_nvrtc_module_cache Function

Retrieves a kernel instance from the cache. If the instance is not found, an error is raised

get_kernel_launch_params dtfft_kernel_device Subroutine

Computes kernel launch parameters based on kernel type and dimensions

get_kernel_string dtfft_abstract_kernel Function

Gets the string description of a kernel

get_local_size dtfft_pencil Subroutine

Computes local portions of data based on global count and position inside grid communicator

get_local_sizes dtfft_pencil Subroutine

Obtain local starts and counts in real and fourier spaces

get_local_sizes dtfft_plan Subroutine

Obtain local starts and counts in real and fourier spaces

get_mangled_name dtfft_nvrtc_module Function

Gets mangled name for given template parameters from nvRTC program

get_name_expression dtfft_nvrtc_module Function

Generates name expression for given template parameters

get_pencil dtfft_plan Function

Returns pencil decomposition

get_plan_execution_time dtfft_transpose_plan Function

Creates transpose plan for backend DTFFT_BACKEND_MPI_DATATYPE and executes it DTFFT_MEASURE_WARMUP_ITERS + DTFFT_MEASURE_ITERS times

Read more…
get_platform dtfft_plan Function

Returns execution platform of the plan (HOST or CUDA)

get_precision dtfft_plan Function

Returns precision of the plan

get_stream_int64 dtfft_plan Subroutine

Returns CUDA stream associated with plan

get_stream_ptr dtfft_plan Subroutine

Returns CUDA stream associated with plan

get_transpose_type dtfft_pencil Function

Determines transpose ID based on pencils

get_varying_dim dtfft_pencil Function
get_volta_architecture dtfft_nvrtc_block_optimizer Function

Volta architecture (Compute Capability 7.0)

get_y_slab_enabled dtfft_plan Function

Returns a logical value indicating whether Y-slab optimization is enabled internally

get_z_slab dtfft_transpose_plan Function

Returns .true. if Z-slab optimization is enabled

get_z_slab_enabled dtfft_plan Function

Returns a logical value indicating whether Z-slab optimization is enabled internally

host_kernel_eq dtfft_kernel_host Function
init_environment dtfft_config Subroutine
init_internal dtfft_config Function

Checks if MPI is initialized and loads environment variables

init_nvshmem dtfft_interface_nvshmem Interface
int32_to_string dtfft_utils Function

Convert 32-bit integer to string

int64_to_string dtfft_utils Function

Convert 64-bit integer to string

int8_to_string dtfft_utils Function

Convert 8-bit integer to string

is_backend_cufftmp dtfft_parameters Function
is_backend_mpi dtfft_parameters Function
is_backend_nccl dtfft_parameters Function
is_backend_nvshmem dtfft_parameters Function
is_backend_pipelined dtfft_parameters Function
is_cuda_executor dtfft_parameters Function
is_device_ptr dtfft_utils Interface
is_host_executor dtfft_parameters Function
is_null_funptr dtfft_utils Function

Checks if pointer is NULL

is_null_ptr dtfft_utils Function

Checks if pointer is NULL

is_null_ptr dtfft_utils Interface

Checks if pointer is NULL

is_nvshmem_ptr dtfft_interface_nvshmem Function

Checks if pointer is a symmetric nvshmem allocated pointer

is_same_ptr dtfft_utils Function

Checks if two pointers are the same

is_transpose_kernel dtfft_abstract_kernel Function
is_unpack_kernel dtfft_abstract_kernel Function
is_valid_backend dtfft_parameters Function
is_valid_comm_type dtfft_parameters Function
is_valid_dimension dtfft_parameters Function
is_valid_effort dtfft_parameters Function
is_valid_execute_type dtfft_parameters Function
is_valid_executor dtfft_parameters Function
is_valid_platform dtfft_parameters Function
is_valid_precision dtfft_parameters Function
is_valid_r2r_kind dtfft_parameters Function
is_valid_transpose_type dtfft_parameters Function
kernel_type_eq dtfft_abstract_kernel Function
kernel_type_ne dtfft_abstract_kernel Function
load dtfft_interface_vkfft_m Function

Loads VkFFT library

load_cuda dtfft_interface_cuda Function

Loads the CUDA Driver library and needed symbols

load_library dtfft_utils Function

Dynamically loads library

load_nvrtc dtfft_interface_nvrtc Function

Dynamically loads nvRTC library and its functions

load_symbol dtfft_utils Function

Dynamically loads symbol from library

load_vkfft dtfft_interface_vkfft_m Function

Loads VkFFT library based on the platform

make_plan dtfft_executor_mkl_m Subroutine

Creates general MKL plan

make_public dtfft_pencil Function

Creates public object that users can use to create own FFT backends

mem_alloc dtfft_transpose_plan Subroutine

Allocates memory based on selected backend

mem_alloc dtfft_executor_fftw_m Subroutine

Allocates FFTW3 memory

mem_alloc dtfft_executor_mkl_m Subroutine

Allocates MKL memory

mem_alloc dtfft_executor_cufft_m Subroutine

Dummy method. Raises error stop

mem_alloc dtfft_executor_vkfft_m Subroutine

Dummy method. Raises error stop

mem_alloc_c32_1d dtfft_plan Subroutine

Allocates pointer of rank 1

mem_alloc_c32_2d dtfft_plan Subroutine

Allocates pointer of rank 2

mem_alloc_c32_3d dtfft_plan Subroutine

Allocates pointer of rank 3

mem_alloc_c64_1d dtfft_plan Subroutine

Allocates pointer of rank 1

mem_alloc_c64_2d dtfft_plan Subroutine

Allocates pointer of rank 2

mem_alloc_c64_3d dtfft_plan Subroutine

Allocates pointer of rank 3

mem_alloc_host dtfft_utils Function

Allocates memory using C11 Standard aligned_alloc with 16-byte alignment

mem_alloc_ptr dtfft_plan Function

Allocates memory specific for this plan

mem_alloc_r32_1d dtfft_plan Subroutine

Allocates pointer of rank 1

mem_alloc_r32_2d dtfft_plan Subroutine

Allocates pointer of rank 2

mem_alloc_r32_3d dtfft_plan Subroutine

Allocates pointer of rank 3

mem_alloc_r64_1d dtfft_plan Subroutine

Allocates pointer of rank 1

mem_alloc_r64_2d dtfft_plan Subroutine

Allocates pointer of rank 2

mem_alloc_r64_3d dtfft_plan Subroutine

Allocates pointer of rank 3

mem_free dtfft_transpose_plan Subroutine

Frees memory allocated with mem_alloc

mem_free dtfft_executor_fftw_m Subroutine

Frees FFTW3 aligned memory

mem_free dtfft_executor_mkl_m Subroutine

Frees MKL aligned memory

mem_free dtfft_executor_cufft_m Subroutine

Dummy method. Raises error stop

mem_free dtfft_executor_vkfft_m Subroutine

Dummy method. Raises error stop

mem_free_c32_1d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_c32_2d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_c32_3d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_c64_1d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_c64_2d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_c64_3d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_host dtfft_utils Interface
mem_free_ptr dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_r32_1d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_r32_2d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_r32_3d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_r64_1d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_r64_2d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mem_free_r64_3d dtfft_plan Subroutine

Frees previously allocated memory specific for this plan

mkl_dfti_commit_desc dtfft_interface_mkl_m Interface
mkl_dfti_create_desc dtfft_interface_mkl_m Interface
mkl_dfti_execute dtfft_interface_mkl_m Interface
mkl_dfti_free_desc dtfft_interface_mkl_m Interface
mkl_dfti_mem_alloc dtfft_interface_mkl_m Interface
mkl_dfti_mem_free dtfft_interface_mkl_m Interface
mkl_dfti_set_value dtfft_interface_mkl_m Interface

Sets one particular configuration parameter with the specified configuration value.

ncclCommDeregister dtfft_interface_nccl Interface

Deregister a buffer for collective communication.

ncclCommDestroy dtfft_interface_nccl Interface

Destroy a communicator object comm.

ncclCommInitRank dtfft_interface_nccl Interface

Creates a new communicator (multi thread/process version).

Read more…
ncclCommRegister dtfft_interface_nccl Interface

Register a buffer for collective communication.

ncclGetErrorString dtfft_interface_nccl Function

Generates an error message.

ncclGetErrorString_c dtfft_interface_nccl Interface

Returns a human-readable string corresponding to the passed error code.

ncclGetUniqueId dtfft_interface_nccl Interface

Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be called once when creating a communicator and the Id should be distributed to all ranks in the communicator before calling ncclCommInitRank. uniqueId should point to a ncclUniqueId object allocated by the user.

ncclGroupEnd dtfft_interface_nccl Interface

End a group call.

Read more…
ncclGroupStart dtfft_interface_nccl Interface

Start a group call.

Read more…
ncclMemAlloc dtfft_interface_nccl Interface

Allocate a GPU buffer with size. Allocated buffer head address will be returned by ptr, and the actual allocated size can be larger than requested because of the buffer granularity requirements from all types of NCCL optimizations.

ncclMemFree dtfft_interface_nccl Interface

Free memory allocated by ncclMemAlloc().

ncclRecv dtfft_interface_nccl Interface

Receive data from rank peer into recvbuff.

Read more…
ncclSend dtfft_interface_nccl Interface

Send data from sendbuff to rank peer.

Read more…
nvrtcGetErrorString dtfft_interface_nvrtc Function

Helper function that returns a string describing the given nvrtcResult code. For unrecognized enumeration values, it returns “NVRTC_ERROR unknown”

nvshmem_finalize_ dtfft_interface_nvshmem Interface
nvshmem_free dtfft_interface_nvshmem Interface
nvshmem_malloc dtfft_interface_nvshmem Interface
nvshmem_my_pe dtfft_interface_nvshmem Interface
nvshmem_ptr dtfft_interface_nvshmem Interface
nvshmemx_float_alltoall_on_stream dtfft_interface_nvshmem Interface
nvshmemx_init_status dtfft_interface_nvshmem Interface
nvshmemx_sync_all_on_stream dtfft_interface_nvshmem Interface
nvtxDomainCreate_c dtfft_interface_nvtx Interface
nvtxDomainRangePop_c dtfft_interface_nvtx Interface
nvtxDomainRangePushEx_c dtfft_interface_nvtx Interface
operator(/=) dtfft_parameters Interface
operator(/=) dtfft_abstract_kernel Interface
operator(==) dtfft_parameters Interface
operator(==) dtfft_abstract_kernel Interface
operator(==) dtfft_kernel_host Interface
pencil_c2f dtfft_pencil Subroutine

Converts C pencil to Fortran pencil

pencil_f2c dtfft_pencil Subroutine

Converts Fortran pencil to C pencil

permute_backward_end_pipelined_read_f128 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor, contiguous reading, complex(real64) version

Read more…
permute_backward_end_pipelined_read_f128_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_read_f128_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_read_f128_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_read_f32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor, contiguous reading, real(real32) version

Read more…
permute_backward_end_pipelined_read_f32_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_read_f32_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_read_f32_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_read_f64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor, contiguous reading, real(real64) version

Read more…
permute_backward_end_pipelined_read_f64_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_read_f64_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_read_f64_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_write_f128 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor, contiguous writing, complex(real64) version

Read more…
permute_backward_end_pipelined_write_f128_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_write_f128_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_write_f128_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_write_f32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor, contiguous writing, real(real32) version

Read more…
permute_backward_end_pipelined_write_f32_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_write_f32_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_write_f32_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_write_f64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor, contiguous writing, real(real64) version

Read more…
permute_backward_end_pipelined_write_f64_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_write_f64_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_pipelined_write_f64_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for a single neighbor

Read more…
permute_backward_end_read_f128 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors, contiguous reading, complex(real64) version

Read more…
permute_backward_end_read_f128_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_read_f128_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_read_f128_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_read_f32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors, contiguous reading, real(real32) version

Read more…
permute_backward_end_read_f32_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_read_f32_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_read_f32_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_read_f64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors, contiguous reading, real(real64) version

Read more…
permute_backward_end_read_f64_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_read_f64_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_read_f64_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_write_f128 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors, contiguous writing, complex(real64) version

Read more…
permute_backward_end_write_f128_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_write_f128_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_write_f128_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_write_f32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors, contiguous writing, real(real32) version

Read more…
permute_backward_end_write_f32_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_write_f32_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_write_f32_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_write_f64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors, contiguous writing, real(real64) version

Read more…
permute_backward_end_write_f64_block_16 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_write_f64_block_32 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_end_write_f64_block_64 dtfft_kernel_host Subroutine

Backward permutation end of a 3D array for all neighbors

Read more…
permute_backward_read_f128 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays, contiguous reading, complex(real64) version

Read more…
permute_backward_read_f128_block_16 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_read_f128_block_32 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_read_f128_block_64 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_read_f32 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays, contiguous reading, real(real32) version

Read more…
permute_backward_read_f32_block_16 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_read_f32_block_32 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_read_f32_block_64 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_read_f64 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays, contiguous reading, real(real64) version

Read more…
permute_backward_read_f64_block_16 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_read_f64_block_32 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_read_f64_block_64 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_start_read_f128 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array, contiguous reading, complex(real64) version

Read more…
permute_backward_start_read_f128_block_16 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_read_f128_block_32 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_read_f128_block_64 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_read_f32 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array, contiguous reading, real(real32) version

Read more…
permute_backward_start_read_f32_block_16 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_read_f32_block_32 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_read_f32_block_64 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_read_f64 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array, contiguous reading, real(real64) version

Read more…
permute_backward_start_read_f64_block_16 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_read_f64_block_32 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_read_f64_block_64 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_write_f128 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array, contiguous writing, complex(real64) version

Read more…
permute_backward_start_write_f128_block_16 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_write_f128_block_32 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_write_f128_block_64 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_write_f32 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array, contiguous writing, real(real32) version

Read more…
permute_backward_start_write_f32_block_16 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_write_f32_block_32 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_write_f32_block_64 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_write_f64 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array, contiguous writing, real(real64) version

Read more…
permute_backward_start_write_f64_block_16 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_write_f64_block_32 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_start_write_f64_block_64 dtfft_kernel_host Subroutine

Backward permutation start of a 3D array

Read more…
permute_backward_write_f128 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays, contiguous writing, complex(real64) version

Read more…
permute_backward_write_f128_block_16 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_write_f128_block_32 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_write_f128_block_64 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_write_f32 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays, contiguous writing, real(real32) version

Read more…
permute_backward_write_f32_block_16 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_write_f32_block_32 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_write_f32_block_64 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_write_f64 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays, contiguous writing, real(real64) version

Read more…
permute_backward_write_f64_block_16 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_write_f64_block_32 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_backward_write_f64_block_64 dtfft_kernel_host Subroutine

Backward permutation of 2D and 3D arrays

Read more…
permute_forward_read_f128 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays, contiguous reading, complex(real64) version

Read more…
permute_forward_read_f128_block_16 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_read_f128_block_32 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_read_f128_block_64 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_read_f32 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays, contiguous reading, real(real32) version

Read more…
permute_forward_read_f32_block_16 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_read_f32_block_32 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_read_f32_block_64 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_read_f64 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays, contiguous reading, real(real64) version

Read more…
permute_forward_read_f64_block_16 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_read_f64_block_32 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_read_f64_block_64 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_write_f128 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays, contiguous writing, complex(real64) version

Read more…
permute_forward_write_f128_block_16 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_write_f128_block_32 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_write_f128_block_64 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_write_f32 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays, contiguous writing, real(real32) version

Read more…
permute_forward_write_f32_block_16 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_write_f32_block_32 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_write_f32_block_64 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_write_f64 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays, contiguous writing, real(real64) version

Read more…
permute_forward_write_f64_block_16 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_write_f64_block_32 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
permute_forward_write_f64_block_64 dtfft_kernel_host Subroutine

Forward permutation of 2D and 3D arrays

Read more…
platform_eq dtfft_parameters Function
platform_ne dtfft_parameters Function
pop_nvtx_domain_range dtfft_interface_nvtx Subroutine

Pops a range from the NVTX domain

precision_eq dtfft_parameters Function
precision_ne dtfft_parameters Function
push_nvtx_domain_range dtfft_interface_nvtx Subroutine

Pushes a range to the NVTX domain

r2r_kind_eq dtfft_parameters Function
r2r_kind_ne dtfft_parameters Function
report dtfft_plan Subroutine

Prints plan-related information to stdout

report_timings dtfft_transpose_plan Function
run_autotune_backend dtfft_transpose_plan Subroutine

Runs autotune for all backends. Symmetric heap can be allocated after nvshmem_init, which is done during plan creation

run_autotune_datatypes dtfft_transpose_plan Subroutine
run_permute_backward test_host_kernels Subroutine
run_permute_backward_end test_host_kernels Subroutine
run_permute_backward_start test_host_kernels Subroutine
run_permute_forward test_host_kernels Subroutine
run_unpack test_host_kernels Subroutine
select_access_mode_f128 dtfft_kernel_host Subroutine

Selects the best access mode for host kernels, complex(real64) version

select_access_mode_f32 dtfft_kernel_host Subroutine

Selects the best access mode for host kernels, real(real32) version

select_access_mode_f64 dtfft_kernel_host Subroutine

Selects the best access mode for host kernels, real(real64) version

select_kernel dtfft_kernel_host Function

Selects the kernel implementation based on the given id and base storage size

set_name_expression dtfft_nvrtc_module Subroutine

Sets name expression for given template parameters to nvRTC program

set_unpack_kernel dtfft_abstract_backend Subroutine

Sets unpack kernel for pipelined backend

sort_by_varying_dim dtfft_pencil Subroutine
sort_candidates_by_score dtfft_nvrtc_block_optimizer Subroutine

Sorting candidates by their performance scores

stream_from_int64 dtfft_parameters Function

Creates dtfft_stream_t from integer(cuda_stream_kind)

string dtfft_utils Interface

Creates string object

string_c2f dtfft_utils Subroutine

Convert C string to Fortran string

string_constructor dtfft_utils Function

Creates string object

string_f2c dtfft_utils Subroutine

Convert Fortran string to C string

to_str dtfft_utils Interface

Convert various types to string

transpose dtfft_plan Subroutine

Performs single transposition

Read more…
transpose_end dtfft_plan Subroutine

Ends previously started transposition

transpose_private dtfft_plan Subroutine

Performs single transposition using type(c_ptr) pointers instead of buffers

Read more…
transpose_ptr dtfft_plan Subroutine

Performs single transposition using type(c_ptr) pointers instead of buffers

Read more…
transpose_start dtfft_plan Function

Starts an asynchronous transpose operation

Read more…
transpose_start_ptr dtfft_plan Function

Starts an asynchronous transpose operation using type(c_ptr) pointers instead of buffers

Read more…
transpose_type_eq dtfft_parameters Function
transpose_type_ne dtfft_parameters Function
unload_library dtfft_utils Subroutine

Unloads library

unpack_f128 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks, complex(real64) version.

unpack_f128_block_16 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks.

unpack_f128_block_32 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks.

unpack_f128_block_64 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks.

unpack_f32 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks, real(real32) version.

unpack_f32_block_16 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks.

unpack_f32_block_32 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks.

unpack_f32_block_64 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks.

unpack_f64 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks, real(real64) version.

unpack_f64_block_16 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks.

unpack_f64_block_32 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks.

unpack_f64_block_64 dtfft_kernel_host Subroutine

Unpacks pack of contiguous buffer received from all ranks.

unpack_pipelined_f128 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank, complex(real64) version.

Read more…
unpack_pipelined_f128_block_16 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank.

Read more…
unpack_pipelined_f128_block_32 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank.

Read more…
unpack_pipelined_f128_block_64 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank.

Read more…
unpack_pipelined_f32 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank, real(real32) version.

Read more…
unpack_pipelined_f32_block_16 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank.

Read more…
unpack_pipelined_f32_block_32 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank.

Read more…
unpack_pipelined_f32_block_64 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank.

Read more…
unpack_pipelined_f64 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank, real(real64) version.

Read more…
unpack_pipelined_f64_block_16 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank.

Read more…
unpack_pipelined_f64_block_32 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank.

Read more…
unpack_pipelined_f64_block_64 dtfft_kernel_host Subroutine

Unpacks part of contiguous buffer received from a single rank.

Read more…
write_message dtfft_utils Subroutine

Write message to the specified unit

call~~graph~~CallGraph interface~aligned_alloc aligned_alloc interface~comm_f2c Comm_f2c interface~cudadevicesynchronize cudaDeviceSynchronize interface~cudaeventcreate cudaEventCreate interface~cudaeventcreatewithflags cudaEventCreateWithFlags interface~cudaeventdestroy cudaEventDestroy interface~cudaeventelapsedtime cudaEventElapsedTime interface~cudaeventrecord cudaEventRecord interface~cudaeventsynchronize cudaEventSynchronize interface~cudafree cudaFree interface~cudagetdevice cudaGetDevice interface~cudagetdevicecount cudaGetDeviceCount interface~cudageterrorstring_c cudaGetErrorString_c interface~cudagetlasterror cudaGetLastError interface~cudamalloc cudaMalloc interface~cudamemcpy cudaMemcpy interface~cudamemcpyasync cudaMemcpyAsync interface~cudamemgetinfo cudaMemGetInfo interface~cudamemset cudaMemset interface~cudasetdevice cudaSetDevice interface~cudastreamcreate cudaStreamCreate interface~cudastreamdestroy cudaStreamDestroy interface~cudastreamquery cudaStreamQuery interface~cudastreamsynchronize cudaStreamSynchronize interface~cudastreamwaitevent cudaStreamWaitEvent interface~cufftdestroy cufftDestroy interface~cufftmpattachreshapecomm cufftMpAttachReshapeComm interface~cufftmpcreatereshape cufftMpCreateReshape interface~cufftmpdestroyreshape cufftMpDestroyReshape interface~cufftmpexecreshapeasync cufftMpExecReshapeAsync interface~cufftmpgetreshapesize cufftMpGetReshapeSize interface~cufftmpmakereshape cufftMpMakeReshape interface~cufftplanmany cufftPlanMany interface~cufftsetstream cufftSetStream interface~cufftxtexec cufftXtExec interface~dftierrormessage_c DftiErrorMessage_c interface~dlclose dlclose interface~dlerror dlerror interface~dlopen dlopen interface~dlsym dlsym interface~dtfft_config_t dtfft_config_t proc~config_constructor config_constructor interface~dtfft_config_t->proc~config_constructor interface~dtfft_get_version dtfft_get_version proc~dtfft_get_version_current dtfft_get_version_current 
interface~dtfft_get_version->proc~dtfft_get_version_current proc~dtfft_get_version_required dtfft_get_version_required interface~dtfft_get_version->proc~dtfft_get_version_required interface~dtfft_pencil_t dtfft_pencil_t proc~create_pencil_t create_pencil_t interface~dtfft_pencil_t->proc~create_pencil_t interface~dtfft_stream_t dtfft_stream_t proc~stream_from_int64 stream_from_int64 interface~dtfft_stream_t->proc~stream_from_int64 interface~fftw_execute_dft fftw_execute_dft interface~fftw_execute_dft_c2r fftw_execute_dft_c2r interface~fftw_execute_dft_r2c fftw_execute_dft_r2c interface~fftw_execute_r2r fftw_execute_r2r interface~fftw_plan_many_dft fftw_plan_many_dft interface~fftw_plan_many_dft_c2r fftw_plan_many_dft_c2r interface~fftw_plan_many_dft_r2c fftw_plan_many_dft_r2c interface~fftw_plan_many_r2r fftw_plan_many_r2r interface~fftwf_execute_dft fftwf_execute_dft interface~fftwf_execute_dft_c2r fftwf_execute_dft_c2r interface~fftwf_execute_dft_r2c fftwf_execute_dft_r2c interface~fftwf_execute_r2r fftwf_execute_r2r interface~fftwf_plan_many_dft fftwf_plan_many_dft interface~fftwf_plan_many_dft_c2r fftwf_plan_many_dft_c2r interface~fftwf_plan_many_dft_r2c fftwf_plan_many_dft_r2c interface~fftwf_plan_many_r2r fftwf_plan_many_r2r interface~get_conf_internal get_conf_internal proc~get_conf_internal_int32 get_conf_internal_int32 interface~get_conf_internal->proc~get_conf_internal_int32 proc~get_conf_internal_logical get_conf_internal_logical interface~get_conf_internal->proc~get_conf_internal_logical interface~get_device_props get_device_props interface~get_env get_env proc~get_env_base get_env_base interface~get_env->proc~get_env_base proc~get_env_int32 get_env_int32 interface~get_env->proc~get_env_int32 proc~get_env_int8 get_env_int8 interface~get_env->proc~get_env_int8 proc~get_env_logical get_env_logical interface~get_env->proc~get_env_logical proc~get_env_string get_env_string interface~get_env->proc~get_env_string interface~init_nvshmem init_nvshmem 
interface~is_device_ptr is_device_ptr interface~is_null_ptr is_null_ptr interface~is_null_ptr->interface~is_null_ptr proc~is_null_funptr is_null_funptr interface~is_null_ptr->proc~is_null_funptr interface~mem_free_host mem_free_host interface~mkl_dfti_commit_desc mkl_dfti_commit_desc interface~mkl_dfti_create_desc mkl_dfti_create_desc interface~mkl_dfti_execute mkl_dfti_execute interface~mkl_dfti_free_desc mkl_dfti_free_desc interface~mkl_dfti_mem_alloc mkl_dfti_mem_alloc interface~mkl_dfti_mem_free mkl_dfti_mem_free interface~mkl_dfti_set_value mkl_dfti_set_value interface~ncclcommderegister ncclCommDeregister interface~ncclcommdestroy ncclCommDestroy interface~ncclcomminitrank ncclCommInitRank interface~ncclcommregister ncclCommRegister interface~ncclgeterrorstring_c ncclGetErrorString_c interface~ncclgetuniqueid ncclGetUniqueId interface~ncclgroupend ncclGroupEnd interface~ncclgroupstart ncclGroupStart interface~ncclmemalloc ncclMemAlloc interface~ncclmemfree ncclMemFree interface~ncclrecv ncclRecv interface~ncclsend ncclSend interface~nvshmem_finalize_ nvshmem_finalize_ interface~nvshmem_free nvshmem_free interface~nvshmem_malloc nvshmem_malloc interface~nvshmem_my_pe nvshmem_my_pe interface~nvshmem_ptr nvshmem_ptr interface~nvshmemx_float_alltoall_on_stream nvshmemx_float_alltoall_on_stream interface~nvshmemx_init_status nvshmemx_init_status interface~nvshmemx_sync_all_on_stream nvshmemx_sync_all_on_stream interface~nvtxdomaincreate_c nvtxDomainCreate_c interface~nvtxdomainrangepop_c nvtxDomainRangePop_c interface~nvtxdomainrangepushex_c nvtxDomainRangePushEx_c interface~operator(==) operator(==) proc~backend_eq backend_eq interface~operator(==)->proc~backend_eq proc~effort_eq effort_eq interface~operator(==)->proc~effort_eq proc~exec_eq exec_eq interface~operator(==)->proc~exec_eq proc~execute_type_eq execute_type_eq interface~operator(==)->proc~execute_type_eq proc~executor_eq executor_eq interface~operator(==)->proc~executor_eq proc~platform_eq platform_eq 
interface~operator(==)->proc~platform_eq proc~precision_eq precision_eq interface~operator(==)->proc~precision_eq proc~r2r_kind_eq r2r_kind_eq interface~operator(==)->proc~r2r_kind_eq proc~transpose_type_eq transpose_type_eq interface~operator(==)->proc~transpose_type_eq interface~operator(==)~2 operator(==) proc~kernel_type_eq kernel_type_eq interface~operator(==)~2->proc~kernel_type_eq interface~operator(==)~3 operator(==) proc~host_kernel_eq host_kernel_eq interface~operator(==)~3->proc~host_kernel_eq interface~operator(SLASH=) operator(/=) proc~backend_ne backend_ne interface~operator(SLASH=)->proc~backend_ne proc~effort_ne effort_ne interface~operator(SLASH=)->proc~effort_ne proc~execute_type_ne execute_type_ne interface~operator(SLASH=)->proc~execute_type_ne proc~executor_ne executor_ne interface~operator(SLASH=)->proc~executor_ne proc~platform_ne platform_ne interface~operator(SLASH=)->proc~platform_ne proc~precision_ne precision_ne interface~operator(SLASH=)->proc~precision_ne proc~r2r_kind_ne r2r_kind_ne interface~operator(SLASH=)->proc~r2r_kind_ne proc~transpose_type_ne transpose_type_ne interface~operator(SLASH=)->proc~transpose_type_ne interface~operator(SLASH=)~2 operator(/=) proc~kernel_type_ne kernel_type_ne interface~operator(SLASH=)~2->proc~kernel_type_ne interface~string string proc~string_constructor string_constructor interface~string->proc~string_constructor interface~to_str to_str proc~double_to_string double_to_string interface~to_str->proc~double_to_string proc~float_to_string float_to_string interface~to_str->proc~float_to_string proc~int32_to_string int32_to_string interface~to_str->proc~int32_to_string proc~int64_to_string int64_to_string interface~to_str->proc~int64_to_string proc~int8_to_string int8_to_string interface~to_str->proc~int8_to_string none~check nvrtc_module%check proc~check_instance nvrtc_module%check_instance none~check->proc~check_instance proc~check_module nvrtc_module%check_module none~check->proc~check_module 
none~create~12 dtfft_plan_c2c_t%create proc~create_c2c dtfft_plan_c2c_t%create_c2c none~create~12->proc~create_c2c proc~create_c2c_pencil dtfft_plan_c2c_t%create_c2c_pencil none~create~12->proc~create_c2c_pencil none~create~13 dtfft_plan_r2c_t%create proc~create_r2c dtfft_plan_r2c_t%create_r2c none~create~13->proc~create_r2c proc~create_r2c_pencil dtfft_plan_r2c_t%create_r2c_pencil none~create~13->proc~create_r2c_pencil none~create~14 dtfft_plan_r2r_t%create proc~create_r2r dtfft_plan_r2r_t%create_r2r none~create~14->proc~create_r2r proc~create_r2r_pencil dtfft_plan_r2r_t%create_r2r_pencil none~create~14->proc~create_r2r_pencil none~get_stream dtfft_plan_t%get_stream proc~get_stream_int64 dtfft_plan_t%get_stream_int64 none~get_stream->proc~get_stream_int64 proc~get_stream_ptr dtfft_plan_t%get_stream_ptr none~get_stream->proc~get_stream_ptr none~get_stream~2 dtfft_core_c2c%get_stream none~get_stream~2->proc~get_stream_int64 none~get_stream~2->proc~get_stream_ptr none~get_stream~3 dtfft_plan_r2r_t%get_stream none~get_stream~3->proc~get_stream_int64 none~get_stream~3->proc~get_stream_ptr none~get_stream~4 dtfft_plan_c2c_t%get_stream none~get_stream~4->proc~get_stream_int64 none~get_stream~4->proc~get_stream_ptr none~get_stream~5 dtfft_plan_r2c_t%get_stream none~get_stream~5->proc~get_stream_int64 none~get_stream~5->proc~get_stream_ptr none~mem_alloc~10 dtfft_plan_r2c_t%mem_alloc proc~mem_alloc_c32_1d dtfft_plan_t%mem_alloc_c32_1d none~mem_alloc~10->proc~mem_alloc_c32_1d proc~mem_alloc_c32_2d dtfft_plan_t%mem_alloc_c32_2d none~mem_alloc~10->proc~mem_alloc_c32_2d proc~mem_alloc_c32_3d dtfft_plan_t%mem_alloc_c32_3d none~mem_alloc~10->proc~mem_alloc_c32_3d proc~mem_alloc_c64_1d dtfft_plan_t%mem_alloc_c64_1d none~mem_alloc~10->proc~mem_alloc_c64_1d proc~mem_alloc_c64_2d dtfft_plan_t%mem_alloc_c64_2d none~mem_alloc~10->proc~mem_alloc_c64_2d proc~mem_alloc_c64_3d dtfft_plan_t%mem_alloc_c64_3d none~mem_alloc~10->proc~mem_alloc_c64_3d proc~mem_alloc_r32_1d 
dtfft_plan_t%mem_alloc_r32_1d none~mem_alloc~10->proc~mem_alloc_r32_1d proc~mem_alloc_r32_2d dtfft_plan_t%mem_alloc_r32_2d none~mem_alloc~10->proc~mem_alloc_r32_2d proc~mem_alloc_r32_3d dtfft_plan_t%mem_alloc_r32_3d none~mem_alloc~10->proc~mem_alloc_r32_3d proc~mem_alloc_r64_1d dtfft_plan_t%mem_alloc_r64_1d none~mem_alloc~10->proc~mem_alloc_r64_1d proc~mem_alloc_r64_2d dtfft_plan_t%mem_alloc_r64_2d none~mem_alloc~10->proc~mem_alloc_r64_2d proc~mem_alloc_r64_3d dtfft_plan_t%mem_alloc_r64_3d none~mem_alloc~10->proc~mem_alloc_r64_3d none~mem_alloc~11 dtfft_core_c2c%mem_alloc none~mem_alloc~11->proc~mem_alloc_c32_1d none~mem_alloc~11->proc~mem_alloc_c32_2d none~mem_alloc~11->proc~mem_alloc_c32_3d none~mem_alloc~11->proc~mem_alloc_c64_1d none~mem_alloc~11->proc~mem_alloc_c64_2d none~mem_alloc~11->proc~mem_alloc_c64_3d none~mem_alloc~11->proc~mem_alloc_r32_1d none~mem_alloc~11->proc~mem_alloc_r32_2d none~mem_alloc~11->proc~mem_alloc_r32_3d none~mem_alloc~11->proc~mem_alloc_r64_1d none~mem_alloc~11->proc~mem_alloc_r64_2d none~mem_alloc~11->proc~mem_alloc_r64_3d none~mem_alloc~7 dtfft_plan_t%mem_alloc none~mem_alloc~7->proc~mem_alloc_c32_1d none~mem_alloc~7->proc~mem_alloc_c32_2d none~mem_alloc~7->proc~mem_alloc_c32_3d none~mem_alloc~7->proc~mem_alloc_c64_1d none~mem_alloc~7->proc~mem_alloc_c64_2d none~mem_alloc~7->proc~mem_alloc_c64_3d none~mem_alloc~7->proc~mem_alloc_r32_1d none~mem_alloc~7->proc~mem_alloc_r32_2d none~mem_alloc~7->proc~mem_alloc_r32_3d none~mem_alloc~7->proc~mem_alloc_r64_1d none~mem_alloc~7->proc~mem_alloc_r64_2d none~mem_alloc~7->proc~mem_alloc_r64_3d none~mem_alloc~8 dtfft_plan_r2r_t%mem_alloc none~mem_alloc~8->proc~mem_alloc_c32_1d none~mem_alloc~8->proc~mem_alloc_c32_2d none~mem_alloc~8->proc~mem_alloc_c32_3d none~mem_alloc~8->proc~mem_alloc_c64_1d none~mem_alloc~8->proc~mem_alloc_c64_2d none~mem_alloc~8->proc~mem_alloc_c64_3d none~mem_alloc~8->proc~mem_alloc_r32_1d none~mem_alloc~8->proc~mem_alloc_r32_2d none~mem_alloc~8->proc~mem_alloc_r32_3d 
none~mem_alloc~8->proc~mem_alloc_r64_1d none~mem_alloc~8->proc~mem_alloc_r64_2d none~mem_alloc~8->proc~mem_alloc_r64_3d none~mem_alloc~9 dtfft_plan_c2c_t%mem_alloc none~mem_alloc~9->proc~mem_alloc_c32_1d none~mem_alloc~9->proc~mem_alloc_c32_2d none~mem_alloc~9->proc~mem_alloc_c32_3d none~mem_alloc~9->proc~mem_alloc_c64_1d none~mem_alloc~9->proc~mem_alloc_c64_2d none~mem_alloc~9->proc~mem_alloc_c64_3d none~mem_alloc~9->proc~mem_alloc_r32_1d none~mem_alloc~9->proc~mem_alloc_r32_2d none~mem_alloc~9->proc~mem_alloc_r32_3d none~mem_alloc~9->proc~mem_alloc_r64_1d none~mem_alloc~9->proc~mem_alloc_r64_2d none~mem_alloc~9->proc~mem_alloc_r64_3d none~mem_free~10 dtfft_plan_c2c_t%mem_free proc~mem_free_c32_1d dtfft_plan_t%mem_free_c32_1d none~mem_free~10->proc~mem_free_c32_1d proc~mem_free_c32_2d dtfft_plan_t%mem_free_c32_2d none~mem_free~10->proc~mem_free_c32_2d proc~mem_free_c32_3d dtfft_plan_t%mem_free_c32_3d none~mem_free~10->proc~mem_free_c32_3d proc~mem_free_c64_1d dtfft_plan_t%mem_free_c64_1d none~mem_free~10->proc~mem_free_c64_1d proc~mem_free_c64_2d dtfft_plan_t%mem_free_c64_2d none~mem_free~10->proc~mem_free_c64_2d proc~mem_free_c64_3d dtfft_plan_t%mem_free_c64_3d none~mem_free~10->proc~mem_free_c64_3d proc~mem_free_r32_1d dtfft_plan_t%mem_free_r32_1d none~mem_free~10->proc~mem_free_r32_1d proc~mem_free_r32_2d dtfft_plan_t%mem_free_r32_2d none~mem_free~10->proc~mem_free_r32_2d proc~mem_free_r32_3d dtfft_plan_t%mem_free_r32_3d none~mem_free~10->proc~mem_free_r32_3d proc~mem_free_r64_1d dtfft_plan_t%mem_free_r64_1d none~mem_free~10->proc~mem_free_r64_1d proc~mem_free_r64_2d dtfft_plan_t%mem_free_r64_2d none~mem_free~10->proc~mem_free_r64_2d proc~mem_free_r64_3d dtfft_plan_t%mem_free_r64_3d none~mem_free~10->proc~mem_free_r64_3d none~mem_free~11 dtfft_core_c2c%mem_free none~mem_free~11->proc~mem_free_c32_1d none~mem_free~11->proc~mem_free_c32_2d none~mem_free~11->proc~mem_free_c32_3d none~mem_free~11->proc~mem_free_c64_1d none~mem_free~11->proc~mem_free_c64_2d 
none~mem_free~11->proc~mem_free_c64_3d none~mem_free~11->proc~mem_free_r32_1d none~mem_free~11->proc~mem_free_r32_2d none~mem_free~11->proc~mem_free_r32_3d none~mem_free~11->proc~mem_free_r64_1d none~mem_free~11->proc~mem_free_r64_2d none~mem_free~11->proc~mem_free_r64_3d none~mem_free~7 dtfft_plan_t%mem_free none~mem_free~7->proc~mem_free_c32_1d none~mem_free~7->proc~mem_free_c32_2d none~mem_free~7->proc~mem_free_c32_3d none~mem_free~7->proc~mem_free_c64_1d none~mem_free~7->proc~mem_free_c64_2d none~mem_free~7->proc~mem_free_c64_3d none~mem_free~7->proc~mem_free_r32_1d none~mem_free~7->proc~mem_free_r32_2d none~mem_free~7->proc~mem_free_r32_3d none~mem_free~7->proc~mem_free_r64_1d none~mem_free~7->proc~mem_free_r64_2d none~mem_free~7->proc~mem_free_r64_3d none~mem_free~8 dtfft_plan_r2c_t%mem_free none~mem_free~8->proc~mem_free_c32_1d none~mem_free~8->proc~mem_free_c32_2d none~mem_free~8->proc~mem_free_c32_3d none~mem_free~8->proc~mem_free_c64_1d none~mem_free~8->proc~mem_free_c64_2d none~mem_free~8->proc~mem_free_c64_3d none~mem_free~8->proc~mem_free_r32_1d none~mem_free~8->proc~mem_free_r32_2d none~mem_free~8->proc~mem_free_r32_3d none~mem_free~8->proc~mem_free_r64_1d none~mem_free~8->proc~mem_free_r64_2d none~mem_free~8->proc~mem_free_r64_3d none~mem_free~9 dtfft_plan_r2r_t%mem_free none~mem_free~9->proc~mem_free_c32_1d none~mem_free~9->proc~mem_free_c32_2d none~mem_free~9->proc~mem_free_c32_3d none~mem_free~9->proc~mem_free_c64_1d none~mem_free~9->proc~mem_free_c64_2d none~mem_free~9->proc~mem_free_c64_3d none~mem_free~9->proc~mem_free_r32_1d none~mem_free~9->proc~mem_free_r32_2d none~mem_free~9->proc~mem_free_r32_3d none~mem_free~9->proc~mem_free_r64_1d none~mem_free~9->proc~mem_free_r64_2d none~mem_free~9->proc~mem_free_r64_3d proc~add nvrtc_module_cache%add proc~add->interface~to_str proc~create~9 nvrtc_module_cache%create proc~add->proc~create~9 proc~get_conf_log_enabled get_conf_log_enabled proc~add->proc~get_conf_log_enabled proc~write_message 
write_message proc~add->proc~write_message proc~add_line codegen_t%add_line proc~alloc_and_set_aux alloc_and_set_aux proc~alloc_mem alloc_mem proc~alloc_and_set_aux->proc~alloc_mem proc~dtfft_get_error_string dtfft_get_error_string proc~alloc_and_set_aux->proc~dtfft_get_error_string proc~get_aux_size_generic get_aux_size_generic proc~alloc_and_set_aux->proc~get_aux_size_generic mpi_abort mpi_abort proc~alloc_and_set_aux->mpi_abort mpi_allreduce mpi_allreduce proc~alloc_and_set_aux->mpi_allreduce proc~alloc_fft_plans dtfft_plan_t%alloc_fft_plans proc~alloc_mem->interface~cudamalloc proc~alloc_mem->interface~cudamemgetinfo proc~alloc_mem->interface~ncclcommregister proc~alloc_mem->interface~ncclmemalloc proc~alloc_mem->interface~nvshmem_malloc proc~alloc_mem->interface~to_str proc~cudageterrorstring cudaGetErrorString proc~alloc_mem->proc~cudageterrorstring proc~dtfft_get_backend_string dtfft_get_backend_string proc~alloc_mem->proc~dtfft_get_backend_string proc~alloc_mem->proc~get_conf_log_enabled proc~is_backend_nccl is_backend_nccl proc~alloc_mem->proc~is_backend_nccl proc~is_backend_nvshmem is_backend_nvshmem proc~alloc_mem->proc~is_backend_nvshmem proc~mem_alloc_host mem_alloc_host proc~alloc_mem->proc~mem_alloc_host proc~ncclgeterrorstring ncclGetErrorString proc~alloc_mem->proc~ncclgeterrorstring proc~alloc_mem->proc~write_message fname fname proc~alloc_mem->fname is_null_ptr is_null_ptr proc~alloc_mem->is_null_ptr proc~alloc_mem->mpi_abort proc~alloc_mem->mpi_allreduce temp temp proc~alloc_mem->temp proc~allocate_plans allocate_plans proc~astring_f2c astring_f2c proc~string_f2c string_f2c proc~astring_f2c->proc~string_f2c proc~autotune_grid autotune_grid proc~autotune_grid->interface~to_str proc~create_pencils_and_comm create_pencils_and_comm proc~autotune_grid->proc~create_pencils_and_comm proc~destroy~5 pencil%destroy proc~autotune_grid->proc~destroy~5 proc~autotune_grid->proc~get_conf_log_enabled proc~pop_nvtx_domain_range pop_nvtx_domain_range 
proc~autotune_grid->proc~pop_nvtx_domain_range proc~push_nvtx_domain_range push_nvtx_domain_range proc~autotune_grid->proc~push_nvtx_domain_range proc~run_autotune_backend run_autotune_backend proc~autotune_grid->proc~run_autotune_backend proc~autotune_grid->proc~write_message mpi_comm_free mpi_comm_free proc~autotune_grid->mpi_comm_free proc~autotune_grid_decomposition autotune_grid_decomposition proc~autotune_grid_decomposition->proc~autotune_grid mpi_comm_size mpi_comm_size proc~autotune_grid_decomposition->mpi_comm_size proc~autotune_transpose_id autotune_transpose_id proc~get_plan_execution_time get_plan_execution_time proc~autotune_transpose_id->proc~get_plan_execution_time proc~check_aux dtfft_plan_t%check_aux proc~check_aux->interface~to_str proc~check_aux->proc~dtfft_get_error_string proc~get_alloc_size dtfft_plan_t%get_alloc_size proc~check_aux->proc~get_alloc_size proc~check_aux->proc~get_conf_log_enabled proc~get_element_size dtfft_plan_t%get_element_size proc~check_aux->proc~get_element_size proc~mem_alloc_ptr dtfft_plan_t%mem_alloc_ptr proc~check_aux->proc~mem_alloc_ptr proc~check_aux->proc~write_message proc~check_aux->is_null_ptr proc~check_aux->mpi_abort proc~check_continuity check_continuity proc~check_create_args dtfft_plan_t%check_create_args proc~get_conf_backend get_conf_backend proc~check_create_args->proc~get_conf_backend proc~get_conf_platform get_conf_platform proc~check_create_args->proc~get_conf_platform proc~init_internal init_internal proc~check_create_args->proc~init_internal proc~check_create_args->proc~is_backend_nccl proc~check_create_args->proc~is_backend_nvshmem proc~is_cuda_executor is_cuda_executor proc~check_create_args->proc~is_cuda_executor proc~is_host_executor is_host_executor proc~check_create_args->proc~is_host_executor proc~is_valid_comm_type is_valid_comm_type proc~check_create_args->proc~is_valid_comm_type proc~is_valid_dimension is_valid_dimension proc~check_create_args->proc~is_valid_dimension proc~is_valid_effort 
is_valid_effort proc~check_create_args->proc~is_valid_effort proc~is_valid_executor is_valid_executor proc~check_create_args->proc~is_valid_executor proc~is_valid_precision is_valid_precision proc~check_create_args->proc~is_valid_precision proc~is_valid_r2r_kind is_valid_r2r_kind proc~check_create_args->proc~is_valid_r2r_kind mpi_topo_test mpi_topo_test proc~check_create_args->mpi_topo_test proc~check_device_pointers check_device_pointers proc~check_device_pointers->interface~is_device_ptr proc~check_device_pointers->proc~is_backend_nvshmem proc~is_nvshmem_ptr is_nvshmem_ptr proc~check_device_pointers->proc~is_nvshmem_ptr proc~check_device_pointers->is_null_ptr proc~check_if_even check_if_even mpi_allgather mpi_allgather proc~check_if_even->mpi_allgather proc~check_if_even->mpi_comm_size proc~check_if_overflow check_if_overflow proc~check_instance->none~check proc~check_overlap check_overlap proc~compare compare proc~compile_program compile_program proc~compile_program->interface~to_str proc~compile_program->proc~astring_f2c proc~destroy_strings destroy_strings proc~compile_program->proc~destroy_strings proc~compile_program->proc~mem_alloc_host proc~nvrtcgeterrorstring nvrtcGetErrorString proc~compile_program->proc~nvrtcgeterrorstring proc~set_name_expression set_name_expression proc~compile_program->proc~set_name_expression proc~string_c2f string_c2f proc~compile_program->proc~string_c2f proc~compile_program->fname proc~compile_program->mpi_abort proc~compute_alltoall_schedule backend_mpi%compute_alltoall_schedule proc~count_bank_conflicts count_bank_conflicts proc~count_unique count_unique proc~create transpose_plan%create proc~create->interface~to_str proc~create->proc~alloc_and_set_aux proc~create->proc~allocate_plans proc~create->proc~autotune_grid_decomposition proc~create_helper~2 backend_helper%create_helper proc~create->proc~create_helper~2 proc~create->proc~create_pencils_and_comm proc~create->proc~dtfft_get_backend_string 
proc~create->proc~get_conf_backend proc~get_conf_datatype_enabled get_conf_datatype_enabled proc~create->proc~get_conf_datatype_enabled proc~create->proc~get_conf_log_enabled proc~get_conf_mpi_enabled get_conf_mpi_enabled proc~create->proc~get_conf_mpi_enabled proc~get_conf_nccl_enabled get_conf_nccl_enabled proc~create->proc~get_conf_nccl_enabled proc~get_conf_nvshmem_enabled get_conf_nvshmem_enabled proc~create->proc~get_conf_nvshmem_enabled proc~get_conf_stream get_conf_stream proc~create->proc~get_conf_stream proc~get_conf_z_slab_enabled get_conf_z_slab_enabled proc~create->proc~get_conf_z_slab_enabled proc~get_datatype_from_env get_datatype_from_env proc~create->proc~get_datatype_from_env proc~get_local_sizes get_local_sizes proc~create->proc~get_local_sizes proc~create->proc~is_backend_nccl proc~load_cuda load_cuda proc~create->proc~load_cuda proc~load_nvrtc load_nvrtc proc~create->proc~load_nvrtc proc~create->proc~run_autotune_backend proc~create->proc~write_message mpi_cart_get mpi_cart_get proc~create->mpi_cart_get mpi_cartdim_get mpi_cartdim_get proc~create->mpi_cartdim_get proc~create->mpi_comm_size mpi_dims_create mpi_dims_create proc~create->mpi_dims_create proc~create->mpi_topo_test mpi_wtime mpi_wtime proc~create->mpi_wtime temp_coords temp_coords proc~create->temp_coords temp_dims temp_dims proc~create->temp_dims temp_periods temp_periods proc~create->temp_periods proc~create_1d_comm create_1d_comm proc~create_subcomm create_subcomm proc~create_1d_comm->proc~create_subcomm proc~get_varying_dim get_varying_dim proc~create_1d_comm->proc~get_varying_dim proc~sort_by_varying_dim sort_by_varying_dim proc~create_1d_comm->proc~sort_by_varying_dim proc~create_1d_comm->mpi_comm_size proc~create_back_permutation create_back_permutation proc~free_datatypes free_datatypes proc~create_back_permutation->proc~free_datatypes mpi_type_commit mpi_type_commit proc~create_back_permutation->mpi_type_commit mpi_type_contiguous mpi_type_contiguous 
proc~create_back_permutation->mpi_type_contiguous mpi_type_create_hvector mpi_type_create_hvector proc~create_back_permutation->mpi_type_create_hvector mpi_type_create_resized mpi_type_create_resized proc~create_back_permutation->mpi_type_create_resized mpi_type_vector mpi_type_vector proc~create_back_permutation->mpi_type_vector proc~create_c2c_internal dtfft_plan_c2c_t%create_c2c_internal proc~create_c2c->proc~create_c2c_internal proc~create_c2c->proc~dtfft_get_error_string proc~create_c2c->proc~write_message proc~create_c2c_core dtfft_core_c2c%create_c2c_core proc~create_private dtfft_plan_t%create_private proc~create_c2c_core->proc~create_private create create proc~create_c2c_core->create fft_mapping fft_mapping proc~create_c2c_core->fft_mapping pencils pencils proc~create_c2c_core->pencils proc~create_c2c_internal->proc~create_c2c_core proc~create_c2c_internal->proc~pop_nvtx_domain_range proc~create_c2c_internal->proc~push_nvtx_domain_range proc~create_c2c_pencil->proc~create_c2c_internal proc~create_c2c_pencil->proc~dtfft_get_error_string proc~create_c2c_pencil->proc~write_message proc~create_cart_comm create_cart_comm proc~create_cart_comm->interface~to_str proc~create_subcomm_include_all create_subcomm_include_all proc~create_cart_comm->proc~create_subcomm_include_all mpi_cart_create mpi_cart_create proc~create_cart_comm->mpi_cart_create mpi_cart_sub mpi_cart_sub proc~create_cart_comm->mpi_cart_sub mpi_comm_dup mpi_comm_dup proc~create_cart_comm->mpi_comm_dup proc~create_cart_comm->mpi_comm_free proc~create_data_handle data_handle%create_data_handle proc~create_data_handle->mpi_allgather proc~create_forw_permutation create_forw_permutation proc~create_forw_permutation->proc~free_datatypes proc~create_forw_permutation->mpi_type_commit proc~create_forw_permutation->mpi_type_contiguous proc~create_forw_permutation->mpi_type_create_hvector proc~create_forw_permutation->mpi_type_create_resized proc~create_forw_permutation->mpi_type_vector proc~create_handle 
handle_t%create_handle proc~destroy_handle handle_t%destroy_handle proc~create_handle->proc~destroy_handle proc~create_helper mpi_backend_helper%create_helper proc~create_helper~2->interface~get_env proc~create_helper~2->interface~ncclcomminitrank proc~create_helper~2->interface~ncclgetuniqueid proc~destroy_helper~2 backend_helper%destroy_helper proc~create_helper~2->proc~destroy_helper~2 proc~create_helper~2->proc~ncclgeterrorstring proc~create_helper~2->fname proc~create_helper~2->mpi_abort proc~create_helper~2->mpi_allgather mpi_bcast mpi_bcast proc~create_helper~2->mpi_bcast mpi_comm_rank mpi_comm_rank proc~create_helper~2->mpi_comm_rank proc~create_helper~2->mpi_comm_size proc~create_host kernel_host%create_host proc~create_host->interface~to_str proc~execute_benchmark kernel_host%execute_benchmark proc~create_host->proc~execute_benchmark proc~get_conf_configs_to_test get_conf_configs_to_test proc~create_host->proc~get_conf_configs_to_test proc~get_conf_forced_kernel_optimization get_conf_forced_kernel_optimization proc~create_host->proc~get_conf_forced_kernel_optimization proc~get_conf_kernel_optimization_enabled get_conf_kernel_optimization_enabled proc~create_host->proc~get_conf_kernel_optimization_enabled proc~create_host->proc~get_conf_log_enabled proc~get_conf_measure_iters get_conf_measure_iters proc~create_host->proc~get_conf_measure_iters proc~get_conf_measure_warmup_iters get_conf_measure_warmup_iters proc~create_host->proc~get_conf_measure_warmup_iters proc~get_host_kernel_string get_host_kernel_string proc~create_host->proc~get_host_kernel_string proc~is_unpack_kernel is_unpack_kernel proc~create_host->proc~is_unpack_kernel proc~create_host->proc~pop_nvtx_domain_range proc~create_host->proc~push_nvtx_domain_range proc~select_access_mode_f128 kernel_host%select_access_mode_f128 proc~create_host->proc~select_access_mode_f128 proc~select_access_mode_f32 kernel_host%select_access_mode_f32 proc~create_host->proc~select_access_mode_f32 
proc~select_access_mode_f64 kernel_host%select_access_mode_f64 proc~create_host->proc~select_access_mode_f64 proc~select_kernel select_kernel proc~create_host->proc~select_kernel proc~create_host->proc~write_message proc~create_mpi backend_mpi%create_mpi proc~create_mpi->proc~compute_alltoall_schedule proc~create_mpi->proc~create_helper proc~is_backend_mpi is_backend_mpi proc~create_mpi->proc~is_backend_mpi proc~create_nccl backend_nccl%create_nccl proc~create_nccl->proc~is_backend_nccl proc~create_nvrtc_module create_nvrtc_module proc~create_nvrtc_module->interface~to_str proc~create_nvrtc_module->none~check proc~create_nvrtc_module->proc~add proc~create~13 nvrtc_module%create proc~create_nvrtc_module->proc~create~13 proc~create_nvrtc_module->proc~get_conf_log_enabled proc~get_kernel_string get_kernel_string proc~create_nvrtc_module->proc~get_kernel_string proc~create_nvrtc_module->proc~write_message proc~create_nvtx_domain create_nvtx_domain proc~create_nvtx_domain->interface~nvtxdomaincreate_c proc~create_nvtx_domain->proc~astring_f2c proc~create_pencil_init pencil_init%create_pencil_init proc~create_pencil_init->proc~check_continuity proc~create_pencil_init->proc~check_overlap proc~create_pencil_init->proc~create_1d_comm proc~create_pencil_init->proc~dtfft_get_error_string proc~create_pencil_init->proc~write_message proc~create_pencil_init->mpi_allgather proc~create_pencil_init->mpi_allreduce proc~create_pencil_init->mpi_comm_rank proc~create_pencil_init->mpi_comm_size proc~destroy_pencil_t_private dtfft_pencil_t%destroy_pencil_t_private proc~create_pencil_t->proc~destroy_pencil_t_private proc~create_pencils_and_comm->proc~create_cart_comm proc~create~5 pencil%create proc~create_pencils_and_comm->proc~create~5 lcounts lcounts proc~create_pencils_and_comm->lcounts lstarts lstarts proc~create_pencils_and_comm->lstarts proc~create_private->interface~cudagetdevice proc~create_private->interface~cudagetdevicecount proc~create_private->proc~alloc_fft_plans 
proc~create_private->proc~check_create_args proc~create_private->proc~count_unique proc~create_private->proc~cudageterrorstring proc~create_private->proc~destroy~5 proc~create_private->proc~get_conf_stream proc~get_conf_y_slab_enabled get_conf_y_slab_enabled proc~create_private->proc~get_conf_y_slab_enabled proc~get_z_slab transpose_plan%get_z_slab proc~create_private->proc~get_z_slab counts counts proc~create_private->counts proc~create_private->create fixed_dims fixed_dims proc~create_private->fixed_dims proc~create_private->fname local_devices local_devices proc~create_private->local_devices proc~create_private->mpi_abort proc~create_private->mpi_allgather proc~create_private->mpi_comm_free proc~create_private->mpi_comm_rank proc~create_private->mpi_comm_size mpi_comm_split_type mpi_comm_split_type proc~create_private->mpi_comm_split_type proc~create_r2c_internal dtfft_plan_r2c_t%create_r2c_internal proc~create_r2c->proc~create_r2c_internal proc~create_r2c->proc~dtfft_get_error_string proc~create_r2c->proc~write_message proc~create_r2c_internal->proc~create_c2c_core proc~create_r2c_internal->proc~create~5 proc~create_r2c_internal->proc~pop_nvtx_domain_range proc~create_r2c_internal->proc~push_nvtx_domain_range proc~create_r2c_internal->pencils proc~create_r2c_pencil->proc~create_r2c_internal proc~create_r2c_pencil->proc~dtfft_get_error_string proc~create_r2c_pencil->proc~write_message proc~create_r2r_internal dtfft_plan_r2r_t%create_r2r_internal proc~create_r2r->proc~create_r2r_internal proc~create_r2r->proc~dtfft_get_error_string proc~create_r2r->proc~write_message proc~create_r2r_internal->proc~create_private proc~create_r2r_internal->proc~pop_nvtx_domain_range proc~create_r2r_internal->proc~push_nvtx_domain_range proc~create_r2r_internal->create proc~create_r2r_internal->fft_mapping proc~create_r2r_internal->pencils proc~create_r2r_pencil->proc~create_r2r_internal proc~create_r2r_pencil->proc~dtfft_get_error_string proc~create_r2r_pencil->proc~write_message 
mpi_comm_create mpi_comm_create proc~create_subcomm->mpi_comm_create mpi_comm_group mpi_comm_group proc~create_subcomm->mpi_comm_group mpi_group_free mpi_group_free proc~create_subcomm->mpi_group_free mpi_group_incl mpi_group_incl proc~create_subcomm->mpi_group_incl proc~create_subcomm_include_all->proc~create_subcomm proc~create_subcomm_include_all->mpi_comm_size proc~create_transpose_2d create_transpose_2d proc~create_transpose_2d->proc~free_datatypes proc~create_transpose_2d->mpi_type_commit proc~create_transpose_2d->mpi_type_contiguous proc~create_transpose_2d->mpi_type_create_resized proc~create_transpose_2d->mpi_type_vector proc~create_transpose_xz create_transpose_XZ proc~create_transpose_xz->proc~free_datatypes proc~create_transpose_xz->mpi_type_commit proc~create_transpose_xz->mpi_type_contiguous proc~create_transpose_xz->mpi_type_create_hvector proc~create_transpose_xz->mpi_type_create_resized proc~create_transpose_xz->mpi_type_vector proc~create_transpose_zx create_transpose_ZX proc~create_transpose_zx->proc~free_datatypes proc~create_transpose_zx->mpi_type_commit proc~create_transpose_zx->mpi_type_contiguous proc~create_transpose_zx->mpi_type_create_hvector proc~create_transpose_zx->mpi_type_create_resized proc~create_transpose_zx->mpi_type_vector proc~create~10 abstract_transpose_handle%create proc~get_transpose_type get_transpose_type proc~create~10->proc~get_transpose_type create_private create_private proc~create~10->create_private proc~create~11 mkl_executor%create proc~make_plan make_plan proc~create~11->proc~make_plan proc~create~12 cufft_executor%create proc~create~12->interface~cufftplanmany proc~create~12->interface~cufftsetstream proc~cufftgeterrorstring cufftGetErrorString proc~create~12->proc~cufftgeterrorstring proc~create~12->proc~get_conf_stream proc~create~12->fname proc~create~12->mpi_abort proc~create~13->interface~mem_free_host proc~create~13->proc~compile_program proc~create~13->proc~cudageterrorstring proc~destroy~11 
nvrtc_module%destroy proc~create~13->proc~destroy~11 proc~get_code get_code proc~create~13->proc~get_code proc~create~13->proc~get_conf_log_enabled proc~create~13->proc~mem_alloc_host proc~create~13->proc~nvrtcgeterrorstring proc~create~13->proc~pop_nvtx_domain_range proc~create~13->proc~push_nvtx_domain_range proc~create~13->proc~write_message proc~create~13->fname proc~create~13->mpi_abort proc~create~14 vkfft_executor%create proc~create~14->proc~get_conf_platform proc~create~14->proc~get_conf_stream proc~load_vkfft load_vkfft proc~create~14->proc~load_vkfft proc~create~15 abstract_backend%create proc~create~15->interface~cudaeventcreatewithflags proc~create~15->interface~cudastreamcreate proc~create~15->proc~cudageterrorstring proc~create~15->proc~is_backend_mpi proc~is_backend_pipelined is_backend_pipelined proc~create~15->proc~is_backend_pipelined proc~create~15->create_private proc~create~15->fname proc~create~15->mpi_abort proc~create~15->mpi_comm_rank proc~create~15->mpi_comm_size proc~create~16 transpose_handle_datatype%create proc~create~16->proc~create_back_permutation proc~create~16->proc~create_forw_permutation proc~create~16->proc~create_handle proc~create~16->proc~create_transpose_2d proc~create~16->proc~create_transpose_xz proc~create~16->proc~create_transpose_zx proc~destroy~15 transpose_handle_datatype%destroy proc~create~16->proc~destroy~15 proc~create~16->mpi_allgather proc~create~16->mpi_comm_size proc~create~2 abstract_kernel%create proc~destroy~2 abstract_kernel%destroy proc~create~2->proc~destroy~2 proc~create~2->proc~get_kernel_string proc~create~2->proc~is_unpack_kernel proc~create~2->create_private proc~create~3 transpose_handle_generic%create proc~create~3->proc~check_if_overflow proc~create~3->proc~create~2 proc~destroy~3 transpose_handle_generic%destroy proc~create~3->proc~destroy~3 proc~is_backend_cufftmp is_backend_cufftmp proc~create~3->proc~is_backend_cufftmp proc~create~3->proc~is_backend_mpi proc~create~3->proc~is_backend_nccl 
proc~create~3->proc~is_backend_pipelined proc~set_unpack_kernel abstract_backend%set_unpack_kernel proc~create~3->proc~set_unpack_kernel proc~create~3->mpi_comm_rank proc~create~3->mpi_comm_size mpi_irecv mpi_irecv proc~create~3->mpi_irecv mpi_isend mpi_isend proc~create~3->mpi_isend mpi_wait mpi_wait proc~create~3->mpi_wait proc~create~4 backend_cufftmp%create proc~create~4->interface~comm_f2c proc~create~4->interface~cufftmpattachreshapecomm proc~create~4->interface~cufftmpcreatereshape proc~create~4->interface~cufftmpgetreshapesize proc~create~4->interface~cufftmpmakereshape proc~create~4->proc~cufftgeterrorstring proc~create~4->fname proc~create~4->mpi_abort proc~create~5->proc~check_if_even proc~create~5->proc~destroy~5 proc~get_local_size get_local_size proc~create~5->proc~get_local_size proc~create~6 abstract_executor%create proc~create~6->interface~to_str proc~create~6->proc~pop_nvtx_domain_range proc~create~6->proc~push_nvtx_domain_range proc~create~6->create_private proc~create~6->is_null_ptr proc~create~7 fftw_executor%create proc~get_inverse_kind get_inverse_kind proc~create~7->proc~get_inverse_kind proc~mem_alloc~2 fftw_executor%mem_alloc proc~create~7->proc~mem_alloc~2 proc~mem_free~2 fftw_executor%mem_free proc~create~7->proc~mem_free~2 constructor constructor proc~create~7->constructor constructor_inverse constructor_inverse proc~create~7->constructor_inverse inverse_kinds inverse_kinds proc~create~7->inverse_kinds knds knds proc~create~7->knds proc~create~8 kernel_device%create proc~create~8->interface~cudagetdevice proc~create~8->interface~get_device_props proc~create~8->proc~cudageterrorstring proc~create~8->proc~destroy~2 proc~get_kernel get_kernel proc~create~8->proc~get_kernel proc~create~8->fname proc~create~8->mpi_abort proc~cudageterrorstring->interface~cudageterrorstring_c proc~cudageterrorstring->proc~string_c2f proc~culaunchkernel cuLaunchKernel proc~destroy transpose_plan%destroy proc~destroy->proc~destroy_helper~2 proc~destroy_plans 
destroy_plans proc~destroy->proc~destroy_plans proc~mem_free transpose_plan%mem_free proc~destroy->proc~mem_free proc~destroy_data_handle data_handle%destroy_data_handle mpi_type_free mpi_type_free proc~destroy_handle->mpi_type_free proc~destroy_helper mpi_backend_helper%destroy_helper mpi_request_free mpi_request_free proc~destroy_helper->mpi_request_free proc~destroy_helper~2->interface~ncclcommdestroy proc~destroy_helper~2->proc~ncclgeterrorstring proc~destroy_helper~2->proc~write_message proc~destroy_helper~2->fname proc~destroy_helper~2->mpi_abort proc~destroy_host kernel_host%destroy_host proc~destroy_mpi backend_mpi%destroy_mpi proc~destroy_mpi->proc~destroy_helper proc~destroy_nccl backend_nccl%destroy_nccl proc~destroy_pencil_init pencil_init%destroy_pencil_init proc~destroy_pencil_init->mpi_comm_free proc~destroy_pencil_t destroy_pencil_t proc~destroy_pencil_t->proc~destroy_pencil_t_private destroy destroy proc~destroy_plans->destroy proc~destroy_stream destroy_stream proc~destroy_stream->interface~cudastreamdestroy proc~destroy_stream->proc~cudageterrorstring proc~destroy_stream->fname proc~destroy_stream->mpi_abort proc~destroy_string string%destroy_string proc~destroy_strings->proc~destroy_string proc~destroy~10 cufft_executor%destroy proc~destroy~10->interface~cufftdestroy proc~destroy~10->proc~cufftgeterrorstring proc~destroy~10->fname proc~destroy~10->mpi_abort proc~destroy~11->proc~cudageterrorstring proc~destroy~11->proc~nvrtcgeterrorstring proc~destroy~11->fname proc~destroy~11->is_null_ptr proc~destroy~11->mpi_abort proc~destroy~12 vkfft_executor%destroy proc~destroy~13 dtfft_plan_t%destroy proc~destroy~13->proc~destroy_stream proc~destroy~13->proc~dtfft_get_error_string proc~mem_free_ptr dtfft_plan_t%mem_free_ptr proc~destroy~13->proc~mem_free_ptr proc~destroy~13->proc~pop_nvtx_domain_range proc~destroy~13->proc~push_nvtx_domain_range proc~destroy~13->proc~write_message proc~destroy~13->destroy proc~destroy~13->mpi_comm_free mpi_finalized 
mpi_finalized proc~destroy~13->mpi_finalized proc~destroy~14 abstract_backend%destroy proc~destroy~14->interface~cudaeventdestroy proc~destroy~14->interface~cudastreamdestroy proc~destroy~14->proc~cudageterrorstring destroy_private destroy_private proc~destroy~14->destroy_private proc~destroy~14->fname proc~destroy~14->mpi_abort proc~destroy~15->proc~destroy_handle proc~destroy~15->mpi_request_free proc~destroy~2->destroy_private proc~destroy~3->proc~destroy~2 proc~destroy~4 backend_cufftmp%destroy proc~destroy~4->interface~cufftmpdestroyreshape proc~destroy~4->proc~cufftgeterrorstring proc~destroy~4->fname proc~destroy~4->mpi_abort proc~destroy~6 abstract_executor%destroy proc~destroy~6->destroy_private proc~destroy~7 fftw_executor%destroy proc~destroy~8 kernel_device%destroy proc~destroy~9 mkl_executor%destroy proc~destroy~9->interface~mkl_dfti_free_desc proc~destroy~9->interface~to_str proc~dftierrormessage DftiErrorMessage proc~destroy~9->proc~dftierrormessage proc~destroy~9->mpi_abort proc~dftierrormessage->interface~dftierrormessage_c proc~dftierrormessage->proc~string_c2f proc~dl_error dl_error proc~dl_error->interface~dlerror proc~dl_error->proc~string_c2f proc~dl_error->proc~write_message proc~dtfft_create_config dtfft_create_config proc~dtfft_create_plan_c2c_c dtfft_create_plan_c2c_c proc~get_comm get_comm proc~dtfft_create_plan_c2c_c->proc~get_comm proc~dtfft_create_plan_c2c_c->create proc~dtfft_create_plan_c2c_pencil_c dtfft_create_plan_c2c_pencil_c proc~dtfft_create_plan_c2c_pencil_c->proc~get_comm proc~pencil_c2f pencil_c2f proc~dtfft_create_plan_c2c_pencil_c->proc~pencil_c2f proc~dtfft_create_plan_c2c_pencil_c->create proc~dtfft_create_plan_r2r_c dtfft_create_plan_r2r_c proc~dtfft_create_plan_r2r_c->proc~get_comm proc~dtfft_create_plan_r2r_c->create proc~dtfft_create_plan_r2r_c->is_null_ptr proc~dtfft_create_plan_r2r_pencil_c dtfft_create_plan_r2r_pencil_c proc~dtfft_create_plan_r2r_pencil_c->proc~get_comm 
proc~dtfft_create_plan_r2r_pencil_c->proc~pencil_c2f proc~dtfft_create_plan_r2r_pencil_c->create proc~dtfft_create_plan_r2r_pencil_c->is_null_ptr proc~dtfft_destroy_c dtfft_destroy_c proc~dtfft_destroy_c->proc~destroy~13 proc~dtfft_destroy_c->is_null_ptr proc~dtfft_execute_c dtfft_execute_c proc~execute_ptr dtfft_plan_t%execute_ptr proc~dtfft_execute_c->proc~execute_ptr proc~dtfft_execute_c->is_null_ptr proc~dtfft_get_alloc_bytes_c dtfft_get_alloc_bytes_c proc~get_alloc_bytes dtfft_plan_t%get_alloc_bytes proc~dtfft_get_alloc_bytes_c->proc~get_alloc_bytes proc~dtfft_get_alloc_bytes_c->is_null_ptr proc~dtfft_get_alloc_size_c dtfft_get_alloc_size_c proc~dtfft_get_alloc_size_c->proc~get_alloc_size proc~dtfft_get_alloc_size_c->is_null_ptr proc~dtfft_get_backend_c dtfft_get_backend_c proc~get_backend~2 dtfft_plan_t%get_backend proc~dtfft_get_backend_c->proc~get_backend~2 proc~dtfft_get_backend_c->is_null_ptr proc~dtfft_get_backend_string_c dtfft_get_backend_string_c proc~dtfft_get_backend_string_c->proc~dtfft_get_backend_string proc~dtfft_get_backend_string_c->proc~string_f2c proc~dtfft_get_cuda_stream dtfft_get_cuda_stream proc~dtfft_get_dims_c dtfft_get_dims_c proc~get_dims dtfft_plan_t%get_dims proc~dtfft_get_dims_c->proc~get_dims proc~dtfft_get_dims_c->is_null_ptr proc~dtfft_get_element_size_c dtfft_get_element_size_c proc~dtfft_get_element_size_c->proc~get_element_size proc~dtfft_get_element_size_c->is_null_ptr proc~dtfft_get_error_string_c dtfft_get_error_string_c proc~dtfft_get_error_string_c->proc~dtfft_get_error_string proc~dtfft_get_error_string_c->proc~string_f2c proc~dtfft_get_executor_c dtfft_get_executor_c proc~get_executor dtfft_plan_t%get_executor proc~dtfft_get_executor_c->proc~get_executor proc~dtfft_get_executor_c->is_null_ptr proc~dtfft_get_executor_string dtfft_get_executor_string proc~dtfft_get_executor_string_c dtfft_get_executor_string_c proc~dtfft_get_executor_string_c->proc~dtfft_get_executor_string 
proc~dtfft_get_executor_string_c->proc~string_f2c proc~dtfft_get_grid_dims_c dtfft_get_grid_dims_c proc~get_grid_dims dtfft_plan_t%get_grid_dims proc~dtfft_get_grid_dims_c->proc~get_grid_dims proc~dtfft_get_grid_dims_c->is_null_ptr proc~dtfft_get_local_sizes_c dtfft_get_local_sizes_c proc~get_local_sizes~2 dtfft_plan_t%get_local_sizes proc~dtfft_get_local_sizes_c->proc~get_local_sizes~2 proc~dtfft_get_local_sizes_c->is_null_ptr proc~dtfft_get_pencil_c dtfft_get_pencil_c proc~get_pencil dtfft_plan_t%get_pencil proc~dtfft_get_pencil_c->proc~get_pencil proc~pencil_f2c pencil_f2c proc~dtfft_get_pencil_c->proc~pencil_f2c proc~dtfft_get_pencil_c->is_null_ptr proc~dtfft_get_platform_c dtfft_get_platform_c proc~get_platform dtfft_plan_t%get_platform proc~dtfft_get_platform_c->proc~get_platform proc~dtfft_get_platform_c->is_null_ptr proc~dtfft_get_precision_c dtfft_get_precision_c proc~get_precision dtfft_plan_t%get_precision proc~dtfft_get_precision_c->proc~get_precision proc~dtfft_get_precision_c->is_null_ptr proc~dtfft_get_precision_string dtfft_get_precision_string proc~dtfft_get_precision_string_c dtfft_get_precision_string_c proc~dtfft_get_precision_string_c->proc~dtfft_get_precision_string proc~dtfft_get_precision_string_c->proc~string_f2c proc~dtfft_get_stream_c dtfft_get_stream_c proc~dtfft_get_stream_c->none~get_stream proc~dtfft_get_stream_c->is_null_ptr proc~dtfft_get_y_slab_enabled_c dtfft_get_y_slab_enabled_c proc~get_y_slab_enabled dtfft_plan_t%get_y_slab_enabled proc~dtfft_get_y_slab_enabled_c->proc~get_y_slab_enabled proc~dtfft_get_y_slab_enabled_c->is_null_ptr proc~dtfft_get_z_slab_enabled_c dtfft_get_z_slab_enabled_c proc~get_z_slab_enabled dtfft_plan_t%get_z_slab_enabled proc~dtfft_get_z_slab_enabled_c->proc~get_z_slab_enabled proc~dtfft_get_z_slab_enabled_c->is_null_ptr proc~dtfft_mem_alloc_c dtfft_mem_alloc_c proc~dtfft_mem_alloc_c->proc~mem_alloc_ptr proc~dtfft_mem_alloc_c->is_null_ptr proc~dtfft_mem_free_c dtfft_mem_free_c 
proc~dtfft_mem_free_c->proc~mem_free_ptr proc~dtfft_mem_free_c->is_null_ptr proc~dtfft_report_c dtfft_report_c proc~report dtfft_plan_t%report proc~dtfft_report_c->proc~report proc~dtfft_report_c->is_null_ptr proc~dtfft_set_config dtfft_set_config proc~dtfft_set_config->interface~cudastreamquery proc~get_correct_backend get_correct_backend proc~dtfft_set_config->proc~get_correct_backend proc~dtfft_set_config->proc~init_internal proc~is_valid_backend is_valid_backend proc~dtfft_set_config->proc~is_valid_backend proc~is_valid_platform is_valid_platform proc~dtfft_set_config->proc~is_valid_platform proc~dtfft_set_config->is_null_ptr proc~dtfft_set_config_c dtfft_set_config_c proc~dtfft_set_config_c->proc~dtfft_set_config proc~dtfft_transpose_c dtfft_transpose_c proc~transpose_ptr dtfft_plan_t%transpose_ptr proc~dtfft_transpose_c->proc~transpose_ptr proc~dtfft_transpose_c->is_null_ptr proc~dtfft_transpose_end_c dtfft_transpose_end_c proc~transpose_end dtfft_plan_t%transpose_end proc~dtfft_transpose_end_c->proc~transpose_end proc~dtfft_transpose_end_c->is_null_ptr proc~dtfft_transpose_start_c dtfft_transpose_start_c proc~transpose_start_ptr dtfft_plan_t%transpose_start_ptr proc~dtfft_transpose_start_c->proc~transpose_start_ptr proc~dtfft_transpose_start_c->is_null_ptr proc~dynamic_load dynamic_load proc~dynamic_load->interface~is_null_ptr proc~load_library load_library proc~dynamic_load->proc~load_library proc~load_symbol load_symbol proc~dynamic_load->proc~load_symbol proc~unload_library unload_library proc~dynamic_load->proc~unload_library proc~estimate_bank_conflict_ratio estimate_bank_conflict_ratio proc~estimate_bank_conflict_ratio->proc~count_bank_conflicts proc~estimate_coalescing estimate_coalescing proc~estimate_memory_pressure estimate_memory_pressure proc~estimate_occupancy estimate_occupancy proc~estimate_optimal_padding estimate_optimal_padding proc~estimate_optimal_padding->proc~count_bank_conflicts proc~evaluate_analytical_performance 
evaluate_analytical_performance proc~evaluate_analytical_performance->proc~count_bank_conflicts proc~evaluate_analytical_performance->proc~estimate_bank_conflict_ratio proc~evaluate_analytical_performance->proc~estimate_coalescing proc~evaluate_analytical_performance->proc~estimate_occupancy proc~execute transpose_plan%execute proc~execute->proc~pop_nvtx_domain_range proc~execute->proc~push_nvtx_domain_range execute execute proc~execute->execute proc~execute_2d dtfft_plan_t%execute_2d proc~execute_2d->proc~execute proc~execute_a2a backend_mpi%execute_a2a mpi_alltoall_init mpi_alltoall_init proc~execute_a2a->mpi_alltoall_init mpi_alltoallv_init mpi_alltoallv_init proc~execute_a2a->mpi_alltoallv_init mpi_start mpi_start proc~execute_a2a->mpi_start proc~execute_benchmark->proc~pop_nvtx_domain_range proc~execute_benchmark->proc~push_nvtx_domain_range proc~execute_end transpose_plan%execute_end execute_end execute_end proc~execute_end->execute_end proc~execute_end_mpi backend_mpi%execute_end_mpi mpi_waitall mpi_waitall proc~execute_end_mpi->mpi_waitall mpi_win_fence mpi_win_fence proc~execute_end_mpi->mpi_win_fence proc~execute_end~2 transpose_handle_generic%execute_end proc~execute_end~3 abstract_backend%execute_end proc~execute_end~2->proc~execute_end~3 proc~execute~2 abstract_kernel%execute proc~execute_end~2->proc~execute~2 proc~execute_end~4 transpose_handle_datatype%execute_end proc~execute_end~4->mpi_waitall proc~execute_f128 execute_f128 proc~permute_backward_end_pipelined_read_f128 permute_backward_end_pipelined_read_f128 proc~execute_f128->proc~permute_backward_end_pipelined_read_f128 proc~permute_backward_end_pipelined_write_f128 permute_backward_end_pipelined_write_f128 proc~execute_f128->proc~permute_backward_end_pipelined_write_f128 proc~permute_backward_end_read_f128 permute_backward_end_read_f128 proc~execute_f128->proc~permute_backward_end_read_f128 proc~permute_backward_end_write_f128 permute_backward_end_write_f128 
proc~execute_f128->proc~permute_backward_end_write_f128 proc~permute_backward_read_f128 permute_backward_read_f128 proc~execute_f128->proc~permute_backward_read_f128 proc~permute_backward_start_read_f128 permute_backward_start_read_f128 proc~execute_f128->proc~permute_backward_start_read_f128 proc~permute_backward_start_write_f128 permute_backward_start_write_f128 proc~execute_f128->proc~permute_backward_start_write_f128 proc~permute_backward_write_f128 permute_backward_write_f128 proc~execute_f128->proc~permute_backward_write_f128 proc~permute_forward_read_f128 permute_forward_read_f128 proc~execute_f128->proc~permute_forward_read_f128 proc~permute_forward_write_f128 permute_forward_write_f128 proc~execute_f128->proc~permute_forward_write_f128 proc~unpack_f128 unpack_f128 proc~execute_f128->proc~unpack_f128 proc~unpack_pipelined_f128 unpack_pipelined_f128 proc~execute_f128->proc~unpack_pipelined_f128 proc~execute_f128_block_16 execute_f128_block_16 proc~permute_backward_end_pipelined_read_f128_block_16 permute_backward_end_pipelined_read_f128_block_16 proc~execute_f128_block_16->proc~permute_backward_end_pipelined_read_f128_block_16 proc~permute_backward_end_pipelined_write_f128_block_16 permute_backward_end_pipelined_write_f128_block_16 proc~execute_f128_block_16->proc~permute_backward_end_pipelined_write_f128_block_16 proc~permute_backward_end_read_f128_block_16 permute_backward_end_read_f128_block_16 proc~execute_f128_block_16->proc~permute_backward_end_read_f128_block_16 proc~permute_backward_end_write_f128_block_16 permute_backward_end_write_f128_block_16 proc~execute_f128_block_16->proc~permute_backward_end_write_f128_block_16 proc~permute_backward_read_f128_block_16 permute_backward_read_f128_block_16 proc~execute_f128_block_16->proc~permute_backward_read_f128_block_16 proc~permute_backward_start_read_f128_block_16 permute_backward_start_read_f128_block_16 proc~execute_f128_block_16->proc~permute_backward_start_read_f128_block_16 
proc~permute_backward_start_write_f128_block_16 permute_backward_start_write_f128_block_16 proc~execute_f128_block_16->proc~permute_backward_start_write_f128_block_16 proc~permute_backward_write_f128_block_16 permute_backward_write_f128_block_16 proc~execute_f128_block_16->proc~permute_backward_write_f128_block_16 proc~permute_forward_read_f128_block_16 permute_forward_read_f128_block_16 proc~execute_f128_block_16->proc~permute_forward_read_f128_block_16 proc~permute_forward_write_f128_block_16 permute_forward_write_f128_block_16 proc~execute_f128_block_16->proc~permute_forward_write_f128_block_16 proc~unpack_f128_block_16 unpack_f128_block_16 proc~execute_f128_block_16->proc~unpack_f128_block_16 proc~unpack_pipelined_f128_block_16 unpack_pipelined_f128_block_16 proc~execute_f128_block_16->proc~unpack_pipelined_f128_block_16 proc~execute_f128_block_32 execute_f128_block_32 proc~permute_backward_end_pipelined_read_f128_block_32 permute_backward_end_pipelined_read_f128_block_32 proc~execute_f128_block_32->proc~permute_backward_end_pipelined_read_f128_block_32 proc~permute_backward_end_pipelined_write_f128_block_32 permute_backward_end_pipelined_write_f128_block_32 proc~execute_f128_block_32->proc~permute_backward_end_pipelined_write_f128_block_32 proc~permute_backward_end_read_f128_block_32 permute_backward_end_read_f128_block_32 proc~execute_f128_block_32->proc~permute_backward_end_read_f128_block_32 proc~permute_backward_end_write_f128_block_32 permute_backward_end_write_f128_block_32 proc~execute_f128_block_32->proc~permute_backward_end_write_f128_block_32 proc~permute_backward_read_f128_block_32 permute_backward_read_f128_block_32 proc~execute_f128_block_32->proc~permute_backward_read_f128_block_32 proc~permute_backward_start_read_f128_block_32 permute_backward_start_read_f128_block_32 proc~execute_f128_block_32->proc~permute_backward_start_read_f128_block_32 proc~permute_backward_start_write_f128_block_32 permute_backward_start_write_f128_block_32 
proc~execute_f128_block_32->proc~permute_backward_start_write_f128_block_32 proc~permute_backward_write_f128_block_32 permute_backward_write_f128_block_32 proc~execute_f128_block_32->proc~permute_backward_write_f128_block_32 proc~permute_forward_read_f128_block_32 permute_forward_read_f128_block_32 proc~execute_f128_block_32->proc~permute_forward_read_f128_block_32 proc~permute_forward_write_f128_block_32 permute_forward_write_f128_block_32 proc~execute_f128_block_32->proc~permute_forward_write_f128_block_32 proc~unpack_f128_block_32 unpack_f128_block_32 proc~execute_f128_block_32->proc~unpack_f128_block_32 proc~unpack_pipelined_f128_block_32 unpack_pipelined_f128_block_32 proc~execute_f128_block_32->proc~unpack_pipelined_f128_block_32 proc~execute_f128_block_64 execute_f128_block_64 proc~permute_backward_end_pipelined_read_f128_block_64 permute_backward_end_pipelined_read_f128_block_64 proc~execute_f128_block_64->proc~permute_backward_end_pipelined_read_f128_block_64 proc~permute_backward_end_pipelined_write_f128_block_64 permute_backward_end_pipelined_write_f128_block_64 proc~execute_f128_block_64->proc~permute_backward_end_pipelined_write_f128_block_64 proc~permute_backward_end_read_f128_block_64 permute_backward_end_read_f128_block_64 proc~execute_f128_block_64->proc~permute_backward_end_read_f128_block_64 proc~permute_backward_end_write_f128_block_64 permute_backward_end_write_f128_block_64 proc~execute_f128_block_64->proc~permute_backward_end_write_f128_block_64 proc~permute_backward_read_f128_block_64 permute_backward_read_f128_block_64 proc~execute_f128_block_64->proc~permute_backward_read_f128_block_64 proc~permute_backward_start_read_f128_block_64 permute_backward_start_read_f128_block_64 proc~execute_f128_block_64->proc~permute_backward_start_read_f128_block_64 proc~permute_backward_start_write_f128_block_64 permute_backward_start_write_f128_block_64 proc~execute_f128_block_64->proc~permute_backward_start_write_f128_block_64 
proc~permute_backward_write_f128_block_64 permute_backward_write_f128_block_64 proc~execute_f128_block_64->proc~permute_backward_write_f128_block_64 proc~permute_forward_read_f128_block_64 permute_forward_read_f128_block_64 proc~execute_f128_block_64->proc~permute_forward_read_f128_block_64 proc~permute_forward_write_f128_block_64 permute_forward_write_f128_block_64 proc~execute_f128_block_64->proc~permute_forward_write_f128_block_64 proc~unpack_f128_block_64 unpack_f128_block_64 proc~execute_f128_block_64->proc~unpack_f128_block_64 proc~unpack_pipelined_f128_block_64 unpack_pipelined_f128_block_64 proc~execute_f128_block_64->proc~unpack_pipelined_f128_block_64 proc~execute_f32 execute_f32 proc~permute_backward_end_pipelined_read_f32 permute_backward_end_pipelined_read_f32 proc~execute_f32->proc~permute_backward_end_pipelined_read_f32 proc~permute_backward_end_pipelined_write_f32 permute_backward_end_pipelined_write_f32 proc~execute_f32->proc~permute_backward_end_pipelined_write_f32 proc~permute_backward_end_read_f32 permute_backward_end_read_f32 proc~execute_f32->proc~permute_backward_end_read_f32 proc~permute_backward_end_write_f32 permute_backward_end_write_f32 proc~execute_f32->proc~permute_backward_end_write_f32 proc~permute_backward_read_f32 permute_backward_read_f32 proc~execute_f32->proc~permute_backward_read_f32 proc~permute_backward_start_read_f32 permute_backward_start_read_f32 proc~execute_f32->proc~permute_backward_start_read_f32 proc~permute_backward_start_write_f32 permute_backward_start_write_f32 proc~execute_f32->proc~permute_backward_start_write_f32 proc~permute_backward_write_f32 permute_backward_write_f32 proc~execute_f32->proc~permute_backward_write_f32 proc~permute_forward_read_f32 permute_forward_read_f32 proc~execute_f32->proc~permute_forward_read_f32 proc~permute_forward_write_f32 permute_forward_write_f32 proc~execute_f32->proc~permute_forward_write_f32 proc~unpack_f32 unpack_f32 proc~execute_f32->proc~unpack_f32 proc~unpack_pipelined_f32 
unpack_pipelined_f32 proc~execute_f32->proc~unpack_pipelined_f32 proc~execute_f32_block_16 execute_f32_block_16 proc~permute_backward_end_pipelined_read_f32_block_16 permute_backward_end_pipelined_read_f32_block_16 proc~execute_f32_block_16->proc~permute_backward_end_pipelined_read_f32_block_16 proc~permute_backward_end_pipelined_write_f32_block_16 permute_backward_end_pipelined_write_f32_block_16 proc~execute_f32_block_16->proc~permute_backward_end_pipelined_write_f32_block_16 proc~permute_backward_end_read_f32_block_16 permute_backward_end_read_f32_block_16 proc~execute_f32_block_16->proc~permute_backward_end_read_f32_block_16 proc~permute_backward_end_write_f32_block_16 permute_backward_end_write_f32_block_16 proc~execute_f32_block_16->proc~permute_backward_end_write_f32_block_16 proc~permute_backward_read_f32_block_16 permute_backward_read_f32_block_16 proc~execute_f32_block_16->proc~permute_backward_read_f32_block_16 proc~permute_backward_start_read_f32_block_16 permute_backward_start_read_f32_block_16 proc~execute_f32_block_16->proc~permute_backward_start_read_f32_block_16 proc~permute_backward_start_write_f32_block_16 permute_backward_start_write_f32_block_16 proc~execute_f32_block_16->proc~permute_backward_start_write_f32_block_16 proc~permute_backward_write_f32_block_16 permute_backward_write_f32_block_16 proc~execute_f32_block_16->proc~permute_backward_write_f32_block_16 proc~permute_forward_read_f32_block_16 permute_forward_read_f32_block_16 proc~execute_f32_block_16->proc~permute_forward_read_f32_block_16 proc~permute_forward_write_f32_block_16 permute_forward_write_f32_block_16 proc~execute_f32_block_16->proc~permute_forward_write_f32_block_16 proc~unpack_f32_block_16 unpack_f32_block_16 proc~execute_f32_block_16->proc~unpack_f32_block_16 proc~unpack_pipelined_f32_block_16 unpack_pipelined_f32_block_16 proc~execute_f32_block_16->proc~unpack_pipelined_f32_block_16 proc~execute_f32_block_32 execute_f32_block_32 
proc~permute_backward_end_pipelined_read_f32_block_32 permute_backward_end_pipelined_read_f32_block_32 proc~execute_f32_block_32->proc~permute_backward_end_pipelined_read_f32_block_32 proc~permute_backward_end_pipelined_write_f32_block_32 permute_backward_end_pipelined_write_f32_block_32 proc~execute_f32_block_32->proc~permute_backward_end_pipelined_write_f32_block_32 proc~permute_backward_end_read_f32_block_32 permute_backward_end_read_f32_block_32 proc~execute_f32_block_32->proc~permute_backward_end_read_f32_block_32 proc~permute_backward_end_write_f32_block_32 permute_backward_end_write_f32_block_32 proc~execute_f32_block_32->proc~permute_backward_end_write_f32_block_32 proc~permute_backward_read_f32_block_32 permute_backward_read_f32_block_32 proc~execute_f32_block_32->proc~permute_backward_read_f32_block_32 proc~permute_backward_start_read_f32_block_32 permute_backward_start_read_f32_block_32 proc~execute_f32_block_32->proc~permute_backward_start_read_f32_block_32 proc~permute_backward_start_write_f32_block_32 permute_backward_start_write_f32_block_32 proc~execute_f32_block_32->proc~permute_backward_start_write_f32_block_32 proc~permute_backward_write_f32_block_32 permute_backward_write_f32_block_32 proc~execute_f32_block_32->proc~permute_backward_write_f32_block_32 proc~permute_forward_read_f32_block_32 permute_forward_read_f32_block_32 proc~execute_f32_block_32->proc~permute_forward_read_f32_block_32 proc~permute_forward_write_f32_block_32 permute_forward_write_f32_block_32 proc~execute_f32_block_32->proc~permute_forward_write_f32_block_32 proc~unpack_f32_block_32 unpack_f32_block_32 proc~execute_f32_block_32->proc~unpack_f32_block_32 proc~unpack_pipelined_f32_block_32 unpack_pipelined_f32_block_32 proc~execute_f32_block_32->proc~unpack_pipelined_f32_block_32 proc~execute_f32_block_64 execute_f32_block_64 proc~permute_backward_end_pipelined_read_f32_block_64 permute_backward_end_pipelined_read_f32_block_64 
proc~execute_f32_block_64->proc~permute_backward_end_pipelined_read_f32_block_64 proc~permute_backward_end_pipelined_write_f32_block_64 permute_backward_end_pipelined_write_f32_block_64 proc~execute_f32_block_64->proc~permute_backward_end_pipelined_write_f32_block_64 proc~permute_backward_end_read_f32_block_64 permute_backward_end_read_f32_block_64 proc~execute_f32_block_64->proc~permute_backward_end_read_f32_block_64 proc~permute_backward_end_write_f32_block_64 permute_backward_end_write_f32_block_64 proc~execute_f32_block_64->proc~permute_backward_end_write_f32_block_64 proc~permute_backward_read_f32_block_64 permute_backward_read_f32_block_64 proc~execute_f32_block_64->proc~permute_backward_read_f32_block_64 proc~permute_backward_start_read_f32_block_64 permute_backward_start_read_f32_block_64 proc~execute_f32_block_64->proc~permute_backward_start_read_f32_block_64 proc~permute_backward_start_write_f32_block_64 permute_backward_start_write_f32_block_64 proc~execute_f32_block_64->proc~permute_backward_start_write_f32_block_64 proc~permute_backward_write_f32_block_64 permute_backward_write_f32_block_64 proc~execute_f32_block_64->proc~permute_backward_write_f32_block_64 proc~permute_forward_read_f32_block_64 permute_forward_read_f32_block_64 proc~execute_f32_block_64->proc~permute_forward_read_f32_block_64 proc~permute_forward_write_f32_block_64 permute_forward_write_f32_block_64 proc~execute_f32_block_64->proc~permute_forward_write_f32_block_64 proc~unpack_f32_block_64 unpack_f32_block_64 proc~execute_f32_block_64->proc~unpack_f32_block_64 proc~unpack_pipelined_f32_block_64 unpack_pipelined_f32_block_64 proc~execute_f32_block_64->proc~unpack_pipelined_f32_block_64 proc~execute_f64 execute_f64 proc~permute_backward_end_pipelined_read_f64 permute_backward_end_pipelined_read_f64 proc~execute_f64->proc~permute_backward_end_pipelined_read_f64 proc~permute_backward_end_pipelined_write_f64 permute_backward_end_pipelined_write_f64 
proc~execute_f64->proc~permute_backward_end_pipelined_write_f64 proc~permute_backward_end_read_f64 permute_backward_end_read_f64 proc~execute_f64->proc~permute_backward_end_read_f64 proc~permute_backward_end_write_f64 permute_backward_end_write_f64 proc~execute_f64->proc~permute_backward_end_write_f64 proc~permute_backward_read_f64 permute_backward_read_f64 proc~execute_f64->proc~permute_backward_read_f64 proc~permute_backward_start_read_f64 permute_backward_start_read_f64 proc~execute_f64->proc~permute_backward_start_read_f64 proc~permute_backward_start_write_f64 permute_backward_start_write_f64 proc~execute_f64->proc~permute_backward_start_write_f64 proc~permute_backward_write_f64 permute_backward_write_f64 proc~execute_f64->proc~permute_backward_write_f64 proc~permute_forward_read_f64 permute_forward_read_f64 proc~execute_f64->proc~permute_forward_read_f64 proc~permute_forward_write_f64 permute_forward_write_f64 proc~execute_f64->proc~permute_forward_write_f64 proc~unpack_f64 unpack_f64 proc~execute_f64->proc~unpack_f64 proc~unpack_pipelined_f64 unpack_pipelined_f64 proc~execute_f64->proc~unpack_pipelined_f64 proc~execute_f64_block_16 execute_f64_block_16 proc~permute_backward_end_pipelined_read_f64_block_16 permute_backward_end_pipelined_read_f64_block_16 proc~execute_f64_block_16->proc~permute_backward_end_pipelined_read_f64_block_16 proc~permute_backward_end_pipelined_write_f64_block_16 permute_backward_end_pipelined_write_f64_block_16 proc~execute_f64_block_16->proc~permute_backward_end_pipelined_write_f64_block_16 proc~permute_backward_end_read_f64_block_16 permute_backward_end_read_f64_block_16 proc~execute_f64_block_16->proc~permute_backward_end_read_f64_block_16 proc~permute_backward_end_write_f64_block_16 permute_backward_end_write_f64_block_16 proc~execute_f64_block_16->proc~permute_backward_end_write_f64_block_16 proc~permute_backward_read_f64_block_16 permute_backward_read_f64_block_16 
proc~execute_f64_block_16->proc~permute_backward_read_f64_block_16 proc~permute_backward_start_read_f64_block_16 permute_backward_start_read_f64_block_16 proc~execute_f64_block_16->proc~permute_backward_start_read_f64_block_16 proc~permute_backward_start_write_f64_block_16 permute_backward_start_write_f64_block_16 proc~execute_f64_block_16->proc~permute_backward_start_write_f64_block_16 proc~permute_backward_write_f64_block_16 permute_backward_write_f64_block_16 proc~execute_f64_block_16->proc~permute_backward_write_f64_block_16 proc~permute_forward_read_f64_block_16 permute_forward_read_f64_block_16 proc~execute_f64_block_16->proc~permute_forward_read_f64_block_16 proc~permute_forward_write_f64_block_16 permute_forward_write_f64_block_16 proc~execute_f64_block_16->proc~permute_forward_write_f64_block_16 proc~unpack_f64_block_16 unpack_f64_block_16 proc~execute_f64_block_16->proc~unpack_f64_block_16 proc~unpack_pipelined_f64_block_16 unpack_pipelined_f64_block_16 proc~execute_f64_block_16->proc~unpack_pipelined_f64_block_16 proc~execute_f64_block_32 execute_f64_block_32 proc~permute_backward_end_pipelined_read_f64_block_32 permute_backward_end_pipelined_read_f64_block_32 proc~execute_f64_block_32->proc~permute_backward_end_pipelined_read_f64_block_32 proc~permute_backward_end_pipelined_write_f64_block_32 permute_backward_end_pipelined_write_f64_block_32 proc~execute_f64_block_32->proc~permute_backward_end_pipelined_write_f64_block_32 proc~permute_backward_end_read_f64_block_32 permute_backward_end_read_f64_block_32 proc~execute_f64_block_32->proc~permute_backward_end_read_f64_block_32 proc~permute_backward_end_write_f64_block_32 permute_backward_end_write_f64_block_32 proc~execute_f64_block_32->proc~permute_backward_end_write_f64_block_32 proc~permute_backward_read_f64_block_32 permute_backward_read_f64_block_32 proc~execute_f64_block_32->proc~permute_backward_read_f64_block_32 proc~permute_backward_start_read_f64_block_32 permute_backward_start_read_f64_block_32 
proc~execute_f64_block_32->proc~permute_backward_start_read_f64_block_32 proc~permute_backward_start_write_f64_block_32 permute_backward_start_write_f64_block_32 proc~execute_f64_block_32->proc~permute_backward_start_write_f64_block_32 proc~permute_backward_write_f64_block_32 permute_backward_write_f64_block_32 proc~execute_f64_block_32->proc~permute_backward_write_f64_block_32 proc~permute_forward_read_f64_block_32 permute_forward_read_f64_block_32 proc~execute_f64_block_32->proc~permute_forward_read_f64_block_32 proc~permute_forward_write_f64_block_32 permute_forward_write_f64_block_32 proc~execute_f64_block_32->proc~permute_forward_write_f64_block_32 proc~unpack_f64_block_32 unpack_f64_block_32 proc~execute_f64_block_32->proc~unpack_f64_block_32 proc~unpack_pipelined_f64_block_32 unpack_pipelined_f64_block_32 proc~execute_f64_block_32->proc~unpack_pipelined_f64_block_32 proc~execute_f64_block_64 execute_f64_block_64 proc~permute_backward_end_pipelined_read_f64_block_64 permute_backward_end_pipelined_read_f64_block_64 proc~execute_f64_block_64->proc~permute_backward_end_pipelined_read_f64_block_64 proc~permute_backward_end_pipelined_write_f64_block_64 permute_backward_end_pipelined_write_f64_block_64 proc~execute_f64_block_64->proc~permute_backward_end_pipelined_write_f64_block_64 proc~permute_backward_end_read_f64_block_64 permute_backward_end_read_f64_block_64 proc~execute_f64_block_64->proc~permute_backward_end_read_f64_block_64 proc~permute_backward_end_write_f64_block_64 permute_backward_end_write_f64_block_64 proc~execute_f64_block_64->proc~permute_backward_end_write_f64_block_64 proc~permute_backward_read_f64_block_64 permute_backward_read_f64_block_64 proc~execute_f64_block_64->proc~permute_backward_read_f64_block_64 proc~permute_backward_start_read_f64_block_64 permute_backward_start_read_f64_block_64 proc~execute_f64_block_64->proc~permute_backward_start_read_f64_block_64 proc~permute_backward_start_write_f64_block_64 
permute_backward_start_write_f64_block_64 proc~execute_f64_block_64->proc~permute_backward_start_write_f64_block_64 proc~permute_backward_write_f64_block_64 permute_backward_write_f64_block_64 proc~execute_f64_block_64->proc~permute_backward_write_f64_block_64 proc~permute_forward_read_f64_block_64 permute_forward_read_f64_block_64 proc~execute_f64_block_64->proc~permute_forward_read_f64_block_64 proc~permute_forward_write_f64_block_64 permute_forward_write_f64_block_64 proc~execute_f64_block_64->proc~permute_forward_write_f64_block_64 proc~unpack_f64_block_64 unpack_f64_block_64 proc~execute_f64_block_64->proc~unpack_f64_block_64 proc~unpack_pipelined_f64_block_64 unpack_pipelined_f64_block_64 proc~execute_f64_block_64->proc~unpack_pipelined_f64_block_64 proc~execute_generic dtfft_plan_t%execute_generic proc~execute_generic->proc~execute proc~execute_host kernel_host%execute_host proc~execute_mpi backend_mpi%execute_mpi proc~execute_mpi->interface~cudastreamsynchronize proc~execute_mpi->proc~cudageterrorstring proc~execute_mpi->proc~execute_a2a proc~execute_p2p backend_mpi%execute_p2p proc~execute_mpi->proc~execute_p2p proc~execute_p2p_scheduled backend_mpi%execute_p2p_scheduled proc~execute_mpi->proc~execute_p2p_scheduled proc~execute_self_copy abstract_backend%execute_self_copy proc~execute_mpi->proc~execute_self_copy proc~execute_mpi->proc~execute~2 proc~execute_mpi->fname proc~execute_mpi->mpi_abort proc~execute_mpi->mpi_waitall mpi_waitsome mpi_waitsome proc~execute_mpi->mpi_waitsome proc~execute_mpi->mpi_win_fence proc~execute_nccl backend_nccl%execute_nccl proc~execute_nccl->interface~ncclgroupend proc~execute_nccl->interface~ncclgroupstart proc~execute_nccl->interface~ncclrecv proc~execute_nccl->interface~ncclsend proc~execute_nccl->proc~execute~2 proc~execute_nccl->proc~ncclgeterrorstring proc~execute_nccl->fname proc~execute_nccl->mpi_abort mpi_recv_init mpi_recv_init proc~execute_p2p->mpi_recv_init mpi_send_init mpi_send_init 
proc~execute_p2p->mpi_send_init mpi_startall mpi_startall proc~execute_p2p->mpi_startall proc~execute_p2p_scheduled->proc~execute_self_copy mpi_sendrecv mpi_sendrecv proc~execute_p2p_scheduled->mpi_sendrecv proc~execute_private dtfft_plan_t%execute_private proc~execute_private->proc~execute_2d proc~execute_private->proc~execute_generic proc~execute_z_slab dtfft_plan_t%execute_z_slab proc~execute_private->proc~execute_z_slab proc~execute_ptr->proc~check_aux proc~execute_ptr->proc~execute_private proc~is_same_ptr is_same_ptr proc~execute_ptr->proc~is_same_ptr proc~execute_ptr->proc~pop_nvtx_domain_range proc~execute_ptr->proc~push_nvtx_domain_range proc~execute_ptr->is_null_ptr proc~execute_self_copy->interface~cudaeventrecord proc~execute_self_copy->interface~cudamemcpyasync proc~execute_self_copy->interface~cudastreamwaitevent proc~execute_self_copy->proc~cudageterrorstring proc~execute_self_copy->fname proc~execute_self_copy->mpi_abort proc~execute_z_slab->proc~execute proc~execute_z_slab->proc~execute_generic proc~execute~10 vkfft_executor%execute proc~execute~11 dtfft_plan_t%execute proc~execute~11->proc~execute_ptr proc~execute~12 abstract_backend%execute proc~execute~12->interface~cudaeventrecord proc~execute~12->interface~cudastreamsynchronize proc~execute~12->interface~cudastreamwaitevent proc~execute~12->proc~cudageterrorstring proc~execute~12->proc~execute_end~3 proc~execute~12->proc~execute_self_copy proc~execute~12->proc~execute~2 execute_private execute_private proc~execute~12->execute_private proc~execute~12->fname proc~execute~12->mpi_abort proc~execute~13 transpose_handle_datatype%execute proc~execute~13->proc~execute_end~4 proc~execute~13->mpi_alltoall_init mpi_alltoallw_init mpi_alltoallw_init proc~execute~13->mpi_alltoallw_init proc~execute~13->mpi_comm_size proc~execute~13->mpi_startall proc~execute~2->proc~pop_nvtx_domain_range proc~execute~2->proc~push_nvtx_domain_range proc~execute~2->execute_private proc~execute~3 
transpose_handle_generic%execute proc~execute~3->proc~execute~2 proc~execute~4 backend_cufftmp%execute proc~execute~4->interface~cudastreamsynchronize proc~execute~4->interface~cufftmpexecreshapeasync proc~execute~4->proc~cudageterrorstring proc~execute~4->proc~cufftgeterrorstring proc~execute~4->fname proc~execute~4->mpi_abort mpi_barrier mpi_barrier proc~execute~4->mpi_barrier proc~execute~5 abstract_executor%execute proc~execute~5->proc~pop_nvtx_domain_range proc~execute~5->proc~push_nvtx_domain_range proc~execute~5->execute_private proc~execute~6 fftw_executor%execute proc~execute~7 kernel_device%execute proc~execute~7->interface~cudamemcpyasync proc~execute~7->interface~cudastreamsynchronize proc~execute~7->proc~cudageterrorstring proc~execute~7->proc~culaunchkernel proc~get_kernel_args get_kernel_args proc~execute~7->proc~get_kernel_args proc~get_kernel_launch_params get_kernel_launch_params proc~execute~7->proc~get_kernel_launch_params proc~execute~7->fname proc~execute~7->mpi_abort proc~execute~8 mkl_executor%execute proc~execute~8->interface~mkl_dfti_commit_desc proc~execute~8->interface~mkl_dfti_execute proc~execute~8->interface~mkl_dfti_set_value proc~execute~8->interface~to_str proc~execute~8->proc~dftierrormessage proc~execute~8->mpi_abort proc~execute~9 cufft_executor%execute proc~execute~9->interface~cufftxtexec proc~execute~9->proc~cufftgeterrorstring proc~execute~9->fname proc~execute~9->mpi_abort proc~find_valid_combination find_valid_combination proc~free_datatypes->mpi_type_free proc~free_mem free_mem proc~free_mem->interface~cudafree proc~free_mem->interface~mem_free_host proc~free_mem->interface~ncclcommderegister proc~free_mem->interface~ncclmemfree proc~free_mem->interface~to_str proc~free_mem->proc~get_conf_log_enabled proc~free_mem->proc~is_backend_nccl proc~free_mem->proc~is_backend_nvshmem proc~free_mem->proc~is_same_ptr proc~free_mem->proc~ncclgeterrorstring proc~free_mem->proc~write_message proc~free_mem->fname proc~free_mem->mpi_abort 
nvshmem_free nvshmem_free proc~free_mem->nvshmem_free proc~generate_candidates generate_candidates proc~generate_candidates->proc~estimate_memory_pressure proc~generate_candidates->proc~estimate_optimal_padding proc~generate_candidates->proc~find_valid_combination proc~get nvrtc_module%get proc~get->none~check proc~get->proc~cudageterrorstring proc~get_mangled_name get_mangled_name proc~get->proc~get_mangled_name proc~get->fname proc~get->mpi_abort proc~get_alloc_bytes->proc~dtfft_get_error_string proc~get_alloc_bytes->proc~get_alloc_size proc~get_alloc_bytes->proc~get_element_size proc~get_alloc_bytes->proc~write_message proc~get_alloc_size->proc~get_local_sizes~2 proc~get_ampere_architecture get_ampere_architecture proc~get_async_active transpose_plan%get_async_active get_async_active get_async_active proc~get_async_active->get_async_active proc~get_async_active~2 transpose_handle_generic%get_async_active proc~get_async_active~4 abstract_backend%get_async_active proc~get_async_active~2->proc~get_async_active~4 proc~get_async_active~3 backend_mpi%get_async_active proc~get_async_active~5 transpose_handle_datatype%get_async_active proc~get_aux_size transpose_plan%get_aux_size proc~get_aux_size->proc~get_aux_size_generic proc~get_aux_size~3 abstract_transpose_handle%get_aux_size proc~get_aux_size_generic->proc~get_aux_size~3 proc~get_aux_size~2 transpose_handle_generic%get_aux_size proc~get_aux_size~4 abstract_backend%get_aux_size proc~get_aux_size~2->proc~get_aux_size~4 proc~get_backend transpose_plan%get_backend proc~get_backend~2->proc~dtfft_get_error_string proc~get_backend~2->proc~get_backend proc~get_backend~2->proc~write_message proc~get_code->proc~add_line proc~get_code->proc~is_unpack_kernel proc~get_conf_backend->proc~get_correct_backend proc~get_conf_configs_to_test->interface~get_conf_internal proc~get_conf_datatype_enabled->interface~get_conf_internal proc~get_conf_forced_kernel_optimization->interface~get_conf_internal 
proc~get_conf_kernel_optimization_enabled->interface~get_conf_internal proc~get_conf_log_enabled->interface~get_conf_internal proc~get_conf_measure_iters->interface~get_conf_internal proc~get_conf_measure_warmup_iters->interface~get_conf_internal proc~get_conf_mpi_enabled->interface~get_conf_internal proc~get_conf_nccl_enabled->interface~get_conf_internal proc~get_conf_nvshmem_enabled->interface~get_conf_internal proc~get_conf_pipelined_enabled get_conf_pipelined_enabled proc~get_conf_pipelined_enabled->interface~get_conf_internal proc~get_conf_stream->interface~cudastreamcreate proc~get_conf_stream->proc~cudageterrorstring proc~get_conf_stream->fname proc~get_conf_stream->mpi_abort proc~get_conf_y_slab_enabled->interface~get_conf_internal proc~get_conf_z_slab_enabled->interface~get_conf_internal proc~get_correct_backend->proc~get_conf_platform proc~get_datatype_from_env->interface~get_env proc~get_dims->proc~dtfft_get_error_string proc~get_dims->proc~write_message proc~get_element_size->proc~dtfft_get_error_string proc~get_element_size->proc~write_message proc~get_env_base->proc~destroy_string proc~get_env_int32->interface~get_env proc~get_env_int32->proc~write_message proc~get_env_int8->interface~get_env proc~get_env_logical->interface~get_env proc~get_env_string->interface~get_env proc~get_env_string->proc~destroy_string proc~get_env_string->proc~write_message proc~get_executor->proc~dtfft_get_error_string proc~get_executor->proc~write_message proc~get_grid_dims->proc~dtfft_get_error_string proc~get_grid_dims->proc~write_message proc~get_kernel->interface~cudaeventcreate proc~get_kernel->interface~cudaeventdestroy proc~get_kernel->interface~cudaeventelapsedtime proc~get_kernel->interface~cudaeventrecord proc~get_kernel->interface~cudaeventsynchronize proc~get_kernel->interface~cudafree proc~get_kernel->interface~cudamalloc proc~get_kernel->interface~cudastreamsynchronize proc~get_kernel->interface~to_str proc~get_kernel->proc~create_nvrtc_module 
proc~get_kernel->proc~cudageterrorstring proc~get_kernel->proc~culaunchkernel proc~get_kernel->proc~evaluate_analytical_performance proc~get_kernel->proc~generate_candidates proc~get_kernel->proc~get_conf_configs_to_test proc~get_kernel->proc~get_conf_forced_kernel_optimization proc~get_kernel->proc~get_conf_kernel_optimization_enabled proc~get_kernel->proc~get_conf_log_enabled proc~get_kernel->proc~get_conf_measure_iters proc~get_kernel->proc~get_conf_measure_warmup_iters proc~get_kernel->proc~get_conf_stream proc~get_kernel->proc~get_kernel_args proc~get_kernel_instance get_kernel_instance proc~get_kernel->proc~get_kernel_instance proc~get_kernel->proc~get_kernel_launch_params proc~get_kernel->proc~get_kernel_string proc~get_kernel->proc~is_unpack_kernel proc~get_kernel->proc~pop_nvtx_domain_range proc~get_kernel->proc~push_nvtx_domain_range proc~sort_candidates_by_score sort_candidates_by_score proc~get_kernel->proc~sort_candidates_by_score proc~get_kernel->proc~write_message proc~get_kernel->fname proc~get_kernel->mpi_abort proc~get_kernel_instance->interface~to_str proc~get_kernel_instance->proc~get proc~get_kernel_instance->proc~get_conf_log_enabled proc~get_kernel_instance->proc~get_kernel_string proc~get_kernel_instance->proc~write_message proc~get_kernel_instance->is_null_ptr proc~get_local_size->mpi_allgather proc~get_local_size->mpi_comm_rank proc~get_local_size->mpi_comm_size proc~get_local_sizes~2->proc~dtfft_get_error_string proc~get_local_sizes~2->proc~get_aux_size proc~get_local_sizes~2->proc~get_backend proc~get_local_sizes~2->proc~get_local_sizes proc~get_local_sizes~2->proc~is_backend_nvshmem proc~get_local_sizes~2->proc~write_message proc~get_local_sizes~2->counts proc~get_local_sizes~2->mpi_allreduce starts starts proc~get_local_sizes~2->starts proc~get_name_expression get_name_expression proc~get_mangled_name->proc~get_name_expression proc~get_mangled_name->proc~nvrtcgeterrorstring proc~get_mangled_name->fname proc~get_mangled_name->mpi_abort 
proc~get_name_expression->interface~to_str proc~get_name_expression->proc~astring_f2c proc~get_pencil->proc~dtfft_get_error_string proc~get_pencil->proc~write_message make_public make_public proc~get_pencil->make_public proc~get_plan_execution_time->interface~to_str proc~get_plan_execution_time->proc~create~10 proc~get_plan_execution_time->proc~destroy~15 proc~get_plan_execution_time->proc~execute~13 proc~get_plan_execution_time->proc~get_conf_log_enabled proc~get_plan_execution_time->proc~get_conf_measure_iters proc~get_plan_execution_time->proc~get_conf_measure_warmup_iters proc~get_plan_execution_time->proc~pop_nvtx_domain_range proc~get_plan_execution_time->proc~push_nvtx_domain_range proc~report_timings report_timings proc~get_plan_execution_time->proc~report_timings proc~get_plan_execution_time->proc~write_message proc~get_plan_execution_time->mpi_barrier proc~get_plan_execution_time->mpi_wtime proc~get_platform->proc~dtfft_get_error_string proc~get_platform->proc~write_message proc~get_precision->proc~dtfft_get_error_string proc~get_precision->proc~write_message proc~get_stream_int64->none~get_stream proc~get_stream_int64->proc~dtfft_get_cuda_stream proc~get_stream_ptr->proc~dtfft_get_error_string proc~get_stream_ptr->proc~write_message proc~get_volta_architecture get_volta_architecture proc~get_y_slab_enabled->proc~dtfft_get_error_string proc~get_y_slab_enabled->proc~write_message proc~get_z_slab_enabled->proc~dtfft_get_error_string proc~get_z_slab_enabled->proc~write_message proc~init_environment init_environment proc~init_environment->interface~get_env proc~init_environment->proc~destroy_strings proc~init_environment->proc~is_valid_backend proc~init_environment->proc~write_message backends backends proc~init_environment->backends proc~init_environment->destroy platforms platforms proc~init_environment->platforms proc~init_internal->proc~init_environment mpi_initialized mpi_initialized proc~init_internal->mpi_initialized proc~is_null_ptr is_null_ptr 
proc~is_nvshmem_ptr->interface~nvshmem_my_pe proc~is_nvshmem_ptr->interface~nvshmem_ptr proc~is_nvshmem_ptr->is_null_ptr proc~is_transpose_kernel is_transpose_kernel proc~is_valid_execute_type is_valid_execute_type proc~is_valid_transpose_type is_valid_transpose_type proc~load load proc~load->proc~destroy_strings proc~load->proc~dynamic_load proc~load_cuda->proc~destroy_strings proc~load_cuda->proc~dynamic_load proc~load_library->interface~dlopen proc~load_library->interface~is_null_ptr proc~load_library->proc~astring_f2c proc~load_library->proc~dl_error proc~load_nvrtc->proc~destroy_strings proc~load_nvrtc->proc~dynamic_load proc~load_symbol->interface~dlsym proc~load_symbol->interface~is_null_ptr proc~load_symbol->proc~astring_f2c proc~load_symbol->proc~dl_error proc~load_vkfft->proc~load proc~make_plan->interface~mkl_dfti_commit_desc proc~make_plan->interface~mkl_dfti_create_desc proc~make_plan->interface~mkl_dfti_set_value proc~make_plan->interface~to_str proc~make_plan->proc~dftierrormessage proc~make_plan->mpi_abort proc~make_public pencil%make_public proc~mem_alloc transpose_plan%mem_alloc proc~mem_alloc->proc~alloc_mem proc~mem_alloc_c32_1d->proc~mem_alloc_ptr proc~mem_alloc_c32_2d->proc~mem_alloc_ptr proc~mem_alloc_c32_3d->proc~mem_alloc_ptr proc~mem_alloc_c64_1d->proc~mem_alloc_ptr proc~mem_alloc_c64_2d->proc~mem_alloc_ptr proc~mem_alloc_c64_3d->proc~mem_alloc_ptr proc~mem_alloc_host->interface~aligned_alloc proc~mem_alloc_ptr->proc~dtfft_get_error_string proc~mem_alloc_ptr->proc~mem_alloc_host proc~mem_alloc_ptr->proc~write_message proc~mem_alloc_ptr->is_null_ptr mem_alloc mem_alloc proc~mem_alloc_ptr->mem_alloc proc~mem_alloc_r32_1d->proc~mem_alloc_ptr proc~mem_alloc_r32_2d->proc~mem_alloc_ptr proc~mem_alloc_r32_3d->proc~mem_alloc_ptr proc~mem_alloc_r64_1d->proc~mem_alloc_ptr proc~mem_alloc_r64_2d->proc~mem_alloc_ptr proc~mem_alloc_r64_3d->proc~mem_alloc_ptr fftw_malloc fftw_malloc proc~mem_alloc~2->fftw_malloc proc~mem_alloc~3 mkl_executor%mem_alloc 
proc~mem_alloc~3->interface~mkl_dfti_mem_alloc proc~mem_alloc~3->interface~to_str proc~mem_alloc~3->proc~dftierrormessage proc~mem_alloc~3->mpi_abort proc~mem_alloc~4 cufft_executor%mem_alloc proc~mem_alloc~5 vkfft_executor%mem_alloc proc~mem_free->proc~free_mem proc~mem_free_c32_1d->proc~mem_free_ptr proc~mem_free_c32_2d->proc~mem_free_ptr proc~mem_free_c32_3d->proc~mem_free_ptr proc~mem_free_c64_1d->proc~mem_free_ptr proc~mem_free_c64_2d->proc~mem_free_ptr proc~mem_free_c64_3d->proc~mem_free_ptr proc~mem_free_ptr->interface~mem_free_host proc~mem_free_ptr->proc~dtfft_get_error_string proc~mem_free_ptr->proc~write_message mem_free mem_free proc~mem_free_ptr->mem_free proc~mem_free_r32_1d->proc~mem_free_ptr proc~mem_free_r32_2d->proc~mem_free_ptr proc~mem_free_r32_3d->proc~mem_free_ptr proc~mem_free_r64_1d->proc~mem_free_ptr proc~mem_free_r64_2d->proc~mem_free_ptr proc~mem_free_r64_3d->proc~mem_free_ptr fftw_free fftw_free proc~mem_free~2->fftw_free proc~mem_free~3 mkl_executor%mem_free proc~mem_free~3->interface~mkl_dfti_mem_free proc~mem_free~3->interface~to_str proc~mem_free~3->proc~dftierrormessage proc~mem_free~3->mpi_abort proc~mem_free~4 cufft_executor%mem_free proc~mem_free~5 vkfft_executor%mem_free proc~ncclgeterrorstring->interface~ncclgeterrorstring_c proc~ncclgeterrorstring->proc~string_c2f proc~nvrtcgeterrorstring->proc~string_c2f proc~permute_backward_end_read_f128->proc~permute_backward_end_pipelined_read_f128 proc~permute_backward_end_read_f128_block_16->proc~permute_backward_end_pipelined_read_f128_block_16 proc~permute_backward_end_read_f128_block_32->proc~permute_backward_end_pipelined_read_f128_block_32 proc~permute_backward_end_read_f128_block_64->proc~permute_backward_end_pipelined_read_f128_block_64 proc~permute_backward_end_read_f32->proc~permute_backward_end_pipelined_read_f32 proc~permute_backward_end_read_f32_block_16->proc~permute_backward_end_pipelined_read_f32_block_16 
proc~permute_backward_end_read_f32_block_32->proc~permute_backward_end_pipelined_read_f32_block_32 proc~permute_backward_end_read_f32_block_64->proc~permute_backward_end_pipelined_read_f32_block_64 proc~permute_backward_end_read_f64->proc~permute_backward_end_pipelined_read_f64 proc~permute_backward_end_read_f64_block_16->proc~permute_backward_end_pipelined_read_f64_block_16 proc~permute_backward_end_read_f64_block_32->proc~permute_backward_end_pipelined_read_f64_block_32 proc~permute_backward_end_read_f64_block_64->proc~permute_backward_end_pipelined_read_f64_block_64 proc~permute_backward_end_write_f128->proc~permute_backward_end_pipelined_write_f128 proc~permute_backward_end_write_f128_block_16->proc~permute_backward_end_pipelined_write_f128_block_16 proc~permute_backward_end_write_f128_block_32->proc~permute_backward_end_pipelined_write_f128_block_32 proc~permute_backward_end_write_f128_block_64->proc~permute_backward_end_pipelined_write_f128_block_64 proc~permute_backward_end_write_f32->proc~permute_backward_end_pipelined_write_f32 proc~permute_backward_end_write_f32_block_16->proc~permute_backward_end_pipelined_write_f32_block_16 proc~permute_backward_end_write_f32_block_32->proc~permute_backward_end_pipelined_write_f32_block_32 proc~permute_backward_end_write_f32_block_64->proc~permute_backward_end_pipelined_write_f32_block_64 proc~permute_backward_end_write_f64->proc~permute_backward_end_pipelined_write_f64 proc~permute_backward_end_write_f64_block_16->proc~permute_backward_end_pipelined_write_f64_block_16 proc~permute_backward_end_write_f64_block_32->proc~permute_backward_end_pipelined_write_f64_block_32 proc~permute_backward_end_write_f64_block_64->proc~permute_backward_end_pipelined_write_f64_block_64 proc~pop_nvtx_domain_range->interface~nvtxdomainrangepop_c proc~push_nvtx_domain_range->interface~nvtxdomainrangepushex_c proc~push_nvtx_domain_range->proc~astring_f2c proc~push_nvtx_domain_range->proc~create_nvtx_domain proc~report->interface~to_str 
proc~report->proc~dtfft_get_backend_string proc~report->proc~dtfft_get_error_string proc~report->proc~dtfft_get_executor_string proc~report->proc~dtfft_get_precision_string proc~report->proc~get_backend proc~report->proc~write_message proc~report->mpi_comm_size proc~report_timings->interface~to_str proc~report_timings->proc~get_conf_log_enabled proc~report_timings->proc~write_message proc~report_timings->mpi_allreduce proc~report_timings->mpi_comm_size proc~run_autotune_backend->interface~cudaeventcreate proc~run_autotune_backend->interface~cudaeventdestroy proc~run_autotune_backend->interface~cudaeventelapsedtime proc~run_autotune_backend->interface~cudaeventrecord proc~run_autotune_backend->interface~cudaeventsynchronize proc~run_autotune_backend->interface~cudastreamsynchronize proc~run_autotune_backend->proc~alloc_and_set_aux proc~run_autotune_backend->proc~alloc_mem proc~run_autotune_backend->proc~allocate_plans proc~run_autotune_backend->proc~create_helper~2 proc~run_autotune_backend->proc~cudageterrorstring proc~run_autotune_backend->proc~destroy_helper~2 proc~run_autotune_backend->proc~destroy_plans proc~run_autotune_backend->proc~dtfft_get_backend_string proc~run_autotune_backend->proc~dtfft_get_error_string proc~run_autotune_backend->proc~free_mem proc~run_autotune_backend->proc~get_conf_datatype_enabled proc~run_autotune_backend->proc~get_conf_log_enabled proc~run_autotune_backend->proc~get_conf_measure_iters proc~run_autotune_backend->proc~get_conf_measure_warmup_iters proc~run_autotune_backend->proc~get_conf_mpi_enabled proc~run_autotune_backend->proc~get_conf_nvshmem_enabled proc~run_autotune_backend->proc~get_conf_pipelined_enabled proc~run_autotune_backend->proc~get_local_sizes proc~run_autotune_backend->proc~is_backend_mpi proc~run_autotune_backend->proc~is_backend_nccl proc~run_autotune_backend->proc~is_backend_nvshmem proc~run_autotune_backend->proc~is_backend_pipelined proc~run_autotune_backend->proc~pop_nvtx_domain_range 
proc~run_autotune_backend->proc~push_nvtx_domain_range proc~run_autotune_backend->proc~report_timings proc~run_autotune_datatypes run_autotune_datatypes proc~run_autotune_backend->proc~run_autotune_datatypes proc~run_autotune_backend->proc~write_message proc~run_autotune_backend->execute proc~run_autotune_backend->fname proc~run_autotune_backend->mpi_abort proc~run_autotune_backend->mpi_barrier proc~run_autotune_backend->mpi_comm_size proc~run_autotune_backend->mpi_wtime proc~run_autotune_datatypes->interface~to_str proc~run_autotune_datatypes->proc~autotune_transpose_id proc~run_autotune_datatypes->proc~get_conf_log_enabled proc~run_autotune_datatypes->proc~write_message proc~run_permute_backward run_permute_backward proc~run_permute_backward->proc~compare proc~run_permute_backward->proc~permute_backward_read_f32 proc~run_permute_backward->proc~permute_backward_read_f32_block_16 proc~run_permute_backward->proc~permute_backward_read_f32_block_32 proc~run_permute_backward->proc~permute_backward_read_f32_block_64 proc~run_permute_backward->proc~permute_backward_write_f32 proc~run_permute_backward->proc~permute_backward_write_f32_block_16 proc~run_permute_backward->proc~permute_backward_write_f32_block_32 proc~run_permute_backward->proc~permute_backward_write_f32_block_64 proc~run_permute_backward_end run_permute_backward_end proc~run_permute_backward_end->proc~compare proc~run_permute_backward_end->proc~permute_backward_end_pipelined_read_f32_block_16 proc~run_permute_backward_end->proc~permute_backward_end_pipelined_read_f32_block_32 proc~run_permute_backward_end->proc~permute_backward_end_pipelined_read_f32_block_64 proc~run_permute_backward_end->proc~permute_backward_end_pipelined_write_f32 proc~run_permute_backward_end->proc~permute_backward_end_pipelined_write_f32_block_16 proc~run_permute_backward_end->proc~permute_backward_end_pipelined_write_f32_block_32 proc~run_permute_backward_end->proc~permute_backward_end_pipelined_write_f32_block_64 
proc~run_permute_backward_start run_permute_backward_start proc~run_permute_backward_start->proc~compare proc~run_permute_backward_start->proc~permute_backward_start_read_f32 proc~run_permute_backward_start->proc~permute_backward_start_read_f32_block_16 proc~run_permute_backward_start->proc~permute_backward_start_read_f32_block_32 proc~run_permute_backward_start->proc~permute_backward_start_read_f32_block_64 proc~run_permute_backward_start->proc~permute_backward_start_write_f32 proc~run_permute_backward_start->proc~permute_backward_start_write_f32_block_16 proc~run_permute_backward_start->proc~permute_backward_start_write_f32_block_32 proc~run_permute_backward_start->proc~permute_backward_start_write_f32_block_64 proc~run_permute_forward run_permute_forward proc~run_permute_forward->proc~compare proc~run_permute_forward->proc~permute_forward_read_f32 proc~run_permute_forward->proc~permute_forward_read_f32_block_16 proc~run_permute_forward->proc~permute_forward_read_f32_block_32 proc~run_permute_forward->proc~permute_forward_read_f32_block_64 proc~run_permute_forward->proc~permute_forward_write_f32 proc~run_permute_forward->proc~permute_forward_write_f32_block_16 proc~run_permute_forward->proc~permute_forward_write_f32_block_32 proc~run_permute_forward->proc~permute_forward_write_f32_block_64 proc~run_unpack run_unpack proc~run_unpack->proc~compare proc~run_unpack->proc~unpack_pipelined_f32 proc~run_unpack->proc~unpack_pipelined_f32_block_16 proc~run_unpack->proc~unpack_pipelined_f32_block_32 proc~run_unpack->proc~unpack_pipelined_f32_block_64 proc~select_access_mode_f128->interface~to_str proc~select_access_mode_f128->proc~execute_benchmark proc~select_access_mode_f128->proc~get_conf_log_enabled proc~select_access_mode_f128->proc~pop_nvtx_domain_range proc~select_access_mode_f128->proc~push_nvtx_domain_range proc~select_access_mode_f128->proc~write_message proc~select_access_mode_f32->interface~to_str proc~select_access_mode_f32->proc~execute_benchmark 
proc~select_access_mode_f32->proc~get_conf_log_enabled proc~select_access_mode_f32->proc~pop_nvtx_domain_range proc~select_access_mode_f32->proc~push_nvtx_domain_range proc~select_access_mode_f32->proc~write_message proc~select_access_mode_f64->interface~to_str proc~select_access_mode_f64->proc~execute_benchmark proc~select_access_mode_f64->proc~get_conf_log_enabled proc~select_access_mode_f64->proc~pop_nvtx_domain_range proc~select_access_mode_f64->proc~push_nvtx_domain_range proc~select_access_mode_f64->proc~write_message proc~set_name_expression->proc~get_name_expression proc~set_name_expression->proc~nvrtcgeterrorstring proc~set_name_expression->fname proc~set_name_expression->mpi_abort proc~string_c2f->interface~is_null_ptr proc~transpose dtfft_plan_t%transpose proc~transpose->proc~transpose_ptr proc~transpose_end->proc~execute_end proc~transpose_end->proc~pop_nvtx_domain_range proc~transpose_end->proc~push_nvtx_domain_range proc~transpose_private dtfft_plan_t%transpose_private proc~transpose_private->proc~execute proc~transpose_ptr->proc~pop_nvtx_domain_range proc~transpose_ptr->proc~push_nvtx_domain_range proc~transpose_ptr->proc~transpose_private proc~transpose_start dtfft_plan_t%transpose_start proc~transpose_start->proc~transpose_start_ptr proc~transpose_start_ptr->proc~pop_nvtx_domain_range proc~transpose_start_ptr->proc~push_nvtx_domain_range proc~transpose_start_ptr->proc~transpose_private proc~unload_library->interface~dlclose proc~unload_library->proc~dl_error proc~unpack_f128->proc~unpack_pipelined_f128 proc~unpack_f128_block_16->proc~unpack_pipelined_f128_block_16 proc~unpack_f128_block_32->proc~unpack_pipelined_f128_block_32 proc~unpack_f128_block_64->proc~unpack_pipelined_f128_block_64 proc~unpack_f32->proc~unpack_pipelined_f32 proc~unpack_f32_block_16->proc~unpack_pipelined_f32_block_16 proc~unpack_f32_block_32->proc~unpack_pipelined_f32_block_32 proc~unpack_f32_block_64->proc~unpack_pipelined_f32_block_64 
proc~unpack_f64->proc~unpack_pipelined_f64 proc~unpack_f64_block_16->proc~unpack_pipelined_f64_block_16 proc~unpack_f64_block_32->proc~unpack_pipelined_f64_block_32 proc~unpack_f64_block_64->proc~unpack_pipelined_f64_block_64 proc~write_message->mpi_comm_rank proc~write_message->mpi_finalized program~test_host_kernels test_host_kernels program~test_host_kernels->proc~run_permute_backward program~test_host_kernels->proc~run_permute_backward_end program~test_host_kernels->proc~run_permute_backward_start program~test_host_kernels->proc~run_permute_forward program~test_host_kernels->proc~run_unpack
Help